def remove_aberrant_reads(self, threads=1):
    """Remove unmapped, low-quality (MAPQ<20) and aberrant reads.

    For paired-end data, also drops orphan reads (whose mate was
    removed) and pairs mapping to different chromosomes via
    ``samtools fixmate -r``.  Produces self.flt_bam from self.raw_bam.

    Args:
        threads: number of threads passed to samtools (-@).

    Raises:
        Exception: if the raw BAM file does not exist.
    """
    if not os.path.exists(self.raw_bam):
        raise Exception("raw BAM file does not exist")
    job = jr.JobRunner()
    tmp_bam = "%s/%s.tmp.bam" % (self.output_dir, self.prefix)
    tmp_clean_bam = "%s/%s.tmp.clean.bam" % (self.output_dir, self.prefix)
    if self.is_paired:
        # Filter unmapped and low quality (MAPQ<20) reads.
        # BUG FIX: a space was missing between the "-h" and "-u"
        # fragments, producing the invalid option "-h-u<path>" and
        # breaking the samtools invocation.
        command = "samtools view -@ %s -F1804 -f2 -q 20 -h " % (threads)\
            + "-u %s | " % (self.raw_bam)\
            + "samtools sort -@ %s -o %s -n -" % (threads, tmp_bam)
        job.append([[command]])
        job.run()
        # Filter orphan reads (pair was removed), and read pairs
        # mapping to different chromosomes
        command = "samtools fixmate -r %s %s" % (tmp_bam, tmp_clean_bam)
        job.append([[command]])
        job.run()
        # Re-filter now that mate flags are fixed, and coordinate-sort.
        command = "samtools view -@ %s -F1804 -f2 -u %s | " % (threads, tmp_clean_bam)\
            + "samtools sort -@ %s -o %s -" % (threads, self.flt_bam)
        job.append([[command]])
        job.run()
        job.append([["rm %s %s" % (tmp_bam, tmp_clean_bam)]])
        job.run()
    else:
        # Filter unmapped and low quality (MAPQ<20) reads
        command = "samtools view -@ %s -F1804 -q 20 " % (threads)\
            + "-u %s -o %s" % (self.raw_bam, self.flt_bam)
        job.append([[command]])
        job.run()
def peak_oracle(inputs):
    """Call peaks on the pooled inputs against the pooled controls.

    The fragment length is either taken from --fraglen or computed as
    the average of the per-library estimates (spp .cc.qc column 3, or
    Picard InsertSizeMetrics), then handed to MACS2.
    """
    if inputs['fraglen'] > 0:
        # An explicit fragment length overrides any estimate.
        insert = inputs['fraglen']
    else:
        # Average the per-library insert-size estimates.
        runner = jr.JobRunner()
        insert = 0
        for prefix in inputs['prefix_inputs']:
            if inputs['spp']:
                cmd = "cat %s/%s/bwa_out/*.cc.qc | cut -f3" % (
                    inputs['outdir'], prefix)
            else:
                cmd = ("grep -A2 \"## METRICS CLASS\" "
                       "%s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
                       % (inputs['outdir'], prefix))
            runner.append([[cmd]])
            insert += int(runner.run()[0].rstrip())
        insert = insert / len(inputs['prefix_inputs'])
    ### PEAK
    oracle = "%s/oracle_peaks" % (inputs['outdir'])
    run_peak = pc.Macs2PeakCaller(
        "%s/pooled_inputs.bed.gz" % (oracle),
        "%s/pooled_controls.bed.gz" % (oracle),
        insert, "BEDPE",
        "%s/macs_output" % (oracle),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
def remove_badcigar(self, threads=1):
    """Remove reads whose CIGAR length disagrees with the read length.

    For paired-end data, scans self.raw_sam for records where the sum
    of the CIGAR operation lengths (deletions excluded) differs from
    length(SEQ) and collects their names; those reads are filtered out
    while converting and sorting the SAM into self.raw_bam.

    Args:
        threads: number of threads passed to samtools (-@).
    """
    job = jr.JobRunner()
    peek = []
    if self.is_paired:
        # Find reads with bad CIGAR strings
        badcigar = "%s/%s.badcigar" %(self.output_dir, self.prefix)
        # awk: skip header lines and unmapped ("*") CIGARs; strip the
        # deletion ops, sum the remaining operation lengths, and print
        # the read name when the sum differs from the sequence length.
        command = """zcat %s | \
awk 'BEGIN{FS="\t"; OFS="\t"} !/^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n=split(cigar, vals, "[A-Z]"); s=0; for(i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); if(s!=seqlen) print $1"\t";}' | \
sort | uniq > %s""" %(self.raw_sam, badcigar)
        job.append([[ command ]])
        job.run()
        # Peek at the first few non-empty lines to learn whether any
        # bad-CIGAR read was found at all.
        with open(badcigar) as fp:
            peek = [x for i,x in enumerate(fp) if i<10 and x.rstrip()]
    # Remove the reads with bad CIGAR strings
    if len(peek):
        command = "zcat %s | grep -vF -f %s | " %(self.raw_sam, badcigar)\
            + "samtools view -@ %s -Su - | " %(threads) \
            + "samtools sort -@ %s -o %s -" %(threads, self.raw_bam)
    else:
        command = "samtools view -@ %s -Su %s | " %(threads, self.raw_sam)\
            + "samtools sort -@ %s -o %s -" %(threads, self.raw_bam)
    job.append([[ command ]])
    job.run()
    # NOTE(review): for single-end data `badcigar` is never assigned, so
    # this rm would raise NameError — confirm this method is only ever
    # called for paired-end libraries.
    job.append([[ "rm %s" %(badcigar) ]])
    job.run()
def generate_bed(self, threads=1):
    """Create the gzipped BED (and BEDPE for paired data) from the final BAM.

    Paired-end: name-sorts the BAM (required by bamtobed -bedpe),
    writes the BEDPE, then collapses each pair to a single fragment
    interval (leftmost start, rightmost end) into the sorted BED.
    Single-end: converts reads directly to a sorted 3-column BED.

    Args:
        threads: thread count for samtools and sort.

    Raises:
        Exception: if the final BAM file does not exist.
    """
    if not os.path.exists(self.final_bam):
        raise Exception("final BAM file does not exist")
    job = jr.JobRunner()
    if self.is_paired:
        tmp_bam = "%s/%s.tmp.bam" % (self.output_dir, self.prefix)
        # bamtobed -bedpe requires a name-sorted BAM.
        command = "samtools sort -@ %s -n -o %s %s" % (threads, tmp_bam,
                                                       self.final_bam)
        job.append([[command]])
        job.run()
        command = "bedtools bamtobed -bedpe -mate1 -i %s | " % (tmp_bam)\
            + "gzip -c > %s" % (self.bedpe)
        job.append([[command]])
        job.run()
        # Collapse each pair into one fragment spanning both mates.
        command = """zcat %s | \
awk 'BEGIN{OFS="\t"; FS="\t"} { chrom=$1; beg=$2; end=$6; if($2>$5){beg=$5} if($3>$6){end=$3} print chrom,beg,end }' - | sort --parallel=%s -k1,1 -k2,2n | \
gzip -c > %s""" % (self.bedpe, threads, self.bed)
        job.append([[command]])
        job.run()
        job.append([["rm %s" % (tmp_bam)]])
        job.run()
    else:
        command = "bedtools bamtobed -i %s | cut -f1-3 | " % (self.final_bam)\
            + "sort --parallel=%s -k1,1 -k2,2n | " % (threads)\
            + "gzip -c > %s" % (self.bed)
        # BUG FIX: this command was previously built but never submitted
        # to the job runner, so single-end data produced no BED at all.
        job.append([[command]])
        job.run()
def cross_correlation(self, bedpe, phantompeak_script, cpu_num):
    """Estimate fragment length by cross-correlation (phantompeak R script).

    Subsamples up to 15M non-chrM entries from the BEDPE into a
    single-end tagAlign file, runs the phantompeak script on it, and
    rewrites the resulting score file so the sample column names the
    original BAM instead of the temporary tagAlign.

    Args:
        bedpe: path to the (gzipped) BEDPE file.
        phantompeak_script: path to the run_spp R script.
        cpu_num: CPUs handed to the R script (-p).

    Raises:
        Exception: if the BEDPE file does not exist.
    """
    if not os.path.exists(bedpe):
        raise Exception("BEDPE file does not exist")
    job = jr.JobRunner()
    prefix = bedpe.split(".bedpe.")[0]
    subsample_tags = prefix + ".subsample.tagAlign.gz"
    cc_plot = prefix + "_crossCorrPlot.QC.pdf"
    cc_score = prefix + "_crossCorrScore.QC.txt"
    # Build the subsampled tagAlign (chrom, start, end, N, 1000, strand).
    command = """zcat %s | grep -v 'chrM' | shuf -n 15000000 | \
awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"N","1000",$9}' | \
gzip -c > %s""" % (bedpe, subsample_tags)
    job.append([[command]])
    job.run()
    # Run the cross-correlation analysis; writes plot + score file.
    command = "Rscript %s " %(phantompeak_script)\
        + "-c=%s -p=%s -filtchr=chrM " %(subsample_tags, cpu_num)\
        + "-savp=%s -out=%s" %(cc_plot, cc_score)
    job.append([[command]])
    job.run()
    # Drop comma-separated alternates and rename the sample column from
    # the temporary tagAlign back to the BAM it was derived from.
    command = """sed -r 's/,[^\t]+//g' %s | \
awk -F ".subsample.tagAlign.gz" '{print $1".bam"$2}' \
>> %s.tmp""" % (cc_score, cc_score)
    job.append([[command]])
    job.run()
    # Replace the score file with the rewritten copy and clean up.
    job.append([["mv %s.tmp %s" % (cc_score, cc_score)]])
    job.append([["rm %s" % (subsample_tags)]])
    job.run()
def run(self):
    """Keep oracle peaks supported by both pseudoreplicates.

    For each peak flavour (narrow/broad/gapped) the oracle peak set is
    intersected (>=50% reciprocal overlap, either direction via -e)
    with the peaks of pseudoreplicate 00 and then 01; only oracle peaks
    surviving both intersections are written under <out>/<flavour>Peaks/.

    BUG FIX: the broadPeak and gappedPeak comparisons previously used
    the *narrowPeak* pseudoreplicate-01 file (np_psr2) instead of their
    own flavour's file.
    """
    job = jr.JobRunner()
    macs_dir = "%s/%s/macs_output" % (self.out, self.input_file)
    for peak_kind, out_subdir in (("narrowPeak", "narrowPeaks"),
                                  ("broadPeak", "broadPeaks"),
                                  ("gappedPeak", "gappedPeaks")):
        out_dir = "%s/%s/" % (self.out, out_subdir)
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)
        oracle = "%s/%s/%s.%s.gz" % (macs_dir, self.input_file,
                                     self.input_file, peak_kind)
        psr1 = "%s/psr_%s.00/psr_%s.00.%s.gz" % (macs_dir, self.input_file,
                                                 self.input_file, peak_kind)
        psr2 = "%s/psr_%s.01/psr_%s.01.%s.gz" % (macs_dir, self.input_file,
                                                 self.input_file, peak_kind)
        # NOTE(review): the redirected output is plain bedtools text even
        # though the name ends in .gz — confirm whether it should be
        # piped through gzip before this is relied on downstream.
        job.append([["bedtools intersect -a %s -b %s -f 0.50 -F 0.50 -e | "
                     "bedtools intersect -a stdin -b %s -f 0.50 -F 0.50 -e -u "
                     "> %s/%s/%s.final.%s.gz"
                     % (oracle, psr1, psr2, self.out, out_subdir,
                        self.input_file, peak_kind)]])
    job.run()
def run(self): job = jr.JobRunner() # Shuffle input print("Shuffle input") shuf_out = os.path.join(self.out, "shuf_%s.bedpe" % (self.prefix)) job.append([[ "zcat %s | shuf --random-source=%s > %s" % (self.input_file, self.input_file, shuf_out) ]]) job.run() # Split into two files print("split in two") psr_prefix = os.path.join(self.out, "psr_%s." % (self.prefix)) job.append([[ "split -d -nl/2 --additional-suffix=\".bedpe\"\ %s %s" % (shuf_out, psr_prefix) ]]) job.run() # TODO core to be set according to input parameters print("sort") job = jr.JobRunner() job.append([[ "sort --parallel=%s -S 2G \ -k1,1 -k2,2n %s00.bedpe | gzip -c > %s00.bedpe.gz" % (self.cpus, psr_prefix, psr_prefix) ]]) job.append([[ "sort --parallel=%s -S 2G \ -k1,1 -k2,2n %s01.bedpe | gzip -c > %s01.bedpe.gz" % (self.cpus, psr_prefix, psr_prefix) ]]) #job.append([["zcat %s | sort --parallel=%s -S 2G \ # -k1,1 -k2,2n -o %s && gzip -c %s" # %(self.input_file, self.cpus, self.input_file, self.input_file )]]) job.run() # Clean job.append([[ "rm %s00.bedpe %s01.bedpe %s" % (psr_prefix, psr_prefix, shuf_out) ]]) job.run()
def main(inputs):
    """Top-level pipeline driver.

    Persists the validated arguments to <outdir>/inputs.json and then
    launches the worker scripts through the job runner: mapping for
    every input and control library, pooling, and peak calling —
    either in pooled-control mode or one-control-per-input mode.

    Args:
        inputs: dict of validated pipeline arguments (check_arguments).
    """
    # Persist the arguments so the worker scripts can reload them.
    with open('%s/inputs.json' % (inputs['outdir']), 'w') as outfile:
        json.dump(inputs, outfile)
    job = jr.JobRunner(cpus=inputs['threads'])
    print("--- Welcome to the pipeline ---")
    if inputs['pooled']:
        print("Warning : The controls libraries will be pooled together.")
        # Map every input and every control library.
        for idx in range(len(inputs['inputs'])):
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json map %s inputs"
                % (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        for idx in range(len(inputs['controls'])):
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json map %s controls"
                % (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()
        # Pool the libraries before peak calling.
        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json pooling" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'])
        ]])
        job.run()
        for idx in range(len(inputs['inputs']) + 1):
            # the plus one is for the pooled input
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json peak %s" %
                (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()
    else:
        # One dedicated control per input library.
        for idx in range(len(inputs['inputs'])):
            job.append([[
                "python %s/run_psychip.py %s/inputs.json %s" %
                (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()
        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json pooling" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'])
        ]])
        job.run()
        # NOTE(review): the pooled branch calls "peak" for indices
        # 0..len(inputs); here a single call is made with index
        # len(inputs)+1, which looks off by one — confirm intent.
        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json peak %s" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'],
             len(inputs['inputs']) + 1)
        ]])
        job.run()
def clean(self):
    """Delete intermediate alignment files and forget their paths."""
    runner = jr.JobRunner()
    for path in (self.raw_sam, self.raw_bam, self.flt_bam):
        runner.append([["rm %s" % (path)]])
    runner.run()
    # The files are gone; drop the stale references as well.
    self.raw_sam = None
    self.raw_bam = None
    self.flt_bam = None
def peak(inputs, idx):
    """Call peaks for input library `idx` against the pooled controls.

    Resolves the fragment length (explicit --fraglen, spp
    cross-correlation output, or Picard InsertSizeMetrics), runs MACS2
    on the library and on both of its pseudoreplicates, then intersects
    the resulting peak sets (Overlap).

    Args:
        inputs: dict of validated pipeline arguments.
        idx: index into inputs['prefix_inputs'].
    """
    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        # Explicit fragment length supplied on the command line.
        insert = inputs['fraglen']
    elif inputs['spp']:
        # spp estimate: third column of the .cc.qc file.
        job.append([[
            "cat %s/%s/bwa_out/*.cc.qc | cut -f3" %
            (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    else:
        # Picard estimate: first field of the row after the
        # "## METRICS CLASS" header in InsertSizeMetrics.txt.
        job.append([[
            "grep -A2 \"## METRICS CLASS\" %s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
            % (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    ### PEAK
    # True replicate against the pooled controls.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']),
        insert, "BEDPE",
        "%s/%s/macs_output/%s" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    # Pseudoreplicate 00.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.00.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']),
        insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.00" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    # Pseudoreplicate 01.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.01.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']),
        insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.01" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    ### OVERLAP
    run_overlap = ovrlp.Overlap(inputs['prefix_inputs'][idx],
                                inputs['outdir'])
    run_overlap.run()
def get_mapstats(self, raw_bam, final_bam):
    """Run samtools flagstat on both BAMs, writing *.flagstat.QC.txt.

    Args:
        raw_bam: path to the raw BAM file.
        final_bam: path to the final (filtered) BAM file.

    Raises:
        Exception: if either BAM file does not exist.
    """
    if not os.path.exists(raw_bam):
        raise Exception("raw BAM file does not exist")
    if not os.path.exists(final_bam):
        raise Exception("final BAM file does not exist")
    runner = jr.JobRunner()
    flagstat = "samtools flagstat %s > %s"
    # "xxx.bam" -> "xxx.flagstat.QC.txt" (replace the "bam" suffix).
    for bam in (raw_bam, final_bam):
        runner.append([[flagstat % (bam, bam[:-3] + "flagstat.QC.txt")]])
    runner.run()
def get_library_complexity(self, flt_bam, is_paired=True, threads=1):
    """Compute ENCODE library-complexity metrics (NRF, PBC1, PBC2).

    Name-sorts the filtered BAM in place (bamtobed -bedpe requires name
    order), converts reads to positions, and tallies distinct /
    once-seen / twice-seen fragments into <prefix>.libComplexity.QC.txt.

    Args:
        flt_bam: path to the filtered BAM (replaced by its name-sorted copy).
        is_paired: whether the library is paired-end.
        threads: threads for samtools sort.

    Raises:
        Exception: if the filtered BAM file does not exist.
    """
    if not os.path.exists(flt_bam):
        raise Exception("filtered BAM file does not exist")
    job = jr.JobRunner()
    prefix = os.path.basename(flt_bam).split(".flt.")[0]
    tmp_bam = "%s/%s.tmp.bam" % (self.output_dir, prefix)
    qc_pbc = "%s/%s.libComplexity.QC.txt" % (self.output_dir, prefix)
    # Name-sort, then replace the input BAM with the sorted copy.
    command = "samtools sort -n -@ %s " % (threads)\
        + "-o %s %s" % (tmp_bam, flt_bam)
    job.append([[command]])
    job.run()
    job.append([["mv %s %s" % (tmp_bam, flt_bam)]])
    job.run()
    # Library Complexity
    # [1] TotalReadPairs
    # [2] DistinctReadPairs
    # [3] OneReadPairs
    # [4] TwoReadPairs
    # [5] NRF=Distinct/Total
    # [6] PBC1=OnePair/Distinct
    # [7] PBC2=OnePair/TwoPair
    command = """echo 1 | \
awk '{print "#Total\tDistinct\tOne\tTwo\tNRF\tPBC1\tPBC2"}' \
> %s""" % (qc_pbc)
    job.append([[command]])
    job.run()
    if is_paired:
        command = """bedtools bamtobed -bedpe -i %s | \
awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}' | """
    else:
        # BUG FIX: the single-end command contained a stray '"' after the
        # first pipe, producing an unterminated shell quote that broke
        # the pipeline.
        command = """bedtools bamtobed -i %s | \
awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}' | """
    # Count how many times each position signature occurs, then derive
    # the totals and the NRF/PBC ratios.
    command += """grep -v 'chrM' | sort | uniq -c | \
awk 'BEGIN{mt=0;m0=0;m1=0;m2=0; OFS="\t"} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{print mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'\
>> %s"""
    command = command % (flt_bam, qc_pbc)
    job.append([[command]])
    job.run()
def run(self, cpu_num):
    """Align the paired FASTQs with `bwa mem` into a gzipped raw SAM.

    Writes <output_dir>/<prefix>.raw.sam.gz, creating the output
    directory if needed.

    Args:
        cpu_num: number of threads for bwa mem (-t); must be > 0.

    Raises:
        Exception: if cpu_num is not a positive number.
    """
    if cpu_num <= 0:
        # BUG FIX: was "raise Exeption(...)" — a typo that raised
        # NameError instead of the intended Exception.
        raise Exception("The number of CPU must be > 0!")
    job = jr.JobRunner()
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    sam_output = "%s/%s.raw.sam.gz" % (self.output_dir, self.prefix)
    # -M marks shorter split hits as secondary (Picard compatibility).
    command = "bwa mem -M -t %s %s " % (cpu_num, self.genome_index) \
        + "%s %s " % (self.fastq1, self.fastq2) \
        + "| gzip -c > %s" % (sam_output)
    job.append([[command]])
    job.run()
def pool_libraries(inputs):
    """Concatenate and sort per-library BEDs into pooled oracle inputs.

    Writes pooled_controls.bed.gz and pooled_inputs.bed.gz under
    <outdir>/oracle_peaks/.
    """
    oracle_dir = "%s/oracle_peaks" % (inputs['outdir'])
    if not os.path.isdir(oracle_dir):
        os.mkdir(oracle_dir)
    runner = jr.JobRunner()

    def pooled_cmd(prefixes, out_name):
        # zcat every per-library BED, sort by chrom/start, recompress.
        beds = "".join("%s/%s/macs_input/%s.bed.gz " %
                       (inputs['outdir'], p, p) for p in prefixes)
        return ("zcat " + beds +
                "| sort --parallel=%s -k1,1 -k2,2n | gzip -c > %s/oracle_peaks/%s"
                % (inputs['cpus'], inputs['outdir'], out_name))

    runner.append([[pooled_cmd(inputs['prefix_controls'],
                               "pooled_controls.bed.gz")]])
    runner.append([[pooled_cmd(inputs['prefix_inputs'],
                               "pooled_inputs.bed.gz")]])
    runner.run()
def remove_artifacts(self, picard_jar, threads = 1, sponge = None):
    """Mark PCR duplicates and produce the final filtered BAM + index.

    Runs Picard MarkDuplicates on self.flt_bam (replacing it with the
    duplicate-marked copy), then filters flagged reads (-F1804, plus
    -f2 for paired data) and, optionally, reads matching "sponge"
    sequence names, writing self.final_bam and self.final_bai.

    Args:
        picard_jar: path to picard.jar.
        threads: number of threads passed to samtools (-@).
        sponge: optional file listing sponge names to exclude (grep -vF -f).

    Raises:
        Exception: if the filtered BAM file does not exist.
    """
    if not os.path.exists(self.flt_bam):
        raise Exception("filtered BAM file does not exist!")
    job = jr.JobRunner()
    tmp_bam = "%s/%s.tmp.bam" %(self.output_dir, self.prefix)
    qc_dups = "%s/%s.pcrDups.QC.txt" %(self.output_dir, self.prefix)
    # Mark PCR duplicates and generate QC file with Picard
    command = "java -Xmx4G -jar %s MarkDuplicates " %(picard_jar)\
        + "INPUT=%s OUTPUT=%s " %(self.flt_bam, tmp_bam)\
        + "METRICS_FILE=%s ASSUME_SORTED=true " %(qc_dups)\
        + "VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false"
    job.append([[ command ]])
    job.run()
    # Replace the filtered BAM with the duplicate-marked version.
    job.append([[ "mv %s %s" %(tmp_bam, self.flt_bam) ]])
    job.run()
    # -f2 restricts output to properly-paired reads (paired-end only).
    pair_opt = ""
    if self.is_paired:
        pair_opt = "-f2"
    if sponge:
        # Drop any SAM record matching a sponge name before re-encoding.
        command = "samtools view -@ %s " %(threads)\
            + "-F1804 %s -h %s | " %(pair_opt, self.flt_bam)\
            + "grep -vF -f %s - | " %(sponge)\
            + "samtools view -@ %s -b -o %s -" %(threads, self.final_bam)
        job.append([[ command ]])
    else:
        command = "samtools view -@ %s -F1804 %s -b " %(threads, pair_opt)\
            + "-o %s %s" %(self.final_bam, self.flt_bam)
        job.append([[ command ]])
    job.run()
    # Make BAM index file
    command = "samtools index %s %s" %(self.final_bam, self.final_bai)
    job.append([[ command ]])
    job.run()
def run_main(inputs, idx):
    """Full per-library pipeline for input/control pair `idx`.

    Processes the control library (alignment, filtering, QC, insert
    size estimation), then the input library (same steps plus
    pseudoreplicate generation, MACS2 peak calling against the control,
    and peak-set overlap).

    Args:
        inputs: dict of validated pipeline arguments.
        idx: index of the input/control pair to process.
    """
    ### ALIGN CONTROL LIBRARIES
    run_map_control = mp.BwaMapper(
        inputs['prefix_controls'][idx],  #prefix
        inputs['index'],  # bwa index
        os.path.join(os.path.join(inputs['outdir'],
                                  inputs['prefix_controls'][idx]),
                     "bwa_out"),  # output directory
        inputs['controls'][idx][0],  # fastq read 1
        inputs['controls'][idx][1])  # fastq read 2
    run_map_control.run(inputs['cpus'])  # CPU for BWA
    # Output: R1.raw.sam.gz
    # Format: <prefix>.raw.sam.gz
    ### Post alignment filtering
    run_filter_control = pf.PostFilter(
        "%s/%s/bwa_out/%s.raw.sam.gz" % (inputs['outdir'],
                                         inputs['prefix_controls'][idx],
                                         inputs['prefix_controls'][idx]),
        "%s/%s/bwa_out" % (inputs['outdir'],
                           inputs['prefix_controls'][idx]))  # output directory
    ## Step 1: remove reads with bad CIGAR strings
    run_filter_control.remove_badcigar(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.raw.bam
    # Format: <prefix>.raw.bam
    ## Step 2: remove unmapped, low quality, orphan, and supplemental reads
    run_filter_control.remove_aberrant_reads(
        inputs['cpus'])  # Threads for SAMtools
    # Output: R1.flt.bam
    # Format: <prefix>.flt.bam
    ## Step 3: remove PCR duplicates and sponge mapping reads
    ## (currently, "java -Xmx4G". Please change it if needed)
    run_filter_control.remove_artifacts(
        inputs['picard'],  # path to picard.jar
        inputs['threads'],
        inputs['sponges'])  # sponge name list
    # Output: R1.final.[bam|bai], R1.pcrDups.QC.txt
    ## Step 4: generate BED and BEDPE files
    ## (those files are *unsorted*)
    run_filter_control.generate_bed(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.bed.gz (for pseudorep)
    # Output: R1.bedpe.gz (bedtool raw output)
    # Move the control BED to the directory MACS2 reads from.
    if not os.path.isdir("%s/%s/macs_input/" %
                         (inputs['outdir'], inputs['prefix_controls'][idx])):
        os.mkdir("%s/%s/macs_input/" %
                 (inputs['outdir'], inputs['prefix_controls'][idx]))
    if not os.path.isfile("%s/%s/macs_input/%s.bed.gz" %
                          (inputs['outdir'], inputs['prefix_controls'][idx],
                           inputs['prefix_controls'][idx])):
        shutil.move(
            "%s/%s/bwa_out/%s.bed.gz" % (inputs['outdir'],
                                         inputs['prefix_controls'][idx],
                                         inputs['prefix_controls'][idx]),
            "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                            inputs['prefix_controls'][idx],
                                            inputs['prefix_controls'][idx]))
    ### QC part INPUT LIBRARIES
    control_qc = qc.QC(
        "%s/%s/bwa_out/" % (inputs['outdir'],
                            inputs['prefix_controls'][idx]))  # output direcotry
    ## Step 1: Get map stats
    control_qc.get_mapstats(
        "%s/%s/bwa_out/%s.raw.bam" % (inputs['outdir'],
                                      inputs['prefix_controls'][idx],
                                      inputs['prefix_controls'][idx]),
        "%s/%s/bwa_out/%s.final.bam" % (inputs['outdir'],
                                        inputs['prefix_controls'][idx],
                                        inputs['prefix_controls'][idx]))
    # Output: R1.final.flagstat.QC.txt
    ## Step 2: Compute library complexity
    # NOTE(review): get_library_complexity(flt_bam, is_paired=True,
    # threads=1) — the CPU count passed positionally here lands in
    # `is_paired`, not `threads`; likely should be
    # threads=inputs['cpus'] — confirm.
    control_qc.get_library_complexity(
        "%s/%s/bwa_out/%s.flt.bam" % (inputs['outdir'],
                                      inputs['prefix_controls'][idx],
                                      inputs['prefix_controls'][idx]),
        inputs['cpus'])  # Threads for SAMtools
    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']
    elif inputs['spp']:
        ### Find insert size
        control_xcor = xcor.Xcor(
            "%s/%s/bwa_out/%s.final.bam" % (inputs['outdir'],
                                            inputs['prefix_controls'][idx],
                                            inputs['prefix_controls'][idx]),
            "%s/common/run_spp_nodups.R" %
            (os.path.dirname(os.path.abspath(__file__))),
            inputs['cpus'])
        control_xcor.process()
    else:
        # Picard insert-size metrics for the control library.
        job.append([["java -jar %s CollectInsertSizeMetrics I=%s/%s/bwa_out/%s.final.bam\
 O=%s/%s/bwa_out/InsertSizeMetrics.txt AS=F H=%s/%s/bwa_out/InsertSizeMetrics.histogram"
                     % (inputs['picard'], inputs['outdir'],
                        inputs['prefix_controls'][idx],
                        inputs['prefix_controls'][idx],
                        inputs['outdir'], inputs['prefix_controls'][idx],
                        inputs['outdir'], inputs['prefix_controls'][idx])]])
        job.run()
    run_filter_control.clean()
    ### END CONTROL LIBRARIES
    ### ALIGN INPUT LIBRARIES
    run_map_input = mp.BwaMapper(
        inputs['prefix_inputs'][idx],  #prefix
        inputs['index'],  # bwa index
        os.path.join(os.path.join(inputs['outdir'],
                                  inputs['prefix_inputs'][idx]),
                     "bwa_out"),  # output directory
        inputs['inputs'][idx][0],  # fastq read 1
        inputs['inputs'][idx][1])  # fastq read 2
    run_map_input.run(inputs['cpus'])  # CPU for BWA
    # Output: R1.raw.sam.gz
    # Format: <prefix>.raw.sam.gz
    ### Post alignment filtering
    run_filter_input = pf.PostFilter(
        "%s/%s/bwa_out/%s.raw.sam.gz" % (inputs['outdir'],
                                         inputs['prefix_inputs'][idx],
                                         inputs['prefix_inputs'][idx]),
        "%s/%s/bwa_out" % (inputs['outdir'],
                           inputs['prefix_inputs'][idx]))  # output directory
    ## Step 1: remove reads with bad CIGAR strings
    run_filter_input.remove_badcigar(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.raw.bam
    # Format: <prefix>.raw.bam
    ## Step 2: remove unmapped, low quality, orphan, and supplemental reads
    run_filter_input.remove_aberrant_reads(
        inputs['cpus'])  # Threads for SAMtools
    # Output: R1.flt.bam
    # Format: <prefix>.flt.bam
    ## Step 3: remove PCR duplicates and sponge mapping reads
    ## (currently, "java -Xmx4G". Please change it if needed)
    run_filter_input.remove_artifacts(
        inputs['picard'],  # path to picard.jar
        inputs['threads'],
        inputs['sponges'])  # sponge name list
    # Output: R1.final.[bam|bai], R1.pcrDups.QC.txt
    ## Step 4: generate BED and BEDPE files
    ## (those files are *unsorted*)
    run_filter_input.generate_bed(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.bed.gz (for pseudorep)
    # Output: R1.bedpe.gz (bedtool raw output)
    # Move the input BED to the directory MACS2 reads from.
    if not os.path.isdir("%s/%s/macs_input/" %
                         (inputs['outdir'], inputs['prefix_inputs'][idx])):
        os.mkdir("%s/%s/macs_input/" %
                 (inputs['outdir'], inputs['prefix_inputs'][idx]))
    if not os.path.isfile("%s/%s/macs_input/%s.bed.gz" %
                          (inputs['outdir'], inputs['prefix_inputs'][idx],
                           inputs['prefix_inputs'][idx])):
        shutil.move(
            "%s/%s/bwa_out/%s.bed.gz" % (inputs['outdir'],
                                         inputs['prefix_inputs'][idx],
                                         inputs['prefix_inputs'][idx]),
            "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                            inputs['prefix_inputs'][idx],
                                            inputs['prefix_inputs'][idx]))
    ### QC part INPUT LIBRARIES
    test_qc = qc.QC(
        "%s/%s/bwa_out/" % (inputs['outdir'],
                            inputs['prefix_inputs'][idx]))  # output direcotry
    ## Step 1: Get map stats
    test_qc.get_mapstats(
        "%s/%s/bwa_out/%s.raw.bam" % (inputs['outdir'],
                                      inputs['prefix_inputs'][idx],
                                      inputs['prefix_inputs'][idx]),
        "%s/%s/bwa_out/%s.final.bam" % (inputs['outdir'],
                                        inputs['prefix_inputs'][idx],
                                        inputs['prefix_inputs'][idx]))
    # Output: R1.final.flagstat.QC.txt
    ## Step 2: Compute library complexity
    # NOTE(review): same positional-argument concern as the control
    # library above — inputs['cpus'] fills `is_paired`.
    test_qc.get_library_complexity(
        "%s/%s/bwa_out/%s.flt.bam" % (inputs['outdir'],
                                      inputs['prefix_inputs'][idx],
                                      inputs['prefix_inputs'][idx]),
        inputs['cpus'])  # Threads for SAMtools
    # Output: R1.libComplexity.QC.txt
    ## Find insert size
    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']
    elif inputs['spp']:
        ### Find insert size
        input_xcor = xcor.Xcor(
            "%s/%s/bwa_out/%s.final.bam" % (inputs['outdir'],
                                            inputs['prefix_inputs'][idx],
                                            inputs['prefix_inputs'][idx]),
            "%s/common/run_spp_nodups.R" %
            (os.path.dirname(os.path.abspath(__file__))),
            inputs['cpus'])
        input_xcor.process()
        # Fragment length is the third column of the spp .cc.qc output.
        job.append([[
            "cat %s/%s/bwa_out/*.cc.qc | cut -f3" %
            (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = int(job.run()[0].rstrip())
        #with open(,'r') as fh:
        #    firstline = fh.readline()
        #    insert = firstline.split()[2] #third column
    else:
        job.append([["java -jar %s CollectInsertSizeMetrics I=%s/%s/bwa_out/%s.final.bam\
 O=%s/%s/bwa_out/InsertSizeMetrics.txt AS=F H=%s/%s/bwa_out/InsertSizeMetrics.histogram"
                     % (inputs['picard'], inputs['outdir'],
                        inputs['prefix_inputs'][idx],
                        inputs['prefix_inputs'][idx],
                        inputs['outdir'], inputs['prefix_inputs'][idx],
                        inputs['outdir'], inputs['prefix_inputs'][idx])]])
        job.run()
        # NOTE(review): here `insert` stays a string (no int()), unlike
        # the spp branch above — confirm downstream tolerates both.
        job.append([[
            "grep -A2 \"## METRICS CLASS\" %s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
            % (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    run_filter_input.clean()
    ### PSEUDOREP
    run_psr = psr.PseudorepsGenerator(
        "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                        inputs['prefix_inputs'][idx],
                                        inputs['prefix_inputs'][idx]),
        inputs['prefix_inputs'][idx],
        "%s/%s/macs_input/" % (inputs['outdir'],
                               inputs['prefix_inputs'][idx]),
        inputs['threads'],
        inputs['cpus'])
    run_psr.run()
    ### PEAK
    # True replicate against its own control.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                        inputs['prefix_inputs'][idx],
                                        inputs['prefix_inputs'][idx]),
        "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                        inputs['prefix_controls'][idx],
                                        inputs['prefix_controls'][idx]),
        insert, "BEDPE",
        "%s/%s/macs_output/%s" % (inputs['outdir'],
                                  inputs['prefix_inputs'][idx],
                                  inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    # Pseudoreplicate 00.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.00.bedpe.gz" % (inputs['outdir'],
                                                 inputs['prefix_inputs'][idx],
                                                 inputs['prefix_inputs'][idx]),
        "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                        inputs['prefix_controls'][idx],
                                        inputs['prefix_controls'][idx]),
        insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.00" % (inputs['outdir'],
                                         inputs['prefix_inputs'][idx],
                                         inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    # Pseudoreplicate 01.
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.01.bedpe.gz" % (inputs['outdir'],
                                                 inputs['prefix_inputs'][idx],
                                                 inputs['prefix_inputs'][idx]),
        "%s/%s/macs_input/%s.bed.gz" % (inputs['outdir'],
                                        inputs['prefix_controls'][idx],
                                        inputs['prefix_controls'][idx]),
        insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.01" % (inputs['outdir'],
                                         inputs['prefix_inputs'][idx],
                                         inputs['prefix_inputs'][idx]),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
    ### OVERLAP
    run_overlap = ovrlp.Overlap(inputs['prefix_inputs'][idx],
                                inputs['outdir'])
    run_overlap.run()
def check_arguments(args):
    """Validate CLI arguments and build the pipeline `inputs` dict.

    Checks the sample sheet, output directory, BWA index, genome file,
    optional sponge list, MACS2/spp availability and fragment-length
    options, and resolves bundled tool paths.

    Args:
        args: dict of parsed command-line arguments.

    Returns:
        dict of validated and derived pipeline settings.

    Raises:
        Exception: on any missing file, pre-existing output directory,
            or inconsistent option combination.
    """
    checker = u.Utils()
    inputs = {}
    try:
        # Check if the input files exist and are well formatted
        # NOTE(review): the isfile() result is discarded — presumably
        # input_integrity() performs the real validation; confirm.
        os.path.isfile(args['f'])
        inputs['inputs'], inputs['controls'], inputs['prefix_inputs'], inputs[
            'prefix_controls'] = input_integrity(args['f'])
        # Check if the output folder exist
        if os.path.isdir(args['o']):
            raise Exception(
                "mkdir: cannot create directory '%s': File exists." %
                (args['o']))
        else:
            os.makedirs(args['o'])
        inputs['outdir'] = args['o']
        # Check if Bowtie index exists
        if args['b'].endswith('.bwt'):
            prefix_index = os.path.splitext(args['b'])[0]
        else:
            prefix_index = args['b']
        # All five BWA index companion files must be present.
        for suffix in ['.bwt', '.amb', '.ann', '.pac', '.sa']:
            if not os.path.isfile("%s%s" % (prefix_index, suffix)):
                raise Exception("The BWA index file '%s%s' does not exist." %
                                (prefix_index, suffix))
        inputs['index'] = prefix_index
        # Check if genome file exists
        if not os.path.isfile(args['g']):
            raise Exception("The genome chromInfo file '%s' does not exist." %
                            (args['g']))
        inputs['genome'] = args['g']
        if args['sponges'] != None:
            if not os.path.isfile(args['sponges']):
                raise Exception("The sponges file '%s' does not exist." %
                                (args['sponges']))
            inputs['sponges'] = args['sponges']
        else:
            inputs['sponges'] = None
        if args['macs2'] != None:
            if os.path.exists(args['macs2']):
                inputs['macs2'] = args['macs2']
            else:
                raise Exception("Macs2 not found in '%s'" % (args['macs2']))
        else:
            # Fall back to whatever `macs2` is on PATH.
            try:
                checker.which('macs2')
                inputs['macs2'] = "macs2"
            except Exception as e:
                raise (e)
        # if not os.path.exists(args['picard']):
        #     raise Exception("Picard not found in '%s'" %(args['picard']))
        # else:
        #     inputs['picard'] = args['picard']
        inputs['fraglen'] = -1
        # NOTE(review): when --fraglen is given (and --spp is not),
        # inputs['spp'] is never set; downstream code only reads it when
        # fraglen <= 0, but confirm no other consumer expects the key.
        if args['fraglen'] != None:
            if args['spp']:
                raise Exception(
                    "'--spp' and '--fraglen' are mutually exclusive. Please select only one."
                )
            else:
                inputs['fraglen'] = args['fraglen']
        else:
            if args['spp']:
                # Probe that R can load the spp package before committing.
                try:
                    j = jr.JobRunner()
                    j.append([['Rscript ./common/test.R']])
                    j.run()
                    inputs['spp'] = True
                except:
                    raise Exception(
                        "Spp is not currently installed or cannot be loaded.")
            else:
                inputs['spp'] = False
                # Redundant (already -1 above); kept as-is.
                inputs['fraglen'] = -1
                print('Picard tools will be used to estimate insert size.')
        inputs['threads'] = args['t']
        inputs['cpus'] = args['c']
        # Paths bundled alongside this script.
        inputs['common'] = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "common")
        inputs['picard'] = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "common/picard-tools-1.141/picard.jar")
        # If not enough controls are supplied, use pooled controls.
        if len(inputs['inputs']) != len(inputs['controls']):
            inputs['pooled'] = True
        else:
            inputs['pooled'] = False
        print("Argument Verification completed")
    except Exception as e:
        raise (e)
    return inputs
def run(self):
    """Call peaks with MACS2 and produce signal tracks.

    Runs MACS2 twice (narrow, then broad/gapped peaks) on the experiment
    versus control tag files, rescales and rank-sorts the peak files,
    generates fold-enrichment and -log10(p-value) bigWig signal tracks,
    and converts the peak files to bigBed for track-hub visualization.
    All output paths are recorded on self.* attributes.
    """
    job = jr.JobRunner()
    # Define the output filenames.
    peaks_dirname = self.output_name
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = self.experiment_name.split("/")[-1]
    if prefix.endswith('.bedpe.gz'):
        prefix = prefix[:-9]
    elif prefix.endswith('.bed.gz'):
        prefix = prefix[:-7]
    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    self.narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    self.gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    self.broadPeak_gz_fn = broadPeak_fn + ".gz"
    self.narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    self.gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    self.broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    self.fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    self.pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)
    # Fragment-length estimate, used directly as the MACS2 --extsize value.
    # NOTE(review): an earlier version read this from column 3 of the
    # cross-correlation scores file; confirm callers now pass the value.
    fraglen = self.xcor_scores_input_name
    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ===========================================
    command = '%s callpeak ' % (self.macs2) + \
        '-t %s -c %s ' % (self.experiment_name, self.control_name) + \
        '-f %s -n %s/%s ' % (self.input_type, peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s ' \
        % (self.genomesize, fraglen) + \
        '--keep-dup all -B --SPMR'
    job.append([["%s" % (command)]])
    job.run()
    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as
    # format (score must be <1000).
    rescaled_narrowpeak_fn = utils.common.rescale_scores(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), scores_col=5)
    # Sort by Col8 in descending order and replace long peak names in
    # Column 4 with Peak_<peakRank>.
    command = "sort -k 8gr,8gr %s | " % (rescaled_narrowpeak_fn) + \
        "awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; print $0}' | " + \
        "tee %s | gzip -c > %s" % (narrowPeak_fn, self.narrowPeak_gz_fn)
    job.append([["%s" % (command)]])
    job.run()
    # Remove intermediate MACS2 output.
    # BUG FIX: the summits file is written inside peaks_dirname (MACS2 ran
    # with -n peaks_dirname/prefix); the old command targeted a bare
    # '<prefix>_summits.bed' in the CWD and left the real file behind.
    command = "rm -f %s/%s_peaks.xls %s/%s_peaks.bed %s/%s_summits.bed" % (
        peaks_dirname, prefix, peaks_dirname, prefix, peaks_dirname, prefix)
    job.append([["%s" % (command)]])
    job.run()
    # ===========================================
    # Generate Broad and Gapped Peaks
    # ===========================================
    command = '%s callpeak ' % (self.macs2) + \
        '-t %s -c %s ' % (self.experiment_name, self.control_name) + \
        '-f %s -n %s/%s ' % (self.input_type, peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 ' % (self.genomesize) + \
        '--extsize %s --keep-dup all' % (fraglen)
    job.append([["%s" % (command)]])
    job.run()
    # Rescale Col5 scores (broadPeak), sort by Col8, rename peaks.
    rescaled_broadpeak_fn = utils.common.rescale_scores(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), scores_col=5)
    command = "sort -k 8gr,8gr %s | " % (rescaled_broadpeak_fn) + \
        "awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; print $0}' | " + \
        "tee %s | gzip -c > %s" % (broadPeak_fn, self.broadPeak_gz_fn)
    job.append([["%s" % (command)]])
    job.run()
    # Rescale Col5 scores (gappedPeak), sort by Col14, rename peaks.
    rescaled_gappedpeak_fn = utils.common.rescale_scores(
        '%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix), scores_col=5)
    command = "sort -k 14gr,14gr %s | " % (rescaled_gappedpeak_fn) + \
        "awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; print $0}' | " + \
        "tee %s | gzip -c > %s" % (gappedPeak_fn, self.gappedPeak_gz_fn)
    job.append([["%s" % (command)]])
    job.run()
    # Remove intermediate MACS2 output (summits path fixed as above).
    job.append([["rm -f %s/%s_peaks.xls %s/%s_peaks.bed %s/%s_summits.bed" % (
        peaks_dirname, prefix, peaks_dirname, prefix, peaks_dirname,
        prefix)]])
    job.run()
    # ===========================================
    # Fold-enrichment signal track
    # ===========================================
    command = '%s bdgcmp ' % (self.macs2) + \
        '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' % (peaks_dirname, prefix) + \
        '-m FE'
    job.append([["%s" % (command)]])
    job.run()
    # Remove coordinates outside chromosome sizes (workaround for a MACS2
    # bug). chrom_sizes is a 2-column tab file: chromosome name, size (bp).
    job.append([["bedtools slop -i %s/%s_FE.bdg -g %s -b 0 | "
                 "%s/bedClip stdin %s %s/%s.fc.signal.bedgraph" % (
                     peaks_dirname, prefix, self.chrom_sizes_name,
                     self.common, self.chrom_sizes_name, peaks_dirname,
                     prefix)]])
    job.run()
    # Convert bedgraph to bigwig.
    command = '%s/bedGraphToBigWig ' % (self.common) + \
        '%s/%s.fc.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s %s' % (self.chrom_sizes_name, self.fc_signal_fn)
    job.append([["%s" % (command)]])
    job.run()
    # ===========================================
    # -log10(p-value) signal track
    # ===========================================
    # sval = min(reads in ChIP, reads in control) / 1,000,000
    if (self.input_type == "BED" or self.input_type == "BEDPE"):
        # Gzipped tag files: count lines.
        job.append([['gzip -dc %s' % (self.experiment_name) + '|wc -l']])
        chipReads = int(job.run()[0])
        job.append([['gzip -dc %s' % (self.control_name) + '|wc -l']])
        controlReads = int(job.run()[0])
    else:
        # BAM input: sum mapped-read counts from the index stats.
        job.append([["samtools idxstats %s | awk '{sum=sum+$3}END{print sum}'"
                     % (self.experiment_name)]])
        chipReads = int(job.run()[0])
        job.append([["samtools idxstats %s | awk '{sum=sum+$3}END{print sum}'"
                     % (self.control_name)]])
        controlReads = int(job.run()[0])
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)
    # BUG FIX: this was a Python 2 print statement (a SyntaxError under
    # Python 3); the rest of the file uses the print() function.
    print("chipReads = %s, controlReads = %s, sval = %s" %
          (chipReads, controlReads, sval))
    job.append([['%s bdgcmp ' % (self.macs2) +
                 '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
                 '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
                 '--outdir %s -o %s_ppois.bdg ' % (peaks_dirname, prefix) +
                 '-m ppois -S %s' % (sval)]])
    job.run()
    # Remove coordinates outside chromosome sizes (MACS2 bug workaround).
    job.append([['bedtools slop -i %s/%s_ppois.bdg -g %s -b 0'
                 % (peaks_dirname, prefix, self.chrom_sizes_name) +
                 '| %s/bedClip stdin %s %s/%s.pval.signal.bedgraph'
                 % (self.common, self.chrom_sizes_name, peaks_dirname,
                    prefix)]])
    job.run()
    job.append([["rm -rf %s/%s_ppois.bdg" % (peaks_dirname, prefix)]])
    job.run()
    # Convert bedgraph to bigwig.
    command = '%s/bedGraphToBigWig ' % (self.common) + \
        '%s/%s.pval.signal.bedgraph ' % (peaks_dirname, prefix) + \
        '%s %s' % (self.chrom_sizes_name, self.pvalue_signal_fn)
    job.append([["%s" % (command)]])
    job.run()
    job.append([["rm -f %s/%s.pval.signal.bedgraph" % (peaks_dirname,
                                                       prefix)]])
    # BUG FIX: the control_lambda bdg also lives inside peaks_dirname; the
    # old command targeted a bare '<prefix>_control_lambda.bdg' in the CWD.
    job.append([["rm -f %s/%s_treat_pileup.bdg %s/%s_control_lambda.bdg" % (
        peaks_dirname, prefix, peaks_dirname, prefix)]])
    job.run()
    # ===========================================
    # bigBed conversions to support track-hub visualization of peak files
    # ===========================================
    narrowPeak_bb_fname = utils.common.bed2bb(
        narrowPeak_fn, self.chrom_sizes_name, self.narrowPeak_as_name,
        bed_type='bed6+4')
    gappedPeak_bb_fname = utils.common.bed2bb(
        gappedPeak_fn, self.chrom_sizes_name, self.gappedPeak_as_name,
        bed_type='bed12+3')
    broadPeak_bb_fname = utils.common.bed2bb(
        broadPeak_fn, self.chrom_sizes_name, self.broadPeak_as_name,
        bed_type='bed6+3')
def process(self):
    """Build tagAlign/BEDPE files from the BAM and run cross-correlation QC.

    Produces the intermediate and gzipped tagAlign files, a name-sorted
    BEDPE for paired-end data, a 15M-read non-chrM subsample, and the spp
    cross-correlation scores/plot. Output paths are recorded on self.*
    attributes for downstream steps.
    """
    runner = jr.JobRunner()
    self.intermediate_TA_filename = self.input_bam_basename + ".tagAlign"
    infix = 'PE2SE' if self.paired_end else 'SE'
    self.final_TA_filename = (
        self.input_bam_basename + '.' + infix + '.tagAlign.gz')
    # ---- Create tagAlign: bamToBed, force name/score columns, tee+gzip ----
    runner.append([[
        "bamToBed -i %s | " % (self.input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "| tee %s | " % (self.intermediate_TA_filename),
        "gzip -c > %s" % (self.final_TA_filename)
    ]])
    runner.run()
    # ---- Create BEDPE (paired-end only); needs a name-sorted BAM ----
    if self.paired_end:
        self.final_BEDPE_filename = self.input_bam_basename + ".bedpe.gz"
        self.final_nmsrt_bam_prefix = self.input_bam_basename + ".nmsrt"
        self.final_nmsrt_bam_filename = self.final_nmsrt_bam_prefix + ".bam"
        runner.append([["samtools sort -@ %s -n -o %s %s" % (
            self.cpus, self.final_nmsrt_bam_filename,
            self.input_bam_filename)]])
        runner.run()
        runner.append([[
            "bedtools bamtobed -bedpe -mate1 -i %s | " % (
                self.final_nmsrt_bam_filename),
            "gzip -c > %s" % (self.final_BEDPE_filename)
        ]])
        runner.run()
    # ---- Subsample the tagAlign file (15M reads, chrM excluded) ----
    sample_reads = 15000000
    infix = 'MATE1' if self.paired_end else 'SE'
    self.subsampled_TA_filename = (
        self.input_bam_basename +
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (sample_reads // 1000000,
                                                  infix))
    pieces = [
        'grep -v "chrM" %s | ' % (self.intermediate_TA_filename),
        'shuf -n %d | ' % (sample_reads),
    ]
    if self.paired_end:
        pieces.append(
            r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' | """)
    pieces.append('gzip -c > %s' % (self.subsampled_TA_filename))
    pipeline = ''.join(pieces)
    print(pipeline)
    runner.append([[pipeline]])
    runner.run()
    # ---- Calculate cross-correlation QC scores ----
    # CC_SCORE file columns: Filename, numReads, estFragLen,
    # corr_estFragLen, PhantomPeak, corr_phantomPeak, argmin_corr,
    # min_corr, phantomPeakCoef, relPhantomPeakCoef, QualityTag
    self.CC_scores_filename = self.subsampled_TA_filename + ".cc.qc"
    self.CC_plot_filename = self.subsampled_TA_filename + ".cc.plot.pdf"
    print("Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % (
        self.spp, self.subsampled_TA_filename, self.cpus,
        self.CC_plot_filename, self.CC_scores_filename))
    runner.append([[
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s > /dev/null 2>&1" % (
            self.spp, self.subsampled_TA_filename, self.cpus,
            self.CC_plot_filename, self.CC_scores_filename)]])
    runner.run()
    # Drop everything after a comma within each tab-separated field of the
    # scores file, then move the cleaned copy back into place.
    runner.append([[
        r"""sed -r 's/,[^\t]+//g' %s > %s """ % (self.CC_scores_filename,
                                                 "temp")]])
    runner.run()
    runner.append([["mv temp %s" % (self.CC_scores_filename)]])
    runner.run()