def splitFile(file_in, outputdir):
    """Shuffle a gzipped tagAlign file and split it into two pseudo-replicates.

    The input is temporarily gunzipped in place to count its lines, then
    re-gzipped.  Its shuffled content is split into two chunks which end up as
    ``<outputdir><prefix>_PR1.tagAlign.gz`` and ``..._PR2.tagAlign.gz``.

    Parameters:
        file_in: path of the ``.tagAlign.gz`` file to split.  The unzipped
            file is looked up as ``outputdir + getPrefix(file_in) +
            ".tagAlign"``, so the file is expected to live in ``outputdir``.
        outputdir: directory prefix (concatenated as-is, so it must end with
            a path separator).

    Returns:
        None.  Exit codes of the shell commands are not checked.
    """
    print("")
    print("splitting file :" + file_in + "...")

    # Unzip the file so that its lines can be counted from Python.
    subprocess.call("gunzip " + file_in, shell=True)
    prprefix = outputdir + getPrefix(file_in)
    unzipname = prprefix + ".tagAlign"

    # Stream the file instead of loading it entirely, and make sure the
    # handle is closed before gzip touches the file again.
    with open(unzipname, 'rU') as handle:
        linecount = sum(1 for _ in handle)

    # Round the half UP: with a rounded-down half, an odd number of lines made
    # `split` emit a third one-line chunk ("<prefix>02") that was never
    # compressed, renamed or used.
    linecount = (linecount + 1) // 2

    subprocess.call("gzip " + unzipname, shell=True)

    # Shuffle the reads, then split them into numerically suffixed chunks
    # ("<prefix>00", "<prefix>01").
    subprocess.call(
        "zcat " + file_in + "|shuf|split -d -l " + str(linecount) + " - " + prprefix,
        shell=True)

    # Compress both chunks, then give them their pseudo-replicate names.
    subprocess.call("gzip " + prprefix + "00", shell=True)
    subprocess.call("gzip " + prprefix + "01", shell=True)
    subprocess.call("mv " + prprefix + "00.gz " + prprefix + "_PR1.tagAlign.gz", shell=True)
    subprocess.call("mv " + prprefix + "01.gz " + prprefix + "_PR2.tagAlign.gz", shell=True)

    print(file_in + " splitted...")
    return
def createFinalSets2(chipfile, control, np, outputdir, prefix):
    """Write the first `np` peaks of a ChIP-vs-control regionPeak file.

    The regionPeak file is read from ``<outputdir>/PeakCalling/``, sorted on
    column 7 (numeric, descending), truncated to `np` lines and written
    gzipped to ``<outputdir>/finalsets/<prefix>_spp.regionPeak.gz``.

    Parameters:
        chipfile: ChIP tagAlign file whose prefix names the peak file.
        control: control tagAlign file whose prefix names the peak file.
        np: number of peaks to keep.
        outputdir: pipeline output directory.
        prefix: prefix of the generated final-set file.
    """
    print("Creating peak set...")
    peakname = (getPrefix(chipfile) + ".tagAlign_VS_" + getPrefix(control)
                + ".tagAlign.regionPeak.gz")
    source = outputdir + "/PeakCalling/" + peakname
    target = outputdir + "/finalsets/" + prefix + "_spp.regionPeak.gz"

    # Shell pipeline: decompress, rank, keep the top np lines, recompress.
    stages = [
        "zcat " + source,
        "sort -k7nr,7nr",
        "head -n " + str(np),
        "gzip -c > " + target,
    ]
    subprocess.call("|".join(stages), shell=True)
    print("Done")
    return
def plotResults2(outputdir, rep1, rep2):
    """Run the two R plotting scripts on the IDR results of one file pair.

    Parameters:
        outputdir: pipeline output directory; results are read from
            ``<outputdir>/IDR/`` and plots go to ``<outputdir>/IDR/plots/``.
        rep1, rep2: the two files that were compared; their prefixes form
            the ``<rep1>_VS_<rep2>`` base name.
    """
    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    pair = getPrefix(rep1) + "_VS_" + getPrefix(rep2)
    plotbase = outputdir + "/IDR/plots/" + pair
    idrbase = outputdir + "/IDR/" + pair

    commands = (
        # batch consistency plot for a single comparison
        "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 1 " + plotbase + " " + idrbase
        + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        # plots built from the overlapped-peaks table
        "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + idrbase
        + "-overlapped-peaks.txt -o=" + plotbase,
    )
    for command in commands:
        subprocess.call(command, shell=True)
    print("Done")
    return
def plotResults2(outputdir, rep1, rep2):
    """Plot the IDR results of a single pair of files with the R scripts.

    Parameters:
        outputdir: pipeline output directory; results are read from
            ``<outputdir>/IDR/`` and plots go to ``<outputdir>/IDR/plots/``.
        rep1, rep2: the two compared files; their prefixes form the
            ``<rep1>_VS_<rep2>`` base name.
    """
    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    pair = getPrefix(rep1) + "_VS_" + getPrefix(rep2)
    plotbase = outputdir + "/IDR/plots/" + pair
    idrbase = outputdir + "/IDR/" + pair

    commands = (
        # batch consistency plot for a single comparison
        "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 1 " + plotbase + " " + idrbase
        + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        # plots built from the overlapped-peaks table
        "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + idrbase
        + "-overlapped-peaks.txt -o=" + plotbase,
    )
    for command in commands:
        subprocess.call(command, shell=True)
    print("Done")
    return
def consistency(file1, file2, control, outputdir):
    """Run the IDR batch consistency analysis between two peak files.

    The regionPeak files of `file1` and `file2` (each called against
    `control`) are read from ``<outputdir>/PeakCalling/`` and the R script
    output is written under ``<outputdir>/IDR/<file1>_VS_<file2>``.

    Parameters:
        file1, file2: tagAlign files whose peak calls are compared.
        control: control tagAlign file used for both peak calls.
        outputdir: pipeline output directory.
    """
    print("")
    print("IDR analysis for :" + file1 + " vs " + file2 + "...")
    name1 = getPrefix(file1)
    name2 = getPrefix(file2)
    peak_suffix = ".tagAlign_VS_" + getPrefix(control) + ".tagAlign.regionPeak.gz"
    peakdir = outputdir + "/PeakCalling/"

    arguments = [
        "Rscript $RCHIPpipe_PATH/batch-consistency-analysis.r",
        peakdir + name1 + peak_suffix,
        peakdir + name2 + peak_suffix,
        "-1",
        outputdir + "/IDR/" + name1 + "_VS_" + name2,
        "0 F signal.value",
        "$RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        "$RCHIPpipe_PATH/genome_table.txt",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
    return
def consistency(file1, file2, control, outputdir):
    """Perform the IDR analysis between the peak calls of two files.

    The regionPeak files of `file1` and `file2` (each called against
    `control`) are read from ``<outputdir>/PeakCalling/`` and the R script
    output is written under ``<outputdir>/IDR/<file1>_VS_<file2>``.

    Parameters:
        file1, file2: tagAlign files whose peak calls are compared.
        control: control tagAlign file used for both peak calls.
        outputdir: pipeline output directory.
    """
    print("")
    print("IDR analysis for :" + file1 + " vs " + file2 + "...")
    name1 = getPrefix(file1)
    name2 = getPrefix(file2)
    peak_suffix = ".tagAlign_VS_" + getPrefix(control) + ".tagAlign.regionPeak.gz"
    peakdir = outputdir + "/PeakCalling/"

    arguments = [
        "Rscript $RCHIPpipe_PATH/batch-consistency-analysis.r",
        peakdir + name1 + peak_suffix,
        peakdir + name2 + peak_suffix,
        "-1",
        outputdir + "/IDR/" + name1 + "_VS_" + name2,
        "0 F signal.value",
        "$RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        "$RCHIPpipe_PATH/genome_table.txt",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
    return
def countConsistentPeaks(file1, file2, outputdir, thresh):
    """Count the peaks whose 11th column is at or below `thresh`.

    Reads ``<outputdir>/IDR/<file1>_VS_<file2>-overlapped-peaks.txt``.  Only
    lines that split into exactly 11 space-separated fields are considered;
    any other line is ignored.

    Parameters:
        file1, file2: files whose prefixes name the overlapped-peaks table.
        outputdir: pipeline output directory.
        thresh: threshold, converted with ``float``.

    Returns:
        The number of qualifying lines.
    """
    table = (outputdir + "/IDR/" + getPrefix(file1) + "_VS_" + getPrefix(file2)
             + "-overlapped-peaks.txt")
    with open(table, 'r') as handle:
        rows = handle.readlines()

    count = 0
    for row in rows:
        fields = row.split(" ", 13)
        if len(fields) != 11:
            continue
        if float(fields[10]) <= float(thresh):
            count += 1
    return count
def createFinalSets(poolfile, control, nt, np, outputdir, prefix):
    """Create the conservative and optimum final peak sets.

    Both sets are the top lines of the pooled regionPeak file once sorted on
    column 7 (numeric, descending): the conservative set keeps `nt` lines,
    the optimum set keeps ``max(nt, np)`` lines.

    Parameters:
        poolfile: pooled tagAlign file whose prefix names the peak file.
        control: control tagAlign file whose prefix names the peak file.
        nt: number of peaks kept in the conservative set.
        np: second peak count; the optimum set keeps ``max(nt, np)``.
        outputdir: pipeline output directory.
        prefix: prefix of the generated files.
    """
    source = (outputdir + "/PeakCalling/" + getPrefix(poolfile) + ".tagAlign_VS_"
              + getPrefix(control) + ".tagAlign.regionPeak.gz")

    def write_top(count, label):
        # Decompress, rank, keep the first `count` lines and recompress.
        target = outputdir + "/finalsets/" + prefix + "_spp_" + label + ".regionPeak.gz"
        subprocess.call(
            "zcat " + source + "|sort -k7nr,7nr|head -n " + str(count)
            + "|gzip -c > " + target,
            shell=True)

    print("Creating conservative peak set...")
    write_top(nt, "conservative")
    print("Done")

    print("Creating optimum peak set...")
    write_top(max(nt, np), "optimum")
    print("Done")
    return
def countConsistentPeaks(file1, file2, outputdir, thresh):
    """Return how many peaks have an 11th column at or below `thresh`.

    Reads ``<outputdir>/IDR/<file1>_VS_<file2>-overlapped-peaks.txt``.  Only
    lines that split into exactly 11 space-separated fields are considered;
    any other line is ignored.

    Parameters:
        file1, file2: files whose prefixes name the overlapped-peaks table.
        outputdir: pipeline output directory.
        thresh: threshold, converted with ``float``.

    Returns:
        The number of qualifying lines.
    """
    table = (outputdir + "/IDR/" + getPrefix(file1) + "_VS_" + getPrefix(file2)
             + "-overlapped-peaks.txt")
    with open(table, 'r') as handle:
        rows = handle.readlines()

    count = 0
    for row in rows:
        fields = row.split(" ", 13)
        if len(fields) != 11:
            continue
        if float(fields[10]) <= float(thresh):
            count += 1
    return count
def scanSeqMotif(sequencefile, motiffile, outputdir, prefix):
    """Scan a FASTA file for a motif with ``findMotifs.pl -find``.

    A working directory ``<outputdir>/<prefix>_<motif>`` is created and the
    standard output of the scan is redirected to
    ``<outputdir>/<prefix>_<motif>.txt``.

    Parameters:
        sequencefile: FASTA file to scan.
        motiffile: motif file; its prefix names the outputs.
        outputdir: directory receiving the outputs.
        prefix: prefix of the generated names.

    Returns:
        Path of the text file holding the scan output.
    """
    base = outputdir + "/" + prefix + "_" + getPrefix(motiffile)
    outfile = base + ".txt"
    createOdir(base)
    subprocess.call(
        "findMotifs.pl " + sequencefile + " fasta " + base + "/ -find " + motiffile
        + " > " + outfile,
        shell=True)
    return outfile
def toTagAlign(file_in, outputdir):
    """Convert a BAM file to a gzipped tagAlign file.

    The result is written to ``outputdir + getPrefix(file_in) +
    ".tagAlign.gz"`` (plain concatenation, so `outputdir` must end with a
    path separator).

    Parameters:
        file_in: BAM file to convert.
        outputdir: directory prefix for the tagAlign file.
    """
    target = outputdir + getPrefix(file_in) + ".tagAlign.gz"
    print("Transformation for file :" + file_in + "...")
    stages = [
        "samtools view -b " + file_in,
        "bamToBed -i stdin",
        # overwrite column 4 with "N", keeping tab-separated fields
        "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{$4=\"N\"; print $0}'",
        "gzip -c > " + target,
    ]
    subprocess.call("|".join(stages), shell=True)
    print("Transformation achieved...")
    return
def createScoreFile(scanfile, scanbgfile, motiffile, outputdir, prefix):
    """Build a two-column score file from foreground and background scans.

    Column 6 of every line starting with ``chr`` is extracted.  Values from
    `scanfile` are labelled 1 and values from `scanbgfile` are labelled 0;
    the background lines are appended after the foreground ones.

    Parameters:
        scanfile: scan output for the foreground sequences.
        scanbgfile: scan output for the background sequences.
        motiffile: motif file; its prefix names the output.
        outputdir: directory receiving the score file.
        prefix: prefix of the score file name.

    Returns:
        Path of the score file.
    """
    outfile = outputdir + "/" + prefix + "_Score_" + getPrefix(motiffile) + ".txt"

    # (input file, label, shell redirection): ">" truncates, ">>" appends.
    jobs = (
        (scanfile, "1", ">"),
        (scanbgfile, "0", ">>"),
    )
    commands = [
        "cat " + source + " | grep \"^chr\" | cut -f 6 | awk '{print $0\"\t\"" + label
        + "}' " + redirect + " " + outfile
        for source, label, redirect in jobs
    ]
    for command in commands:
        subprocess.call(command, shell=True)
    return outfile
def peakCall(file_in, control, outputdir):
    """Call peaks with the ``run_spp.R`` script for one ChIP/control pair.

    The script's statistics go to ``<outputdir><prefix>_stats.tab`` and its
    standard output is redirected to ``<outputdir><prefix>_peakCalling.log``.

    Parameters:
        file_in: ChIP tagAlign file.
        control: control tagAlign file.
        outputdir: output directory prefix (concatenated as-is, so it must
            end with a path separator).
    """
    print("")
    print("Peak calling for :" + file_in + "...")
    base = outputdir + getPrefix(file_in)
    arguments = [
        "Rscript $RCHIPpipe_PATH/run_spp.R",
        "-c=" + file_in,
        "-i=" + control,
        "-npeak=300000",
        "-odir=" + outputdir,
        "-savr -savp -rf",
        "-out=" + base + "_stats.tab",
        ">",
        base + "_peakCalling.log",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
    return
def peakCall(file_in, control, outputdir):
    """Run spp peak calling (``run_spp.R``) on one ChIP/control pair.

    The script's statistics go to ``<outputdir><prefix>_stats.tab`` and its
    standard output is redirected to ``<outputdir><prefix>_peakCalling.log``.

    Parameters:
        file_in: ChIP tagAlign file.
        control: control tagAlign file.
        outputdir: output directory prefix (concatenated as-is, so it must
            end with a path separator).
    """
    print("")
    print("Peak calling for :" + file_in + "...")
    base = outputdir + getPrefix(file_in)
    arguments = [
        "Rscript $RCHIPpipe_PATH/run_spp.R",
        "-c=" + file_in,
        "-i=" + control,
        "-npeak=300000",
        "-odir=" + outputdir,
        "-savr -savp -rf",
        "-out=" + base + "_stats.tab",
        ">",
        base + "_peakCalling.log",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
    return
def toTagAlign(file_in, outputdir):
    """Turn a BAM file into a gzipped tagAlign file.

    The result is written to ``outputdir + getPrefix(file_in) +
    ".tagAlign.gz"`` (plain concatenation, so `outputdir` must end with a
    path separator).

    Parameters:
        file_in: BAM file to convert.
        outputdir: directory prefix for the tagAlign file.
    """
    target = outputdir + getPrefix(file_in) + ".tagAlign.gz"
    print("Transformation for file :" + file_in + "...")
    stages = [
        "samtools view -b " + file_in,
        "bamToBed -i stdin",
        # overwrite column 4 with "N", keeping tab-separated fields
        "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{$4=\"N\"; print $0}'",
        "gzip -c > " + target,
    ]
    subprocess.call("|".join(stages), shell=True)
    print("Transformation achieved...")
    return
def createScoreFile(scanfile, scanbgfile, motiffile, outputdir, prefix):
    """Merge foreground and background scan scores into one labelled file.

    Column 6 of every line starting with ``chr`` is extracted.  Values from
    `scanfile` are labelled 1 and values from `scanbgfile` are labelled 0;
    the background lines are appended after the foreground ones.

    Parameters:
        scanfile: scan output for the foreground sequences.
        scanbgfile: scan output for the background sequences.
        motiffile: motif file; its prefix names the output.
        outputdir: directory receiving the score file.
        prefix: prefix of the score file name.

    Returns:
        Path of the score file.
    """
    outfile = outputdir + "/" + prefix + "_Score_" + getPrefix(motiffile) + ".txt"

    # (input file, label, shell redirection): ">" truncates, ">>" appends.
    jobs = (
        (scanfile, "1", ">"),
        (scanbgfile, "0", ">>"),
    )
    commands = [
        "cat " + source + " | grep \"^chr\" | cut -f 6 | awk '{print $0\"\t\"" + label
        + "}' " + redirect + " " + outfile
        for source, label, redirect in jobs
    ]
    for command in commands:
        subprocess.call(command, shell=True)
    return outfile
def annotatePeaks(filein, outputdir, annofile, prefix):
    """Keep the peaks of `filein` that overlap the regions of `annofile`.

    Uses ``bedtools intersect -wa -f 0.51``, then sorts the result by
    column 1 and numerically by column 2 and removes duplicate lines.

    Parameters:
        filein: peak file given as ``-a``.
        outputdir: directory receiving the annotated BED file.
        annofile: region file given as ``-b``; its prefix names the output.
        prefix: prefix of the output file name.

    Returns:
        Path of the annotated BED file.
    """
    print("")
    print("searching for peaks in " + filein + " that fall into " + annofile
          + " region file...")
    annotated = outputdir + "/" + prefix + "_" + getPrefix(annofile) + ".bed"
    subprocess.call(
        "bedtools intersect -a " + filein + " -b " + annofile
        + " -wa -f 0.51 | sort -k1,1 -k2,2n | uniq > " + annotated,
        shell=True)
    print("Done")
    print("Annotated file can be found at :" + annotated)
    return annotated
def splitFile(file_in, outputdir):
    """Shuffle a gzipped tagAlign file and split it into two pseudo-replicates.

    The input is temporarily gunzipped in place to count its lines, then
    re-gzipped.  Its shuffled content is split into two chunks which end up as
    ``<outputdir><prefix>_PR1.tagAlign.gz`` and ``..._PR2.tagAlign.gz``.

    Parameters:
        file_in: path of the ``.tagAlign.gz`` file to split.  The unzipped
            file is looked up as ``outputdir + getPrefix(file_in) +
            ".tagAlign"``, so the file is expected to live in ``outputdir``.
        outputdir: directory prefix (concatenated as-is, so it must end with
            a path separator).

    Returns:
        None.  Exit codes of the shell commands are not checked.
    """
    print("")
    print("splitting file :" + file_in + "...")

    # Unzip the file so that its lines can be counted from Python.
    subprocess.call("gunzip " + file_in, shell=True)
    prprefix = outputdir + getPrefix(file_in)
    unzipname = prprefix + ".tagAlign"

    # Stream the file instead of loading it entirely, and make sure the
    # handle is closed before gzip touches the file again.
    with open(unzipname, 'rU') as handle:
        linecount = sum(1 for _ in handle)

    # Round the half UP: with a rounded-down half, an odd number of lines made
    # `split` emit a third one-line chunk ("<prefix>02") that was never
    # compressed, renamed or used.
    linecount = (linecount + 1) // 2

    subprocess.call("gzip " + unzipname, shell=True)

    # Shuffle the reads, then split them into numerically suffixed chunks
    # ("<prefix>00", "<prefix>01").
    subprocess.call(
        "zcat " + file_in + "|shuf|split -d -l " + str(linecount) + " - " + prprefix,
        shell=True)

    # Compress both chunks, then give them their pseudo-replicate names.
    subprocess.call("gzip " + prprefix + "00", shell=True)
    subprocess.call("gzip " + prprefix + "01", shell=True)
    subprocess.call("mv " + prprefix + "00.gz " + prprefix + "_PR1.tagAlign.gz", shell=True)
    subprocess.call("mv " + prprefix + "01.gz " + prprefix + "_PR2.tagAlign.gz", shell=True)

    print(file_in + " splitted...")
    return
def trimmoSe(fastqfile, outputdir, adapter):
    """Trim a single-end gzipped FASTQ file with Trimmomatic 0.35.

    The length of the first read (second line of the file) is measured
    through a temporary ``<outputdir>/temp.txt`` file and used as the
    ``MINLEN`` value.

    Parameters:
        fastqfile: gzipped FASTQ file to trim.
        outputdir: pipeline output directory; the trimmed file goes to
            ``<outputdir>/fastq_trim/``.
        adapter: adapter file name inside the Trimmomatic ``adapters``
            directory.

    Returns:
        Path of the trimmed FASTQ file.
    """
    print("")
    name = getPrefix(fastqfile)
    scratch = outputdir + "/temp.txt"

    # Dump the second line of the FASTQ file (the first sequence).
    subprocess.call("zcat " + fastqfile + " | head -n 2 | tail -n 1 > " + scratch,
                    shell=True)
    with open(scratch, 'r') as handle:
        content = handle.readlines()
    for sequence in content:
        readlength = len(sequence) - 1  # ignore the trailing newline
    os.remove(scratch)
    print("read length for this file: " + str(readlength))

    fileout = outputdir + "/fastq_trim/" + name + "_trim.fastq.gz"
    arguments = [
        "java -jar $RCHIPpipe_PATH/Trimmomatic-0.35/trimmomatic-0.35.jar SE -phred33",
        fastqfile,
        fileout,
        "ILLUMINACLIP:$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/" + adapter + ":2:30:10",
        "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15",
        "MINLEN:" + str(readlength),
    ]
    subprocess.call(" ".join(arguments), shell=True)
    return fileout
def trimmoSe(fastqfile, outputdir, adapter):
    """Run Trimmomatic 0.35 in single-end mode on a gzipped FASTQ file.

    The length of the first read (second line of the file) is measured
    through a temporary ``<outputdir>/temp.txt`` file and used as the
    ``MINLEN`` value.

    Parameters:
        fastqfile: gzipped FASTQ file to trim.
        outputdir: pipeline output directory; the trimmed file goes to
            ``<outputdir>/fastq_trim/``.
        adapter: adapter file name inside the Trimmomatic ``adapters``
            directory.

    Returns:
        Path of the trimmed FASTQ file.
    """
    print("")
    name = getPrefix(fastqfile)
    scratch = outputdir + "/temp.txt"

    # Dump the second line of the FASTQ file (the first sequence).
    subprocess.call("zcat " + fastqfile + " | head -n 2 | tail -n 1 > " + scratch,
                    shell=True)
    with open(scratch, 'r') as handle:
        content = handle.readlines()
    for sequence in content:
        readlength = len(sequence) - 1  # ignore the trailing newline
    os.remove(scratch)
    print("read length for this file: " + str(readlength))

    fileout = outputdir + "/fastq_trim/" + name + "_trim.fastq.gz"
    arguments = [
        "java -jar $RCHIPpipe_PATH/Trimmomatic-0.35/trimmomatic-0.35.jar SE -phred33",
        fastqfile,
        fileout,
        "ILLUMINACLIP:$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/" + adapter + ":2:30:10",
        "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15",
        "MINLEN:" + str(readlength),
    ]
    subprocess.call(" ".join(arguments), shell=True)
    return fileout
def plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2, poolpr1, poolpr2):
    """Plot the IDR results of the replicates and of every pseudo-replicate pair.

    Results are read from ``<outputdir>/IDR/<a>_VS_<b>`` and plots are
    written under ``<outputdir>/IDR/plots/``.  Three groups are plotted: the
    true replicates, the self pseudo-replicates of each replicate (one
    combined batch plot plus one plot per pair), and the pooled
    pseudo-replicates.

    Parameters:
        outputdir: pipeline output directory.
        rep1, rep2: the two true replicates.
        rep1pr1, rep1pr2: pseudo-replicates of replicate 1.
        rep2pr1, rep2pr2: pseudo-replicates of replicate 2.
        poolpr1, poolpr2: pseudo-replicates of the pooled sample.

    Returns:
        None.  Exit codes of the R scripts are not checked.
    """
    idrdir = outputdir + "/IDR/"
    plotdir = outputdir + "/IDR/plots/"

    def pair_name(first, second):
        # Base name shared by the IDR result files and the plots of a pair.
        return getPrefix(first) + "_VS_" + getPrefix(second)

    def batch_plot(plotbase, results):
        # batch-consistency-plot.r takes the number of result sets first.
        subprocess.call(
            "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r " + str(len(results)) + " "
            + plotbase + " " + " ".join(results)
            + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r",
            shell=True)

    def peaks_plot(result, plotbase):
        subprocess.call(
            "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + result
            + "-overlapped-peaks.txt -o=" + plotbase,
            shell=True)

    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    replicates = pair_name(rep1, rep2)
    batch_plot(plotdir + replicates, [idrdir + replicates])
    peaks_plot(idrdir + replicates, plotdir + replicates)
    print("Done")

    print("Plotting IDR results for pseudo-replicates....")
    self1 = pair_name(rep1pr1, rep1pr2)
    self2 = pair_name(rep2pr1, rep2pr2)
    batch_plot(plotdir + "pseudo-replicates", [idrdir + self1, idrdir + self2])
    peaks_plot(idrdir + self1, plotdir + self1)
    # The second plot must read replicate 2's own results; it previously
    # re-read replicate 1's table and wrote it under replicate 2's name.
    peaks_plot(idrdir + self2, plotdir + self2)
    print("Done")

    print("Plotting IDR results for pool pseudo-replicates...")
    pooled = pair_name(poolpr1, poolpr2)
    batch_plot(plotdir + pooled, [idrdir + pooled])
    peaks_plot(idrdir + pooled, plotdir + pooled)
    print("Done")
    return
def plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2, poolpr1, poolpr2):
    """Plot the IDR results of the replicates and of every pseudo-replicate pair.

    Results are read from ``<outputdir>/IDR/<a>_VS_<b>`` and plots are
    written under ``<outputdir>/IDR/plots/``.  Three groups are plotted: the
    true replicates, the self pseudo-replicates of each replicate (one
    combined batch plot plus one plot per pair), and the pooled
    pseudo-replicates.

    Parameters:
        outputdir: pipeline output directory.
        rep1, rep2: the two true replicates.
        rep1pr1, rep1pr2: pseudo-replicates of replicate 1.
        rep2pr1, rep2pr2: pseudo-replicates of replicate 2.
        poolpr1, poolpr2: pseudo-replicates of the pooled sample.

    Returns:
        None.  Exit codes of the R scripts are not checked.
    """
    idrdir = outputdir + "/IDR/"
    plotdir = outputdir + "/IDR/plots/"

    def pair_name(first, second):
        # Base name shared by the IDR result files and the plots of a pair.
        return getPrefix(first) + "_VS_" + getPrefix(second)

    def batch_plot(plotbase, results):
        # batch-consistency-plot.r takes the number of result sets first.
        subprocess.call(
            "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r " + str(len(results)) + " "
            + plotbase + " " + " ".join(results)
            + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r",
            shell=True)

    def peaks_plot(result, plotbase):
        subprocess.call(
            "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + result
            + "-overlapped-peaks.txt -o=" + plotbase,
            shell=True)

    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    replicates = pair_name(rep1, rep2)
    batch_plot(plotdir + replicates, [idrdir + replicates])
    peaks_plot(idrdir + replicates, plotdir + replicates)
    print("Done")

    print("Plotting IDR results for pseudo-replicates....")
    self1 = pair_name(rep1pr1, rep1pr2)
    self2 = pair_name(rep2pr1, rep2pr2)
    batch_plot(plotdir + "pseudo-replicates", [idrdir + self1, idrdir + self2])
    peaks_plot(idrdir + self1, plotdir + self1)
    # The second plot must read replicate 2's own results; it previously
    # re-read replicate 1's table and wrote it under replicate 2's name.
    peaks_plot(idrdir + self2, plotdir + self2)
    print("Done")

    print("Plotting IDR results for pool pseudo-replicates...")
    pooled = pair_name(poolpr1, poolpr2)
    batch_plot(plotdir + pooled, [idrdir + pooled])
    peaks_plot(idrdir + pooled, plotdir + pooled)
    print("Done")
    return
def fastQc(fastqfile, outputdir):
    """Run ``fastqc`` on a FASTQ file and report the failed and warned tests.

    The report is written to ``<outputdir>/fastqc_report/``.  The tab
    separated ``<prefix>_fastqc/summary.txt`` found there is then parsed:
    each test whose first column is ``FAIL`` is printed, and tests whose
    first column is ``WARN`` are counted.

    Parameters:
        fastqfile: FASTQ file to check.
        outputdir: pipeline output directory.

    Returns:
        None.
    """
    print("running fastqc tool for : " + fastqfile + "...")
    prefix = getPrefix(fastqfile)
    subprocess.call("fastqc -o " + outputdir + "/fastqc_report/ " + fastqfile, shell=True)

    summary = outputdir + "/fastqc_report/" + prefix + "_fastqc/summary.txt"
    failed = False
    nbwarn = 0
    # The context manager closes the summary file even if parsing fails.
    with open(summary, 'r') as handle:
        for line in handle:
            value = line.split("\t", 4)
            if value[0] == 'FAIL':
                failed = True
                print(value[1] + " have failed quality check")
            elif value[0] == 'WARN':
                nbwarn += 1

    if not failed:
        # Previously "Any test failed quality check", which read as the
        # opposite of what is being reported here.
        print("No test failed quality check")
    print(str(nbwarn) + " tests present warnings")
    print("Done")
    return
def scoreAUC(scorefile, outputdir):
    """Run ``Calculate_AUC.r`` on a score file.

    The script receives ``<outputdir>/<prefix>_AUC.txt`` as its second
    argument.

    Parameters:
        scorefile: score file passed with ``-i=``.
        outputdir: directory of the AUC output file.
    """
    target = outputdir + "/" + getPrefix(scorefile) + "_AUC.txt"
    arguments = ["Rscript $RCHIPpipe_PATH/Calculate_AUC.r", "-i=" + scorefile, target]
    subprocess.call(" ".join(arguments), shell=True)
    return
def sortScoreFile(scorefile, outputdir):
    """Sort a score file on its first column, numerically and descending.

    Parameters:
        scorefile: file to sort.
        outputdir: directory receiving ``<prefix>_2.txt``.

    Returns:
        Path of the sorted file.
    """
    outfile = "".join([outputdir, "/", getPrefix(scorefile), "_2.txt"])
    subprocess.call(" ".join(["sort -k1,1nr", scorefile, ">", outfile]), shell=True)
    return outfile
def newName(file_in, outputdir):
    """Return the tagAlign file name matching `file_in` inside `outputdir`.

    Parameters:
        file_in: original file; only its prefix is kept.
        outputdir: directory prefix, concatenated as-is.

    Returns:
        ``outputdir + getPrefix(file_in) + ".tagAlign.gz"``.
    """
    return "".join([outputdir, getPrefix(file_in), ".tagAlign.gz"])
def plotRocCurve(scorefile, outputdir):
    """Run ``produceROC.r`` on a score file.

    The script receives ``<outputdir>/<prefix>_ROC.png`` as its second
    argument.

    Parameters:
        scorefile: score file passed with ``-i=``.
        outputdir: directory of the ROC image.
    """
    target = outputdir + "/" + getPrefix(scorefile) + "_ROC.png"
    arguments = ["Rscript $RCHIPpipe_PATH/produceROC.r", "-i=" + scorefile, target]
    subprocess.call(" ".join(arguments), shell=True)
    return
def scoreAUC(scorefile, outputdir):
    """Compute the AUC of a score file with ``Calculate_AUC.r``.

    The script receives ``<outputdir>/<prefix>_AUC.txt`` as its second
    argument.

    Parameters:
        scorefile: score file passed with ``-i=``.
        outputdir: directory of the AUC output file.
    """
    target = outputdir + "/" + getPrefix(scorefile) + "_AUC.txt"
    arguments = ["Rscript $RCHIPpipe_PATH/Calculate_AUC.r", "-i=" + scorefile, target]
    subprocess.call(" ".join(arguments), shell=True)
    return
def mainCp(argv):
    """Entry point of the peak-calling pipeline for two replicates.

    Steps: (1) convert the BAM files to tagAlign, (2) merge the control
    files when two are given, (3) pool the replicates, (4) split the
    replicates and the pool into pseudo-replicates, (5) call peaks with spp,
    (6) run the IDR analysis, (7) plot the IDR results, (8) create the final
    peak sets.  Steps 6 to 8 can be switched off through the options.

    Parameters:
        argv: full command line, program name included.  With no option the
            usage message is printed and the program exits with status 1.

    Returns:
        None.
    """
    if len(argv) == 1:  # no option given: print usage and exit
        usageCp()
        sys.exit(1)

    # Default parameters, then overridden by the command-line options.
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = initParamCp()
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = readOptCp(
        argv[1:], outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr,
        idrthresh, finalsets, plot, prefix)
    checkRequiredCp(rep1, rep2, ctrl1)  # required parameters must be present
    if selectodir == 'false':  # no output directory given: create the default one
        createOdir(outputdir)
    welcomeCp()
    if prefix == '':  # default prefix
        prefix = 'CallPeaks'
    parametersCp(outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr,
                 idrthresh, finalsets, plot, prefix)  # summary of the parameters
    running()

    tagdir = outputdir + "/tagAlignfiles/"

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (rep1, rep2, ctrl1):
        toTagAlign(file_in, tagdir)
    if ctrlsup == 'true':  # a second control file was given: convert it too
        toTagAlign(ctrl2, tagdir)
        ctrl2 = newName(ctrl2, tagdir)
    # From here on the variables point to the converted files.
    rep1 = newName(rep1, tagdir)
    rep2 = newName(rep2, tagdir)
    ctrl1 = newName(ctrl1, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Merging Control files (if two files are given)...")
    if ctrlsup == 'true':
        ctrlfile = mergeFile(ctrl1, ctrl2, tagdir, 'Control')
        print("Step2 : Merging Control files achieved...")
    else:
        print("skipped")
        ctrlfile = ctrl1

    print("")
    print("Step3 : Creating Pool of replicates...")
    poolfile = mergeFile(rep1, rep2, tagdir, 'Pool')
    print("Step3 : Creating Pool of replicates achieved...")

    print("")
    print("Step4 : Splitting samples files into pseudo replicates...")
    for file_in in (rep1, rep2, poolfile):
        splitFile(file_in, tagdir)
    print("Step4 : Splitting samples files into pseudo replicates achieved...")
    print("")

    def pseudo(path):
        # Names of the two pseudo-replicate files written by splitFile.
        base = tagdir + getPrefix(path)
        return base + "_PR1.tagAlign.gz", base + "_PR2.tagAlign.gz"

    rep1pr1, rep1pr2 = pseudo(rep1)
    rep2pr1, rep2pr2 = pseudo(rep2)
    poolpr1, poolpr2 = pseudo(poolfile)

    print("Step5 : Peak Calling for each files (replicates, pool and pseudo replicates)...")
    createOdir(outputdir + "/PeakCalling")
    for i in (rep1, rep2, poolfile, rep1pr1, rep1pr2, rep2pr1, rep2pr2, poolpr1, poolpr2):
        peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
    print("Step5 : Peak Calling for each files achieved...")

    print("")
    print("Step6 : IDR analysis...")
    # The peak counts only exist when the IDR analysis runs; keep them defined
    # so that step 8 can detect a skipped analysis instead of crashing.
    nt = None
    np = None
    if idr == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR")
        consistency(rep1, rep2, ctrlfile, outputdir)
        consistency(rep1pr1, rep1pr2, ctrlfile, outputdir)
        consistency(rep2pr1, rep2pr2, ctrlfile, outputdir)
        consistency(poolpr1, poolpr2, ctrlfile, outputdir)
        # Number of peaks at or below the IDR threshold for each comparison.
        nt = countConsistentPeaks(rep1, rep2, outputdir, idrthresh)
        np = countConsistentPeaks(poolpr1, poolpr2, outputdir, idrthresh)
        n1 = countConsistentPeaks(rep1pr1, rep1pr2, outputdir, idrthresh)
        n2 = countConsistentPeaks(rep2pr1, rep2pr2, outputdir, idrthresh)
        exportResults(nt, np, n1, n2, outputdir)
        print("Step6 : IDR analysis achieved...")

    print("")
    print("Step7 : Plotting IDR results...")
    if plot == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR/plots")
        plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2,
                    poolpr1, poolpr2)
        print("Step7 : Plotting IDR results achieved...")

    print("")
    print("Step8 : Creating final sets of peaks...")
    if finalsets == 'OFF':
        print("Skipped")
    elif nt is None or np is None:
        # Final sets are sized by the IDR peak counts, which were not computed.
        print("Skipped (final sets require the IDR analysis)")
    else:
        createOdir(outputdir + "/finalsets")
        createFinalSets(poolfile, ctrlfile, nt, np, outputdir, prefix)
        print("Step8 : Creating final sets of peaks achieved...")

    goodbyeCp()
    return
def mainCpnr(argv):
    """Entry point of the peak-calling pipeline for a single sample.

    The BAM files are converted to tagAlign.  When `spp` is ``'OFF'`` the
    optional cross-correlation check and the macs2 peak calling are run.
    When `spp` is ``'ON'`` the sample is split into pseudo-replicates, peaks
    are called with spp, an IDR analysis (threshold 0.01) is run and plotted,
    and a final peak set is created.

    Parameters:
        argv: full command line, program name included.  With no option the
            usage message is printed and the program exits with status 1.

    Returns:
        None.
    """
    if len(argv) == 1:  # no option given: print usage and exit
        usageCpnr()
        sys.exit(1)

    # Default parameters, then overridden by the command-line options.
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc, prefix,
     spp) = initParamCpnr()
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc, prefix,
     spp) = readOptCpnr(argv[1:], outputdir, selectodir, bamfile, ctrlfile, thresh,
                        pvalue, qvalue, qc, prefix, spp)
    checkRequiredCpnr(bamfile, ctrlfile)  # required options must be present
    if selectodir == 'false':  # no output directory given: create the default one
        createOdir(outputdir)
    welcomeCpnr()
    if prefix == '':  # default prefix
        prefix = 'CallPeaks_norep'
    parametersCpnr(outputdir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc, prefix,
                   spp)  # summary of the parameters
    running()

    tagdir = outputdir + "/tagAlignfiles/"
    peakdir = outputdir + "/PeakCalling"

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (bamfile, ctrlfile):
        toTagAlign(file_in, tagdir)
    # From here on the variables point to the converted files.
    chipfile = newName(bamfile, tagdir)
    ctrlfile = newName(ctrlfile, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks...")
    createOdir(peakdir)
    if qc == 'ON' and spp == 'OFF':
        qualCheck(chipfile, peakdir, prefix)
        print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks...")
    else:
        print("skipped")
    print("")

    print("Step3 : PeakCalling using macs2...")
    if spp == 'OFF':
        peakCallMacs(chipfile, ctrlfile, outputdir, prefix, pvalue, qvalue, thresh)
        print("Step3 : PeakCalling using macs2 achieved...")
    else:
        print("skipped")
    print("")

    if spp != 'ON':
        goodbyeCp()
        return

    print("Step4 : PeakCalling based on adaptated IDR analysis...")
    print("\tStep4a : Splitting file into pseudo-replicates...")
    splitFile(chipfile, tagdir)
    print("\tStep4a : Splitting file into pseudo-replicates achieved...")
    chipbase = tagdir + getPrefix(chipfile)
    pseudo1 = chipbase + "_PR1.tagAlign.gz"
    pseudo2 = chipbase + "_PR2.tagAlign.gz"

    print("Step4b : Peak Calling for each files (replicates and pseudo replicates)...")
    createOdir(peakdir)
    for i in (chipfile, pseudo1, pseudo2):
        peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
    print("Step4b : Peak Calling for each files (replicates and pseudo replicates) achieved...")

    print("Step4c : IDR analysis...")
    createOdir(outputdir + "/IDR")
    consistency(pseudo1, pseudo2, ctrlfile, outputdir)
    idrthresh = 0.01
    np = countConsistentPeaks(pseudo1, pseudo2, outputdir, idrthresh)
    print("number of consistent peaks between the two pseudo replicates: " + str(np))
    print("Step4c : IDR analysis achieved...")

    print("Step4d : Plotting IDR results...")
    createOdir(outputdir + "/IDR/plots")
    plotResults2(outputdir, pseudo1, pseudo2)
    print("Step4d : Plotting IDR results achieved...")

    print("Step5e : Creating final sets of peaks...")
    createOdir(outputdir + "/finalsets")
    createFinalSets2(chipfile, ctrlfile, np, outputdir, prefix)

    goodbyeCp()
    return
def newName(file_in, outputdir):
    """Build the tagAlign file name matching `file_in` inside `outputdir`.

    Parameters:
        file_in: original file; only its prefix is kept.
        outputdir: directory prefix, concatenated as-is.

    Returns:
        ``outputdir + getPrefix(file_in) + ".tagAlign.gz"``.
    """
    return "".join([outputdir, getPrefix(file_in), ".tagAlign.gz"])
def mainCp(argv):
    """Entry point of the peak-calling pipeline for two replicates.

    Steps: (1) convert the BAM files to tagAlign, (2) merge the control
    files when two are given, (3) pool the replicates, (4) split the
    replicates and the pool into pseudo-replicates, (5) call peaks with spp,
    (6) run the IDR analysis, (7) plot the IDR results, (8) create the final
    peak sets.  Steps 6 to 8 can be switched off through the options.

    Parameters:
        argv: full command line, program name included.  With no option the
            usage message is printed and the program exits with status 1.

    Returns:
        None.
    """
    if len(argv) == 1:  # no option given: print usage and exit
        usageCp()
        sys.exit(1)

    # Default parameters, then overridden by the command-line options.
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = initParamCp()
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = readOptCp(
        argv[1:], outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr,
        idrthresh, finalsets, plot, prefix)
    checkRequiredCp(rep1, rep2, ctrl1)  # required parameters must be present
    if selectodir == 'false':  # no output directory given: create the default one
        createOdir(outputdir)
    welcomeCp()
    if prefix == '':  # default prefix
        prefix = 'CallPeaks'
    parametersCp(outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr,
                 idrthresh, finalsets, plot, prefix)  # summary of the parameters
    running()

    tagdir = outputdir + "/tagAlignfiles/"

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (rep1, rep2, ctrl1):
        toTagAlign(file_in, tagdir)
    if ctrlsup == 'true':  # a second control file was given: convert it too
        toTagAlign(ctrl2, tagdir)
        ctrl2 = newName(ctrl2, tagdir)
    # From here on the variables point to the converted files.
    rep1 = newName(rep1, tagdir)
    rep2 = newName(rep2, tagdir)
    ctrl1 = newName(ctrl1, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Merging Control files (if two files are given)...")
    if ctrlsup == 'true':
        ctrlfile = mergeFile(ctrl1, ctrl2, tagdir, 'Control')
        print("Step2 : Merging Control files achieved...")
    else:
        print("skipped")
        ctrlfile = ctrl1

    print("")
    print("Step3 : Creating Pool of replicates...")
    poolfile = mergeFile(rep1, rep2, tagdir, 'Pool')
    print("Step3 : Creating Pool of replicates achieved...")

    print("")
    print("Step4 : Splitting samples files into pseudo replicates...")
    for file_in in (rep1, rep2, poolfile):
        splitFile(file_in, tagdir)
    print("Step4 : Splitting samples files into pseudo replicates achieved...")
    print("")

    def pseudo(path):
        # Names of the two pseudo-replicate files written by splitFile.
        base = tagdir + getPrefix(path)
        return base + "_PR1.tagAlign.gz", base + "_PR2.tagAlign.gz"

    rep1pr1, rep1pr2 = pseudo(rep1)
    rep2pr1, rep2pr2 = pseudo(rep2)
    poolpr1, poolpr2 = pseudo(poolfile)

    print("Step5 : Peak Calling for each files (replicates, pool and pseudo replicates)...")
    createOdir(outputdir + "/PeakCalling")
    for i in (rep1, rep2, poolfile, rep1pr1, rep1pr2, rep2pr1, rep2pr2, poolpr1, poolpr2):
        peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
    print("Step5 : Peak Calling for each files achieved...")

    print("")
    print("Step6 : IDR analysis...")
    # The peak counts only exist when the IDR analysis runs; keep them defined
    # so that step 8 can detect a skipped analysis instead of crashing.
    nt = None
    np = None
    if idr == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR")
        consistency(rep1, rep2, ctrlfile, outputdir)
        consistency(rep1pr1, rep1pr2, ctrlfile, outputdir)
        consistency(rep2pr1, rep2pr2, ctrlfile, outputdir)
        consistency(poolpr1, poolpr2, ctrlfile, outputdir)
        # Number of peaks at or below the IDR threshold for each comparison.
        nt = countConsistentPeaks(rep1, rep2, outputdir, idrthresh)
        np = countConsistentPeaks(poolpr1, poolpr2, outputdir, idrthresh)
        n1 = countConsistentPeaks(rep1pr1, rep1pr2, outputdir, idrthresh)
        n2 = countConsistentPeaks(rep2pr1, rep2pr2, outputdir, idrthresh)
        exportResults(nt, np, n1, n2, outputdir)
        print("Step6 : IDR analysis achieved...")

    print("")
    print("Step7 : Plotting IDR results...")
    if plot == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR/plots")
        plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2,
                    poolpr1, poolpr2)
        print("Step7 : Plotting IDR results achieved...")

    print("")
    print("Step8 : Creating final sets of peaks...")
    if finalsets == 'OFF':
        print("Skipped")
    elif nt is None or np is None:
        # Final sets are sized by the IDR peak counts, which were not computed.
        print("Skipped (final sets require the IDR analysis)")
    else:
        createOdir(outputdir + "/finalsets")
        createFinalSets(poolfile, ctrlfile, nt, np, outputdir, prefix)
        print("Step8 : Creating final sets of peaks achieved...")

    goodbyeCp()
    return
def mainCpnr(argv):
    """Run the 'CallPeaks_norep' workflow from command-line arguments.

    The options in ``argv`` are parsed, the ChIP and control bam files are
    converted to tagAlign files, and then one of two paths is taken:

    * ``spp == 'OFF'``: optional cross-correlation check (only when
      ``qc == 'ON'``) followed by peak calling through ``peakCallMacs``.
    * ``spp == 'ON'``: the ChIP file is split into two pseudo-replicates,
      peaks are called on the ChIP file and both pseudo-replicates, the
      consistency (IDR) analysis is run between the pseudo-replicates with a
      fixed threshold of 0.01, the results are plotted and the final peak
      set is created.

    Args:
        argv: Full argument list, program name included at index 0. When it
            holds only that single element the usage message is printed and
            the process exits with status 1.

    Returns:
        None.
    """
    # Progress messages use the single-argument print(...) form, which the
    # Python 2 print statement renders exactly like `print "..."`.
    if len(argv) == 1:  # nothing besides the program name was supplied
        usageCpnr()
        sys.exit(1)

    # Start from the default parameters, then let the command line override.
    (outputdir, selectodir, bamfile, ctrlfile, thresh,
     pvalue, qvalue, qc, prefix, spp) = initParamCpnr()
    (outputdir, selectodir, bamfile, ctrlfile, thresh,
     pvalue, qvalue, qc, prefix, spp) = readOptCpnr(
        argv[1:], outputdir, selectodir, bamfile, ctrlfile, thresh,
        pvalue, qvalue, qc, prefix, spp)
    checkRequiredCpnr(bamfile, ctrlfile)  # the required options must be set

    if selectodir == 'false':  # no output directory was chosen by the user
        createOdir(outputdir)
    welcomeCpnr()
    if prefix == '':  # fall back to the default prefix
        prefix = 'CallPeaks_norep'
    parametersCpnr(outputdir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
                   prefix, spp)  # summary of every parameter in use
    running()

    tagalign_dir = outputdir + "/tagAlignfiles/"
    peakcalling_dir = outputdir + "/PeakCalling"

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (bamfile, ctrlfile):
        toTagAlign(file_in, tagalign_dir)
    # From here on, work with the names of the converted files.
    chipfile = newName(bamfile, tagalign_dir)
    ctrlfile = newName(ctrlfile, tagalign_dir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")
    print("")

    print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks...")
    createOdir(peakcalling_dir)
    if qc == 'ON' and spp == 'OFF':
        qualCheck(chipfile, peakcalling_dir, prefix)
        # Fixed: this completion line used to repeat the start banner.
        print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks achieved...")
        print("")
    else:
        print("skipped")
        print("")

    print("Step3 : PeakCalling using macs2...")
    if spp == 'OFF':
        peakCallMacs(chipfile, ctrlfile, outputdir, prefix, pvalue, qvalue,
                     thresh)
        print("Step3 : PeakCalling using macs2 achieved...")
        print("")
    else:
        print("skipped")
        print("")

    if spp == 'ON':
        print("Step4 : PeakCalling based on adaptated IDR analysis...")

        print("\tStep4a : Splitting file into pseudo-replicates...")
        splitFile(chipfile, tagalign_dir)
        print("\tStep4a : Splitting file into pseudo-replicates achieved...")

        # Names of the two pseudo-replicate files produced by splitFile.
        prefixchip = tagalign_dir + getPrefix(chipfile)
        pseudo_rep1 = prefixchip + "_PR1.tagAlign.gz"
        pseudo_rep2 = prefixchip + "_PR2.tagAlign.gz"

        print("Step4b : Peak Calling for each files (replicates and pseudo replicates)...")
        createOdir(peakcalling_dir)
        for i in (chipfile, pseudo_rep1, pseudo_rep2):
            peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
        print("Step4b : Peak Calling for each files (replicates and pseudo replicates) achieved...")

        print("Step4c : IDR analysis...")
        createOdir(outputdir + "/IDR")
        consistency(pseudo_rep1, pseudo_rep2, ctrlfile, outputdir)
        idrthresh = 0.01
        # Number of peaks kept for the final set.
        np = countConsistentPeaks(pseudo_rep1, pseudo_rep2, outputdir,
                                  idrthresh)
        print("number of consistent peaks between the two pseudo replicates: " + str(np))
        print("Step4c : IDR analysis achieved...")

        print("Step4d : Plotting IDR results...")
        createOdir(outputdir + "/IDR/plots")
        plotResults2(outputdir, pseudo_rep1, pseudo_rep2)
        print("Step4d : Plotting IDR results achieved...")

        # Fixed: this sub-step of Step4 was announced as "Step5e".
        print("Step4e : Creating final sets of peaks...")
        createOdir(outputdir + "/finalsets")
        createFinalSets2(chipfile, ctrlfile, np, outputdir, prefix)

    # NOTE(review): the end message is emitted for both the macs2 and the spp
    # path here; confirm this matches the intended layout.
    goodbyeCp()
    return
def plotRocCurve(scorefile, outputdir):
    """Draw the ROC curve of ``scorefile`` by running the produceROC.r script.

    The output image path is ``outputdir + "/" + getPrefix(scorefile) +
    "_ROC.png"``. The command is executed through the shell, which expands
    ``$RCHIPpipe_PATH`` to locate the R script.

    Args:
        scorefile: Path handed to the script through its ``-i=`` option.
        outputdir: Directory in which the PNG file is placed.

    Returns:
        None.
    """
    png_path = "/".join([outputdir, getPrefix(scorefile) + "_ROC.png"])
    rscript_cmd = " ".join([
        "Rscript $RCHIPpipe_PATH/produceROC.r",
        "-i=" + scorefile,
        png_path,
    ])
    subprocess.call(rscript_cmd, shell=True)
    return