Beispiel #1
0
def splitFile(file_in, outputdir):
    """Shuffle a gzipped tagAlign file and split it into two pseudo-replicates.

    The input is decompressed in place to count its lines, recompressed, then
    streamed through ``shuf`` and ``split``. The two resulting chunks are
    gzipped and renamed to ``<prefix>_PR1.tagAlign.gz`` and
    ``<prefix>_PR2.tagAlign.gz`` where ``<prefix>`` is
    ``outputdir + getPrefix(file_in)``.

    Parameters:
        file_in: path of the gzipped tagAlign file. The decompressed file is
            looked up at ``outputdir + getPrefix(file_in) + ".tagAlign"``, so
            the input is expected to live in ``outputdir``.
        outputdir: directory prefix; concatenated as-is, so it should end
            with a path separator.

    Returns:
        None. Results are written to disk by the shell commands.
    """
    print("")
    print("splitting file :" + file_in + "...")
    prprefix = outputdir + getPrefix(file_in)
    unzipname = prprefix + ".tagAlign"

    # Decompress so the number of reads can be counted.
    subprocess.call("gunzip " + file_in, shell=True)
    # Stream the file instead of loading every line in memory, and make sure
    # the handle is closed. Lines are counted on "\n", the same unit that
    # `split -l` uses below.
    with open(unzipname, 'r') as handle:
        linecount = sum(1 for _ in handle)
    # Round the half UP with integer arithmetic: with a rounded-down half an
    # odd line count makes `split` emit a third chunk ("...02") that is never
    # gzipped or renamed, silently dropping reads from the pseudo-replicates.
    chunksize = (linecount + 1) // 2
    subprocess.call("gzip " + unzipname, shell=True)  # restore the .gz input

    # Shuffle the reads and split them into two (almost) equal-sized files.
    subprocess.call(
        "zcat " + file_in + "|shuf|split -d -l " + str(chunksize) + " - " +
        prprefix,
        shell=True)

    # `split -d` names the chunks <prefix>00 and <prefix>01.
    chunk_commands = [
        "gzip " + prprefix + "00",
        "gzip " + prprefix + "01",
        "mv " + prprefix + "00.gz " + prprefix + "_PR1.tagAlign.gz",
        "mv " + prprefix + "01.gz " + prprefix + "_PR2.tagAlign.gz",
    ]
    for command in chunk_commands:
        subprocess.call(command, shell=True)
    print(file_in + " splitted...")
    return
Beispiel #2
0
def createFinalSets2(chipfile, control, np, outputdir, prefix):
    """Write the final peak set for a single ChIP sample.

    Reads the regionPeak file found under ``outputdir/PeakCalling`` for the
    chip/control pair, sorts it on column 7 (numeric, descending), keeps the
    first ``np`` rows and writes them gzipped to
    ``outputdir/finalsets/<prefix>_spp.regionPeak.gz``.

    Returns None.
    """
    print("Creating peak set...")
    chip_tag = getPrefix(chipfile)
    ctrl_tag = getPrefix(control)
    source = "".join([
        outputdir, "/PeakCalling/", chip_tag, ".tagAlign_VS_", ctrl_tag,
        ".tagAlign.regionPeak.gz"
    ])
    target = "".join([outputdir, "/finalsets/", prefix, "_spp.regionPeak.gz"])
    # Keep only the np best-ranked peaks.
    pipeline = ("zcat " + source + "|sort -k7nr,7nr|head -n " + str(np) +
                "|gzip -c > " + target)
    subprocess.call(pipeline, shell=True)
    print("Done")
Beispiel #3
0
def plotResults2(outputdir, rep1, rep2):
    """Plot the IDR results of one pair of files by calling two R scripts.

    The IDR result base name is ``outputdir/IDR/<p1>_VS_<p2>`` and the plot
    base name is ``outputdir/IDR/plots/<p1>_VS_<p2>``, where ``<p1>`` and
    ``<p2>`` come from ``getPrefix`` on ``rep1`` and ``rep2``.

    Returns None.
    """
    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    first = getPrefix(rep1)
    second = getPrefix(rep2)
    pair = first + "_VS_" + second
    plot_base = outputdir + "/IDR/plots/" + pair
    idr_base = outputdir + "/IDR/" + pair
    # Both command lines are built before either script is launched.
    commands = (
        "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 1 " + plot_base +
        " " + idr_base + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + idr_base +
        "-overlapped-peaks.txt -o=" + plot_base,
    )
    for rscript_call in commands:
        subprocess.call(rscript_call, shell=True)
    print("Done")
Beispiel #4
0
def plotResults2(outputdir, rep1, rep2):
    """Run the two R plotting scripts on the IDR output of ``rep1`` vs ``rep2``.

    Inputs are read from ``outputdir/IDR/<p1>_VS_<p2>*`` and plots are written
    with the base name ``outputdir/IDR/plots/<p1>_VS_<p2>``; ``<p1>``/``<p2>``
    are the ``getPrefix`` values of the two file names.

    Returns None.
    """
    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    label1 = getPrefix(rep1)
    label2 = getPrefix(rep2)
    plots = "".join([outputdir, "/IDR/plots/", label1, "_VS_", label2])
    results = "".join([outputdir, "/IDR/", label1, "_VS_", label2])
    consistency_plot = " ".join([
        "Rscript", "$RCHIPpipe_PATH/batch-consistency-plot.r", "1", plots,
        results, "$RCHIPpipe_PATH/functions-all-clayton-12-13.r"
    ])
    results_plot = " ".join([
        "Rscript", "$RCHIPpipe_PATH/IDR_results_plots.r",
        "-r=" + results + "-overlapped-peaks.txt", "-o=" + plots
    ])
    subprocess.call(consistency_plot, shell=True)
    subprocess.call(results_plot, shell=True)
    print("Done")
Beispiel #5
0
def consistency(file1, file2, control, outputdir):
    """Perform the IDR analysis between two peak-calling results.

    The regionPeak files of ``file1`` and ``file2`` (each called against
    ``control``) are looked up under ``outputdir/PeakCalling`` and handed to
    ``batch-consistency-analysis.r``; the output base name is
    ``outputdir/IDR/<p1>_VS_<p2>``.

    Returns None.
    """
    print("")
    print("IDR analysis for :" + file1 + " vs " + file2 + "...")
    first = getPrefix(file1)
    second = getPrefix(file2)
    ctrl = getPrefix(control)
    peak_suffix = ".tagAlign_VS_" + ctrl + ".tagAlign.regionPeak.gz"
    peaks1 = outputdir + "/PeakCalling/" + first + peak_suffix
    peaks2 = outputdir + "/PeakCalling/" + second + peak_suffix
    idr_base = outputdir + "/IDR/" + first + "_VS_" + second
    # Arguments are separated by single spaces in the shell command.
    arguments = [
        "Rscript",
        "$RCHIPpipe_PATH/batch-consistency-analysis.r",
        peaks1,
        peaks2,
        "-1",
        idr_base,
        "0",
        "F",
        "signal.value",
        "$RCHIPpipe_PATH/functions-all-clayton-12-13.r",
        "$RCHIPpipe_PATH/genome_table.txt",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
Beispiel #6
0
def consistency(file1, file2, control, outputdir):
    """Run ``batch-consistency-analysis.r`` (IDR analysis) on two peak files.

    Both inputs are
    ``outputdir/PeakCalling/<p>.tagAlign_VS_<ctrl>.tagAlign.regionPeak.gz``
    with ``<p>`` the ``getPrefix`` value of ``file1`` / ``file2`` and
    ``<ctrl>`` that of ``control``. Results are written with the base name
    ``outputdir/IDR/<p1>_VS_<p2>``.

    Returns None.
    """
    print("")
    print("IDR analysis for :" + file1 + " vs " + file2 + "...")
    tag1 = getPrefix(file1)
    tag2 = getPrefix(file2)
    tagctrl = getPrefix(control)

    def peak_path(tag):
        # regionPeak file of one sample called against the control
        return "".join([
            outputdir, "/PeakCalling/", tag, ".tagAlign_VS_", tagctrl,
            ".tagAlign.regionPeak.gz"
        ])

    input1 = peak_path(tag1)
    input2 = peak_path(tag2)
    result_base = "".join([outputdir, "/IDR/", tag1, "_VS_", tag2])
    rscript_call = (
        "Rscript $RCHIPpipe_PATH/batch-consistency-analysis.r " + input1 +
        " " + input2 + " -1 " + result_base +
        " 0 F signal.value $RCHIPpipe_PATH/functions-all-clayton-12-13.r"
        " $RCHIPpipe_PATH/genome_table.txt")
    subprocess.call(rscript_call, shell=True)
    print("Done")
Beispiel #7
0
def countConsistentPeaks(file1, file2, outputdir, thresh):
    """Count the peaks whose IDR value is at or below ``thresh``.

    Reads ``outputdir/IDR/<p1>_VS_<p2>-overlapped-peaks.txt``. Each line is
    split on single spaces; only lines with exactly 11 fields are considered,
    and the 11th field is compared (as a float) with ``thresh``.

    Returns the number of matching lines as an int.
    """
    left = getPrefix(file1)
    right = getPrefix(file2)
    table_path = "".join(
        [outputdir, "/IDR/", left, "_VS_", right, "-overlapped-peaks.txt"])
    with open(table_path, 'r') as table:
        rows = table.readlines()
    count = 0
    for row in rows:
        fields = row.split(" ", 13)
        if len(fields) != 11:
            continue  # not an 11-column row
        # The last field still carries the newline; float() tolerates it.
        if float(fields[10]) <= float(thresh):
            count += 1
    return count
Beispiel #8
0
def createFinalSets(poolfile, control, nt, np, outputdir, prefix):
    """Write the conservative and optimum peak sets of the pooled sample.

    Both sets are taken from the pooled regionPeak file under
    ``outputdir/PeakCalling``, sorted on column 7 (numeric, descending):
    the conservative set keeps the first ``nt`` rows, the optimum set keeps
    the first ``max(nt, np)`` rows. Outputs are gzipped into
    ``outputdir/finalsets``.

    Returns None.
    """
    print("Creating conservative peak set...")
    pool_tag = getPrefix(poolfile)
    ctrl_tag = getPrefix(control)
    source = (outputdir + "/PeakCalling/" + pool_tag + ".tagAlign_VS_" +
              ctrl_tag + ".tagAlign.regionPeak.gz")
    # Shared head of both pipelines: decompress, rank, then truncate.
    ranked = "zcat " + source + "|sort -k7nr,7nr|head -n "
    conservative = outputdir + "/finalsets/" + prefix + "_spp_conservative.regionPeak.gz"
    subprocess.call(ranked + str(nt) + "|gzip -c > " + conservative,
                    shell=True)
    print("Done")

    print("Creating optimum peak set...")
    optimum = outputdir + "/finalsets/" + prefix + "_spp_optimum.regionPeak.gz"
    keep = max(nt, np)
    subprocess.call(ranked + str(keep) + "|gzip -c > " + optimum, shell=True)
    print("Done")
Beispiel #9
0
def countConsistentPeaks(file1, file2, outputdir, thresh):
    """Return how many overlapped peaks have an IDR value <= ``thresh``.

    The table ``outputdir/IDR/<p1>_VS_<p2>-overlapped-peaks.txt`` is read in
    full. A line counts when splitting it on single spaces gives exactly 11
    fields and the 11th one, converted to float, does not exceed ``thresh``.
    """
    name1 = getPrefix(file1)
    name2 = getPrefix(file2)
    idr_table = outputdir + "/IDR/" + name1 + "_VS_" + name2 + "-overlapped-peaks.txt"
    with open(idr_table, 'r') as stream:
        records = [entry.split(" ", 13) for entry in stream.readlines()]
    # thresh is converted per matching record, so it is only parsed when at
    # least one 11-column record exists.
    return sum(1 for columns in records
               if len(columns) == 11
               and float(columns[10]) <= float(thresh))
Beispiel #10
0
def scanSeqMotif(sequencefile, motiffile, outputdir, prefix):
    """Scan a FASTA file for a motif with ``findMotifs.pl``.

    A working directory ``outputdir/<prefix>_<motif>`` is created through
    ``createOdir`` and the standard output of ``findMotifs.pl`` is redirected
    to ``outputdir/<prefix>_<motif>.txt``.

    Returns the path of that text file.
    """
    motif = getPrefix(motiffile)
    base = outputdir + "/" + prefix + "_" + motif
    report = base + ".txt"
    createOdir(base)
    pieces = [
        "findMotifs.pl", sequencefile, "fasta", base + "/", "-find",
        motiffile, ">", report
    ]
    subprocess.call(" ".join(pieces), shell=True)
    return report
Beispiel #11
0
def toTagAlign(file_in, outputdir):
    """Convert a BAM file to a gzipped tagAlign file.

    The shell pipeline is ``samtools view -b`` | ``bamToBed`` | ``awk`` (which
    overwrites column 4 with ``N``) | ``gzip``. The result is written to
    ``outputdir + getPrefix(file_in) + ".tagAlign.gz"``; ``outputdir`` is
    concatenated as-is.

    Returns None.
    """
    stem = getPrefix(file_in)
    target = outputdir + stem + ".tagAlign.gz"
    print("Transformation for file :" + file_in + "...")
    # The "\t" escapes are expanded by Python, so awk receives real tabs.
    awk_stage = "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{$4=\"N\"; print $0}'"
    stages = [
        "samtools view -b " + file_in,
        "bamToBed -i stdin",
        awk_stage,
        "gzip -c > " + target,
    ]
    subprocess.call("|".join(stages), shell=True)
    print("Transformation achieved...")
Beispiel #12
0
def createScoreFile(scanfile, scanbgfile, motiffile, outputdir, prefix):
    """Build a two-column score file from a motif scan and a background scan.

    For each input, lines starting with ``chr`` are kept, column 6 is
    extracted and a tab-separated label is appended: ``1`` for ``scanfile``
    and ``0`` for ``scanbgfile``. The first command overwrites the output,
    the second appends to it.

    Returns the output path
    ``outputdir/<prefix>_Score_<motif>.txt``.
    """
    motif = getPrefix(motiffile)
    outfile = "".join([outputdir, "/", prefix, "_Score_", motif, ".txt"])
    labelled_inputs = ((scanfile, "1", ">"), (scanbgfile, "0", ">>"))
    # Build both command lines up front, then run them in order.
    commands = [
        "cat " + source + " | grep \"^chr\" | cut -f 6 | awk '{print $0\"\t\"" +
        label + "}' " + redirect + " " + outfile
        for source, label, redirect in labelled_inputs
    ]
    for command in commands:
        subprocess.call(command, shell=True)
    return outfile
Beispiel #13
0
def peakCall(file_in, control, outputdir):
    """Call peaks on ``file_in`` against ``control`` with ``run_spp.R``.

    ``run_spp.R`` is invoked with ``-npeak=300000`` and ``-odir=outputdir``;
    statistics go to ``<base>_stats.tab`` and the script's standard output to
    ``<base>_peakCalling.log`` where ``<base>`` is
    ``outputdir + getPrefix(file_in)`` (``outputdir`` is concatenated as-is).

    Returns None.
    """
    print("")
    print("Peak calling for :" + file_in + "...")
    base = outputdir + getPrefix(file_in)
    arguments = [
        "Rscript $RCHIPpipe_PATH/run_spp.R",
        "-c=" + file_in,
        "-i=" + control,
        "-npeak=300000",
        "-odir=" + outputdir,
        "-savr",
        "-savp",
        "-rf",
        "-out=" + base + "_stats.tab",
        ">",
        base + "_peakCalling.log",
    ]
    subprocess.call(" ".join(arguments), shell=True)
    print("Done")
Beispiel #14
0
def peakCall(file_in, control, outputdir):
    """Run the spp peak caller (``run_spp.R``) for one sample/control pair.

    Output files are named after ``outputdir + getPrefix(file_in)``: the
    statistics table gets the suffix ``_stats.tab`` and the captured standard
    output the suffix ``_peakCalling.log``.

    Returns None.
    """
    print("")
    print("Peak calling for :" + file_in + "...")
    stem = outputdir + getPrefix(file_in)
    stats_table = stem + "_stats.tab"
    logfile = stem + "_peakCalling.log"
    rscript_call = "".join([
        "Rscript $RCHIPpipe_PATH/run_spp.R -c=", file_in,
        " -i=", control,
        " -npeak=300000 -odir=", outputdir,
        " -savr -savp -rf -out=", stats_table,
        " > ", logfile,
    ])
    subprocess.call(rscript_call, shell=True)
    print("Done")
Beispiel #15
0
def toTagAlign(file_in, outputdir):
    """Turn the BAM file ``file_in`` into ``<outputdir><prefix>.tagAlign.gz``.

    A single shell pipeline does the work: ``samtools view -b`` feeds
    ``bamToBed``, ``awk`` replaces the 4th column with ``N`` (tab-separated
    input and output), and ``gzip`` writes the result.

    Returns None.
    """
    destination = "".join([outputdir, getPrefix(file_in), ".tagAlign.gz"])
    print("Transformation for file :" + file_in + "...")
    # Non-raw string on purpose: Python expands "\t" so awk gets literal tabs.
    set_name_column = "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{$4=\"N\"; print $0}'"
    pipeline = ("samtools view -b " + file_in + "|bamToBed -i stdin|" +
                set_name_column + "|gzip -c > " + destination)
    subprocess.call(pipeline, shell=True)
    print("Transformation achieved...")
Beispiel #16
0
def createScoreFile(scanfile, scanbgfile, motiffile, outputdir, prefix):
    """Merge scan scores and background scores into one labelled file.

    Column 6 of every ``chr``-prefixed line is written with a trailing
    tab-separated label: ``1`` for lines of ``scanfile`` (output truncated
    first) and ``0`` for lines of ``scanbgfile`` (appended).

    Returns the path ``outputdir/<prefix>_Score_<motif>.txt``.
    """
    motif_tag = getPrefix(motiffile)
    score_path = outputdir + "/" + prefix + "_Score_" + motif_tag + ".txt"
    # Common middle part of both pipelines; "\t" becomes a real tab here.
    extract = " | grep \"^chr\" | cut -f 6 | awk '{print $0\"\t\""
    positives = "cat " + scanfile + extract + "1}' > " + score_path
    negatives = "cat " + scanbgfile + extract + "0}' >> " + score_path
    subprocess.call(positives, shell=True)
    subprocess.call(negatives, shell=True)
    return score_path
	
Beispiel #17
0
def annotatePeaks(filein, outputdir, annofile, prefix):
    """Keep the peaks of ``filein`` that overlap regions of ``annofile``.

    Uses ``bedtools intersect -wa -f 0.51``, then sorts the rows on columns 1
    and 2 and removes duplicates. The result is written to
    ``outputdir/<prefix>_<anno>.bed`` where ``<anno>`` is
    ``getPrefix(annofile)``.

    Returns the path of the written BED file.
    """
    print("")
    print("searching for peaks in " + filein + " that fall into " + annofile +
          " region file...")
    region = getPrefix(annofile)
    bedfile = outputdir + "/" + prefix + "_" + region + ".bed"
    pipeline = ("bedtools intersect -a " + filein + " -b " + annofile +
                " -wa -f 0.51 | sort -k1,1 -k2,2n | uniq > " + bedfile)
    subprocess.call(pipeline, shell=True)
    print("Done")
    print("Annotated file can be found at :" + bedfile)
    return bedfile
Beispiel #18
0
def splitFile(file_in, outputdir):
    """Split a gzipped tagAlign file into two shuffled pseudo-replicates.

    Steps: gunzip the input to count its lines, gzip it again, pipe it
    through ``shuf`` and ``split``, then gzip the two chunks and rename them
    ``<prefix>_PR1.tagAlign.gz`` / ``<prefix>_PR2.tagAlign.gz`` with
    ``<prefix> = outputdir + getPrefix(file_in)``.

    Parameters:
        file_in: gzipped tagAlign file; once decompressed it must be found at
            ``outputdir + getPrefix(file_in) + ".tagAlign"``.
        outputdir: directory prefix, concatenated as-is (should end with a
            path separator).

    Returns:
        None.
    """
    print("")
    print("splitting file :" + file_in + "...")
    prprefix = outputdir + getPrefix(file_in)
    unzipname = prprefix + ".tagAlign"

    subprocess.call("gunzip " + file_in, shell=True)  # unzip file
    # Count "\n"-terminated lines lazily (same unit as `split -l`) and close
    # the handle deterministically.
    with open(unzipname, 'r') as handle:
        linecount = sum(1 for _ in handle)
    # Ceiling of half the line count, in integer arithmetic. Rounding down
    # would make `split` produce a third file for odd counts, which the
    # commands below never pick up.
    half = (linecount + 1) // 2
    subprocess.call("gzip " + unzipname, shell=True)  # zip file again

    # Shuffle the file and split it into two files of (almost) equal size.
    shuffle_split = ("zcat " + file_in + "|shuf|split -d -l " + str(half) +
                     " - " + prprefix)
    subprocess.call(shuffle_split, shell=True)

    # Zip the two resulting files ("00" and "01" suffixes), then rename them.
    for suffix, label in (("00", "_PR1"), ("01", "_PR2")):
        subprocess.call("gzip " + prprefix + suffix, shell=True)
    for suffix, label in (("00", "_PR1"), ("01", "_PR2")):
        subprocess.call(
            "mv " + prprefix + suffix + ".gz " + prprefix + label +
            ".tagAlign.gz",
            shell=True)
    print(file_in + " splitted...")
    return
Beispiel #19
0
def trimmoSe(fastqfile, outputdir, adapter):
    """Trim a single-end gzipped FASTQ file with Trimmomatic.

    The read length is taken from the second line of ``fastqfile`` (the first
    sequence line) and used as Trimmomatic's ``MINLEN``. The trimmed reads
    are written to ``outputdir/fastq_trim/<prefix>_trim.fastq.gz``.

    Parameters:
        fastqfile: path of the gzipped FASTQ file.
        outputdir: output directory; a temporary ``temp.txt`` is created and
            removed inside it.
        adapter: adapter file name, looked up under
            ``$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/``.

    Returns:
        The path of the trimmed FASTQ file.

    Raises:
        ValueError: if no sequence line could be read from ``fastqfile``.
    """
    print("")
    tmp1 = getPrefix(fastqfile)
    probe = outputdir + "/temp.txt"
    command1 = "zcat " + fastqfile + " | head -n 2 | tail -n 1 > " + probe
    subprocess.call(command1, shell=True)
    try:
        with open(probe, 'r') as handle:
            lines = handle.readlines()
    finally:
        # Always clean up the probe file, even when reading it failed.
        if os.path.exists(probe):
            os.remove(probe)
    if not lines:
        # Previously this surfaced as an unbound `readlength` further down.
        raise ValueError("could not read a sequence line from " + fastqfile)
    # Strip the line terminator instead of assuming exactly one trailing
    # character, so a line without a final newline is not under-counted.
    readlength = len(lines[-1].rstrip("\r\n"))
    print("read length for this file: " + str(readlength))
    fileout = outputdir + "/fastq_trim/" + tmp1 + "_trim.fastq.gz"
    command2 = (
        "java -jar $RCHIPpipe_PATH/Trimmomatic-0.35/trimmomatic-0.35.jar SE -phred33 "
        + fastqfile + " " + fileout +
        " ILLUMINACLIP:$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/" + adapter +
        ":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:" +
        str(readlength))
    subprocess.call(command2, shell=True)
    return fileout
Beispiel #20
0
def trimmoSe(fastqfile, outputdir, adapter):
    """Run Trimmomatic in single-end mode on a gzipped FASTQ file.

    The length of the first sequence line (line 2 of the file) is measured
    through a temporary file ``outputdir/temp.txt`` and passed to Trimmomatic
    as ``MINLEN``. Output goes to
    ``outputdir/fastq_trim/<prefix>_trim.fastq.gz``.

    Parameters:
        fastqfile: path of the gzipped FASTQ file.
        outputdir: output directory (also hosts the temporary file).
        adapter: adapter file name under
            ``$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/``.

    Returns:
        The path of the trimmed FASTQ file.

    Raises:
        ValueError: if the FASTQ file yields no sequence line.
    """
    print("")
    tmp1 = getPrefix(fastqfile)
    temp_path = outputdir + "/temp.txt"
    subprocess.call(
        "zcat " + fastqfile + " | head -n 2 | tail -n 1 > " + temp_path,
        shell=True)
    try:
        with open(temp_path, 'r') as temp_file:
            lines = temp_file.readlines()
    finally:
        # Remove the temporary file whether or not it could be read.
        if os.path.exists(temp_path):
            os.remove(temp_path)
    if not lines:
        # An empty probe used to leave `readlength` undefined and crash later
        # with an unrelated-looking error.
        raise ValueError("could not read a sequence line from " + fastqfile)
    # Measure the read without its line terminator rather than subtracting a
    # fixed 1, which is wrong when the line has no trailing newline.
    readlength = len(lines[-1].rstrip("\r\n"))
    print("read length for this file: " + str(readlength))
    fileout = outputdir + "/fastq_trim/" + tmp1 + "_trim.fastq.gz"
    command2 = "".join([
        "java -jar $RCHIPpipe_PATH/Trimmomatic-0.35/trimmomatic-0.35.jar SE -phred33 ",
        fastqfile, " ", fileout,
        " ILLUMINACLIP:$RCHIPpipe_PATH/Trimmomatic-0.35/adapters/", adapter,
        ":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:",
        str(readlength),
    ])
    subprocess.call(command2, shell=True)
    return fileout
Beispiel #21
0
def plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2,
                poolpr1, poolpr2):
    """Plot IDR results for replicates, pseudo-replicates and pooled data.

    Three groups of R script calls are issued:

    1. ``rep1`` vs ``rep2``;
    2. the pseudo-replicates of each replicate (``rep1pr1`` vs ``rep1pr2``
       and ``rep2pr1`` vs ``rep2pr2``), with one combined consistency plot
       and one results plot per pair;
    3. the pooled pseudo-replicates (``poolpr1`` vs ``poolpr2``).

    IDR results are read from ``outputdir/IDR/<a>_VS_<b>*`` and plots are
    written under ``outputdir/IDR/plots/``; ``<a>``/``<b>`` are the
    ``getPrefix`` values of the two file names of a pair.

    Returns None.
    """
    idr_dir = outputdir + "/IDR/"
    plot_dir = outputdir + "/IDR/plots/"
    consistency_plot = "Rscript $RCHIPpipe_PATH/batch-consistency-plot.r "
    functions = " $RCHIPpipe_PATH/functions-all-clayton-12-13.r"
    results_plot = "Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r="
    overlapped = "-overlapped-peaks.txt -o="

    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    pair = getPrefix(rep1) + "_VS_" + getPrefix(rep2)
    fileout = plot_dir + pair
    filename = idr_dir + pair
    subprocess.call(
        consistency_plot + "1 " + fileout + " " + filename + functions,
        shell=True)
    subprocess.call(results_plot + filename + overlapped + fileout,
                    shell=True)
    print("Done")

    print("Plotting IDR results for pseudo-replicates....")
    pair1 = getPrefix(rep1pr1) + "_VS_" + getPrefix(rep1pr2)
    pair2 = getPrefix(rep2pr1) + "_VS_" + getPrefix(rep2pr2)
    fileout = plot_dir + "pseudo-replicates"
    fileout1 = plot_dir + pair1
    fileout2 = plot_dir + pair2
    filename1 = idr_dir + pair1
    filename2 = idr_dir + pair2
    subprocess.call(
        consistency_plot + "2 " + fileout + " " + filename1 + " " +
        filename2 + functions,
        shell=True)
    subprocess.call(results_plot + filename1 + overlapped + fileout1,
                    shell=True)
    # Each output must be drawn from its own pair's results: the plot for the
    # second replicate reads filename2 (it used to re-read filename1).
    subprocess.call(results_plot + filename2 + overlapped + fileout2,
                    shell=True)
    print("Done")

    print("Plotting IDR results for pool pseudo-replicates...")
    pair = getPrefix(poolpr1) + "_VS_" + getPrefix(poolpr2)
    fileout = plot_dir + pair
    filename = idr_dir + pair
    subprocess.call(
        consistency_plot + "1 " + fileout + " " + filename + functions,
        shell=True)
    subprocess.call(results_plot + filename + overlapped + fileout,
                    shell=True)
    print("Done")
    return
Beispiel #22
0
def plotResults(outputdir, rep1, rep2, rep1pr1, rep1pr2, rep2pr1, rep2pr2, poolpr1, poolpr2):
    """Call the R plotting scripts for every IDR comparison of a run.

    Plots are produced for ``rep1`` vs ``rep2``, for the pseudo-replicate
    pairs of each replicate (``rep1pr1``/``rep1pr2`` and
    ``rep2pr1``/``rep2pr2``) and for the pooled pseudo-replicates
    (``poolpr1``/``poolpr2``). Inputs live under ``outputdir/IDR/`` and plots
    are written under ``outputdir/IDR/plots/``, both named
    ``<a>_VS_<b>`` after the ``getPrefix`` values of the pair.

    Returns None.
    """
    def run(command):
        # every plotting step is a shell command line
        subprocess.call(command, shell=True)

    print("Plotting IDR results for " + rep1 + " and " + rep2 + "...")
    tmp1 = getPrefix(rep1)
    tmp2 = getPrefix(rep2)
    fileout = outputdir + "/IDR/plots/" + tmp1 + "_VS_" + tmp2
    filename = outputdir + "/IDR/" + tmp1 + "_VS_" + tmp2
    run("Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 1 " + fileout + " " + filename + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r")
    run("Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + filename + "-overlapped-peaks.txt -o=" + fileout)
    print("Done")

    print("Plotting IDR results for pseudo-replicates....")
    tmp1 = getPrefix(rep1pr1)
    tmp2 = getPrefix(rep1pr2)
    tmp3 = getPrefix(rep2pr1)
    tmp4 = getPrefix(rep2pr2)
    fileout = outputdir + "/IDR/plots/pseudo-replicates"
    fileout1 = outputdir + "/IDR/plots/" + tmp1 + "_VS_" + tmp2
    fileout2 = outputdir + "/IDR/plots/" + tmp3 + "_VS_" + tmp4
    filename1 = outputdir + "/IDR/" + tmp1 + "_VS_" + tmp2
    filename2 = outputdir + "/IDR/" + tmp3 + "_VS_" + tmp4
    run("Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 2 " + fileout + " " + filename1 + " " + filename2 + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r")
    run("Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + filename1 + "-overlapped-peaks.txt -o=" + fileout1)
    # The second pair's plot must read the second pair's results (filename2);
    # reading filename1 here duplicated the first pair's plot under the
    # second pair's name.
    run("Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + filename2 + "-overlapped-peaks.txt -o=" + fileout2)
    print("Done")

    print("Plotting IDR results for pool pseudo-replicates...")
    tmp1 = getPrefix(poolpr1)
    tmp2 = getPrefix(poolpr2)
    fileout = outputdir + "/IDR/plots/" + tmp1 + "_VS_" + tmp2
    filename = outputdir + "/IDR/" + tmp1 + "_VS_" + tmp2
    run("Rscript $RCHIPpipe_PATH/batch-consistency-plot.r 1 " + fileout + " " + filename + " $RCHIPpipe_PATH/functions-all-clayton-12-13.r")
    run("Rscript $RCHIPpipe_PATH/IDR_results_plots.r -r=" + filename + "-overlapped-peaks.txt -o=" + fileout)
    print("Done")
    return
Beispiel #23
0
def fastQc(fastqfile, outputdir):
    """Run FastQC on a FASTQ file and summarise the failed/warned tests.

    ``fastqc`` writes its report into ``outputdir/fastqc_report/``; the
    summary is then read from
    ``outputdir/fastqc_report/<prefix>_fastqc/summary.txt`` where ``<prefix>``
    is ``getPrefix(fastqfile)``. Each tab-separated summary line whose first
    field is ``FAIL`` is reported by name, and lines whose first field is
    ``WARN`` are counted.

    Returns None; the outcome is only printed.
    """
    print("running fastqc tool for : " + fastqfile + "...")
    prefix = getPrefix(fastqfile)
    report_dir = outputdir + "/fastqc_report/"
    subprocess.call("fastqc -o " + report_dir + " " + fastqfile, shell=True)
    # NOTE(review): assumes fastqc left an extracted "<prefix>_fastqc"
    # directory next to the report - confirm for the fastqc version in use.
    with open(report_dir + prefix + "_fastqc/summary.txt", 'r') as summary:
        lines = summary.readlines()
    failed = False
    nbwarn = 0
    for line in lines:
        value = line.split("\t", 4)  # status, test name, file name
        if value[0] == 'FAIL':
            failed = True
            print(value[1] + " have failed quality check")
        elif value[0] == 'WARN':
            nbwarn += 1
    if not failed:
        # This branch means no FAIL line was seen; the former wording
        # ("Any test failed ...") read as the opposite.
        print("No test failed quality check")
    print(str(nbwarn) + " tests present warnings")
    print("Done")
    return
Beispiel #24
0
def scoreAUC(scorefile, outputdir):
    """Run ``Calculate_AUC.r`` on a score file.

    The R script receives ``-i=<scorefile>`` and the output path
    ``outputdir/<prefix>_AUC.txt`` where ``<prefix>`` is
    ``getPrefix(scorefile)``.

    Returns None.
    """
    stem = getPrefix(scorefile)
    auc_path = "".join([outputdir, "/", stem, "_AUC.txt"])
    arguments = [
        "Rscript", "$RCHIPpipe_PATH/Calculate_AUC.r", "-i=" + scorefile,
        auc_path
    ]
    subprocess.call(" ".join(arguments), shell=True)
Beispiel #25
0
def sortScoreFile(scorefile, outputdir):
    """Sort a score file on its first column (numeric, descending).

    The sorted copy is written to ``outputdir/<prefix>_2.txt`` where
    ``<prefix>`` is ``getPrefix(scorefile)``.

    Returns the path of the sorted file.
    """
    stem = getPrefix(scorefile)
    sorted_path = "".join([outputdir, "/", stem, "_2.txt"])
    subprocess.call(" ".join(["sort", "-k1,1nr", scorefile, ">", sorted_path]),
                    shell=True)
    return sorted_path
Beispiel #26
0
def newName(file_in, outputdir):
    """Return the tagAlign file name derived from ``file_in``.

    The name is ``outputdir + getPrefix(file_in) + ".tagAlign.gz"``;
    ``outputdir`` is concatenated as-is, without adding a separator.
    """
    return outputdir + getPrefix(file_in) + ".tagAlign.gz"
Beispiel #27
0
def plotRocCurve(scorefile, outputdir):
    """Run ``produceROC.r`` on a score file.

    The R script receives ``-i=<scorefile>`` and the image path
    ``outputdir/<prefix>_ROC.png`` where ``<prefix>`` is
    ``getPrefix(scorefile)``.

    Returns None.
    """
    stem = getPrefix(scorefile)
    image_path = "".join([outputdir, "/", stem, "_ROC.png"])
    arguments = [
        "Rscript", "$RCHIPpipe_PATH/produceROC.r", "-i=" + scorefile,
        image_path
    ]
    subprocess.call(" ".join(arguments), shell=True)
Beispiel #28
0
def scoreAUC(scorefile, outputdir):
    """Invoke ``Calculate_AUC.r`` for ``scorefile``.

    The output path handed to the script is ``outputdir/<prefix>_AUC.txt``,
    ``<prefix>`` being ``getPrefix(scorefile)``.

    Returns None.
    """
    result_path = outputdir + "/" + getPrefix(scorefile) + "_AUC.txt"
    rscript_call = "".join([
        "Rscript $RCHIPpipe_PATH/Calculate_AUC.r -i=", scorefile, " ",
        result_path
    ])
    subprocess.call(rscript_call, shell=True)
Beispiel #29
0
def mainCp(argv):
    """Run the peak-calling pipeline for two replicates.

    Options are read from ``argv`` (``argv[0]`` is skipped). The pipeline
    then goes through eight steps, announced on standard output:

    1. convert the BAM inputs to tagAlign files;
    2. merge the control files when a second control is given;
    3. pool the two replicates;
    4. split replicates and pool into pseudo-replicates;
    5. call peaks on every file;
    6. IDR analysis (skipped when ``idr == 'OFF'``);
    7. IDR plots (skipped when ``plot == 'OFF'``);
    8. final peak sets (skipped when ``finalsets == 'OFF'`` or when step 6
       did not run, because the peak counts it produces are required).

    Parameters:
        argv: full command-line argument list, program name included.

    Returns:
        None. Exits with status 1 after printing the usage message when no
        option is given.
    """
    if len(argv) == 1:  # no option given: print usage message and exit
        usageCp()
        sys.exit(1)
    # initialize parameters to default
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = initParamCp()
    # read options from the command line and change parameters if necessary
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = readOptCp(
         argv[1:], outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup,
         idr, idrthresh, finalsets, plot, prefix)
    checkRequiredCp(rep1, rep2, ctrl1)  # check the required parameters
    if selectodir == 'false':  # no output directory specified: create one
        createOdir(outputdir)
    welcomeCp()  # print welcome message
    if prefix == '':  # no prefix given on the command line: use a default
        prefix = 'CallPeaks'
    # print a summary of all parameters used
    parametersCp(outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup,
                 idr, idrthresh, finalsets, plot, prefix)
    running()  # print running message

    tagdir = outputdir + "/tagAlignfiles/"
    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")  # folder for transformed files
    for file_in in (rep1, rep2, ctrl1):
        toTagAlign(file_in, tagdir)  # convert bam files to tagAlign files
    if ctrlsup == 'true':  # a second control file is given: convert it too
        toTagAlign(ctrl2, tagdir)
        ctrl2 = newName(ctrl2, tagdir)
    # from here on the variables point at the tagAlign files
    rep1 = newName(rep1, tagdir)
    rep2 = newName(rep2, tagdir)
    ctrl1 = newName(ctrl1, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Merging Control files (if two files are given)...")
    if ctrlsup == 'true':  # two control files: merge them as one
        ctrlfile = mergeFile(ctrl1, ctrl2, tagdir, 'Control')
        print("Step2 : Merging Control files achieved...")
    else:
        print("skipped")
        ctrlfile = ctrl1

    print("")
    print("Step3 : Creating Pool of replicates...")
    poolfile = mergeFile(rep1, rep2, tagdir, 'Pool')  # pool both samples
    print("Step3 : Creating Pool of replicates achieved...")

    print("")
    print("Step4 : Splitting samples files into pseudo replicates...")
    for file_in in (rep1, rep2, poolfile):  # two pseudo replicates per file
        splitFile(file_in, tagdir)
    print("Step4 : Splitting samples files into pseudo replicates achieved...")
    print("")

    # names of the pseudo-replicate files written by splitFile
    prefixr1 = tagdir + getPrefix(rep1)
    prefixr2 = tagdir + getPrefix(rep2)
    prefixpool = tagdir + getPrefix(poolfile)
    r1pr1 = prefixr1 + "_PR1.tagAlign.gz"
    r1pr2 = prefixr1 + "_PR2.tagAlign.gz"
    r2pr1 = prefixr2 + "_PR1.tagAlign.gz"
    r2pr2 = prefixr2 + "_PR2.tagAlign.gz"
    poolpr1 = prefixpool + "_PR1.tagAlign.gz"
    poolpr2 = prefixpool + "_PR2.tagAlign.gz"

    print("Step5 : Peak Calling for each files (replicates, pool and pseudo replicates)...")
    createOdir(outputdir + "/PeakCalling")  # folder for peak calling output
    for i in (rep1, rep2, poolfile, r1pr1, r1pr2, r2pr1, r2pr2, poolpr1,
              poolpr2):
        peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
    print("Step5 : Peak Calling for each files achieved...")

    print("")
    print("Step6 : IDR analysis...")
    # Stay None when the IDR analysis is skipped, so that step 8 can tell
    # that the peak counts are unavailable.
    nt = None
    np = None
    if idr == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR")  # folder for IDR output
        # IDR analysis between replicates and between pseudo-replicates
        consistency(rep1, rep2, ctrlfile, outputdir)
        consistency(r1pr1, r1pr2, ctrlfile, outputdir)
        consistency(r2pr1, r2pr2, ctrlfile, outputdir)
        consistency(poolpr1, poolpr2, ctrlfile, outputdir)
        # number of peaks at or below the IDR threshold in each output file
        nt = countConsistentPeaks(rep1, rep2, outputdir, idrthresh)
        np = countConsistentPeaks(poolpr1, poolpr2, outputdir, idrthresh)
        n1 = countConsistentPeaks(r1pr1, r1pr2, outputdir, idrthresh)
        n2 = countConsistentPeaks(r2pr1, r2pr2, outputdir, idrthresh)
        exportResults(nt, np, n1, n2, outputdir)  # export the metrics
        print("Step6 : IDR analysis achieved...")

    print("")
    print("Step7 : Plotting IDR results...")
    if plot == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR/plots")  # folder for IDR plots
        plotResults(outputdir, rep1, rep2, r1pr1, r1pr2, r2pr1, r2pr2,
                    poolpr1, poolpr2)
        print("Step7 : Plotting IDR results achieved...")

    print("")
    print("Step8 : Creating final sets of peaks...")
    if finalsets == 'OFF':
        print("Skipped")
    elif nt is None or np is None:
        # The final sets are sized by the IDR peak counts; without step 6
        # this used to fail on the undefined nt/np.
        print("Skipped (IDR analysis is required to create final sets)")
    else:
        createOdir(outputdir + "/finalsets")  # folder for final peak sets
        createFinalSets(poolfile, ctrlfile, nt, np, outputdir, prefix)
        print("Step8 : Creating final sets of peaks achieved...")
    goodbyeCp()  # print end message
    return
Beispiel #30
0
def mainCpnr(argv):
    """Run the peak-calling pipeline for a single sample (no replicate).

    Options are read from ``argv`` (``argv[0]`` is skipped). After the BAM to
    tagAlign conversion, two mutually exclusive paths exist:

    * ``spp == 'OFF'``: optional cross-correlation check (``qc == 'ON'``)
      followed by peak calling through ``peakCallMacs``;
    * ``spp == 'ON'``: the sample is split into pseudo-replicates, peaks are
      called with ``peakCall``, an IDR analysis is run between the
      pseudo-replicates with a fixed threshold of 0.01, and the results are
      plotted and turned into a final peak set.

    Parameters:
        argv: full command-line argument list, program name included.

    Returns:
        None. Exits with status 1 after printing the usage message when no
        option is given.
    """
    if len(argv) == 1:  # no option given: print usage message and exit
        usageCpnr()
        sys.exit(1)
    # initialize all parameters to default
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
     prefix, spp) = initParamCpnr()
    # read options on the command line and change parameters if necessary
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
     prefix, spp) = readOptCpnr(argv[1:], outputdir, selectodir, bamfile,
                                ctrlfile, thresh, pvalue, qvalue, qc, prefix,
                                spp)
    checkRequiredCpnr(bamfile, ctrlfile)  # check the required options
    if selectodir == 'false':  # no output directory specified: create one
        createOdir(outputdir)
    welcomeCpnr()  # print welcome message
    if prefix == '':  # no prefix given on the command line: use a default
        prefix = 'CallPeaks_norep'
    # print a summary of all parameters used
    parametersCpnr(outputdir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
                   prefix, spp)
    running()  # print running message

    tagdir = outputdir + "/tagAlignfiles/"
    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")  # folder for tagAlign files
    for file_in in (bamfile, ctrlfile):  # convert from bam to tagAlign
        toTagAlign(file_in, tagdir)
    chipfile = newName(bamfile, tagdir)  # tagAlign names of the inputs
    ctrlfile = newName(ctrlfile, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks...")
    createOdir(outputdir + "/PeakCalling")
    if qc == 'ON' and spp == 'OFF':  # cross-correlation analysis is asked
        qualCheck(chipfile, outputdir + "/PeakCalling", prefix)
        # completion message: say "achieved" like every other step does
        print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks achieved...")
        print("")
    else:
        print("skipped")
        print("")

    print("Step3 : PeakCalling using macs2...")
    if spp == 'OFF':
        peakCallMacs(chipfile, ctrlfile, outputdir, prefix, pvalue, qvalue,
                     thresh)
        print("Step3 : PeakCalling using macs2 achieved...")
        print("")
    else:
        print("skipped")
        print("")

    if spp == 'ON':
        print("Step4 : PeakCalling based on adaptated IDR analysis...")
        print("\tStep4a : Splitting file into pseudo-replicates...")
        splitFile(chipfile, tagdir)
        print("\tStep4a : Splitting file into pseudo-replicates achieved...")
        prefixchip = tagdir + getPrefix(chipfile)
        chippr1 = prefixchip + "_PR1.tagAlign.gz"
        chippr2 = prefixchip + "_PR2.tagAlign.gz"

        print("Step4b : Peak Calling for each files (replicates and pseudo replicates)...")
        # NOTE(review): this folder was already requested in step 2; kept as
        # is - confirm createOdir tolerates an existing directory.
        createOdir(outputdir + "/PeakCalling")
        for i in (chipfile, chippr1, chippr2):
            peakCall(i, ctrlfile, outputdir + "/PeakCalling/")
        print("Step4b : Peak Calling for each files (replicates and pseudo replicates) achieved...")

        print("Step4c : IDR analysis...")
        createOdir(outputdir + "/IDR")  # folder for IDR output
        consistency(chippr1, chippr2, ctrlfile, outputdir)
        idrthresh = 0.01
        np = countConsistentPeaks(chippr1, chippr2, outputdir, idrthresh)
        print("number of consistent peaks between the two pseudo replicates: " + str(np))
        print("Step4c : IDR analysis achieved...")

        print("Step4d : Plotting IDR results...")
        createOdir(outputdir + "/IDR/plots")  # folder for IDR plots
        plotResults2(outputdir, chippr1, chippr2)
        print("Step4d : Plotting IDR results achieved...")

        # this sub-step belongs to step 4 (it used to be labelled "Step5e")
        print("Step4e : Creating final sets of peaks...")
        createOdir(outputdir + "/finalsets")  # folder for final peak sets
        createFinalSets2(chipfile, ctrlfile, np, outputdir, prefix)
    # NOTE(review): calls the goodbye routine named after the replicate
    # pipeline - confirm this is intended for the no-replicate pipeline.
    goodbyeCp()  # print end of analysis message
    return
Beispiel #31
0
def newName(file_in, outputdir):
    """Build the tagAlign file name that corresponds to ``file_in``.

    ``outputdir``, ``getPrefix(file_in)`` and ``".tagAlign.gz"`` are joined
    without any separator, so ``outputdir`` should already end with one.
    """
    stem = getPrefix(file_in)
    return "".join((outputdir, stem, ".tagAlign.gz"))
Beispiel #32
0
def mainCp(argv):
    """Drive the replicate-based CallPeaks pipeline from start to finish.

    The steps run in this order: convert the inputs to tagAlign, merge the
    two controls when a second one is given, pool the two replicates, split
    each sample into two pseudo-replicates, call peaks on every file, then
    (each one optional) run the IDR analysis, plot its results and create
    the final peak sets.

    argv -- full argument vector, program name included.  When it holds
            nothing but the program name, the usage message is printed and
            the process exits with status 1.

    Returns None.

    The final peak sets are built from the peak counts computed by the IDR
    step, so they are skipped (with a message) when the IDR step is turned
    off instead of failing on unassigned variables.
    """
    # NOTE: print is always called with one parenthesised argument, which
    # produces the same output under the Python 2 print statement used by
    # the rest of the file.
    if len(argv) == 1:  # only the program name: no argument was given
        usageCp()
        sys.exit(1)

    # Start from the defaults, then let the command line override them.
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = initParamCp()
    (outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr, idrthresh,
     finalsets, plot, prefix) = readOptCp(
        argv[1:], outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup,
        idr, idrthresh, finalsets, plot, prefix)
    checkRequiredCp(rep1, rep2, ctrl1)  # required parameters must be present
    if selectodir == 'false':  # no output directory was specified
        createOdir(outputdir)
    welcomeCp()
    if prefix == '':  # no prefix on the command line: use the default one
        prefix = 'CallPeaks'
    parametersCp(outputdir, selectodir, rep1, rep2, ctrl1, ctrl2, ctrlsup, idr,
                 idrthresh, finalsets, plot, prefix)  # summary of parameters
    running()

    tagdir = outputdir + "/tagAlignfiles/"  # every tagAlign file lives here

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (rep1, rep2, ctrl1):
        toTagAlign(file_in, tagdir)
    if ctrlsup == 'true':  # a second control file was given: convert it too
        toTagAlign(ctrl2, tagdir)
        ctrl2 = newName(ctrl2, tagdir)
    # From here on the variables point at the converted tagAlign files.
    rep1 = newName(rep1, tagdir)
    rep2 = newName(rep2, tagdir)
    ctrl1 = newName(ctrl1, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Merging Control files (if two files are given)...")
    if ctrlsup == 'true':
        ctrlfile = mergeFile(ctrl1, ctrl2, tagdir, 'Control')
        print("Step2 : Merging Control files achieved...")
    else:
        print("skipped")
        ctrlfile = ctrl1

    print("")
    print("Step3 : Creating Pool of replicates...")
    poolfile = mergeFile(rep1, rep2, tagdir, 'Pool')
    print("Step3 : Creating Pool of replicates achieved...")

    print("")
    print("Step4 : Splitting samples files into pseudo replicates...")
    for file_in in (rep1, rep2, poolfile):
        splitFile(file_in, tagdir)
    print("Step4 : Splitting samples files into pseudo replicates achieved...")
    print("")

    # Names of the two pseudo-replicate files of each sample.
    prefixr1 = tagdir + getPrefix(rep1)
    prefixr2 = tagdir + getPrefix(rep2)
    prefixpool = tagdir + getPrefix(poolfile)
    r1_pr1 = prefixr1 + "_PR1.tagAlign.gz"
    r1_pr2 = prefixr1 + "_PR2.tagAlign.gz"
    r2_pr1 = prefixr2 + "_PR1.tagAlign.gz"
    r2_pr2 = prefixr2 + "_PR2.tagAlign.gz"
    pool_pr1 = prefixpool + "_PR1.tagAlign.gz"
    pool_pr2 = prefixpool + "_PR2.tagAlign.gz"

    print("Step5 : Peak Calling for each files (replicates, pool and pseudo replicates)...")
    createOdir(outputdir + "/PeakCalling")
    for i in (rep1, rep2, poolfile, r1_pr1, r1_pr2, r2_pr1, r2_pr2,
              pool_pr1, pool_pr2):
        peakCall(i, ctrlfile, outputdir + "/PeakCalling/")  # peak calling with spp
    print("Step5 : Peak Calling for each files achieved...")

    print("")
    print("Step6 : IDR analysis...")
    # Only assigned by the IDR step; Step8 must not run without them.
    nt = None
    np = None
    if idr == 'OFF':
        print("Skipped")
    else:
        createOdir(outputdir + "/IDR")
        # IDR between the true replicates, then within each pseudo-replicate pair.
        consistency(rep1, rep2, ctrlfile, outputdir)
        consistency(r1_pr1, r1_pr2, ctrlfile, outputdir)
        consistency(r2_pr1, r2_pr2, ctrlfile, outputdir)
        consistency(pool_pr1, pool_pr2, ctrlfile, outputdir)
        # Number of peaks passing idrthresh in each IDR output.
        nt = countConsistentPeaks(rep1, rep2, outputdir, idrthresh)
        np = countConsistentPeaks(pool_pr1, pool_pr2, outputdir, idrthresh)
        n1 = countConsistentPeaks(r1_pr1, r1_pr2, outputdir, idrthresh)
        n2 = countConsistentPeaks(r2_pr1, r2_pr2, outputdir, idrthresh)
        exportResults(nt, np, n1, n2, outputdir)  # metrics as a tab delimited file
        print("Step6 : IDR analysis achieved...")

    print("")
    print("Step7 : Plotting IDR results...")
    if plot == 'OFF':
        print("Skipped")
    else:
        # NOTE(review): this still runs when idr == 'OFF'; presumably it then
        # relies on IDR output left by an earlier run -- confirm.
        createOdir(outputdir + "/IDR/plots")
        plotResults(outputdir, rep1, rep2, r1_pr1, r1_pr2, r2_pr1, r2_pr2,
                    pool_pr1, pool_pr2)
        print("Step7 : Plotting IDR results achieved...")

    print("")
    print("Step8 : Creating final sets of peaks...")
    if finalsets == 'OFF':
        print("Skipped")
    elif idr == 'OFF':
        # nt and np were never computed: calling createFinalSets here used
        # to abort the whole run with an UnboundLocalError.
        print("Skipped (IDR analysis is required to create final sets of peaks)")
    else:
        createOdir(outputdir + "/finalsets")
        createFinalSets(poolfile, ctrlfile, nt, np, outputdir, prefix)
        print("Step8 : Creating final sets of peaks achieved...")
    goodbyeCp()  # end-of-analysis message
    return
Beispiel #33
0
def mainCpnr(argv):
    """Drive the single-sample peak-calling pipeline (default prefix
    'CallPeaks_norep').

    The ChIP and control files are first converted to tagAlign.  Then,
    depending on the spp parameter:

    * spp == 'OFF': optional cross-correlation check (when qc == 'ON')
      followed by peak calling through peakCallMacs();
    * spp == 'ON': the ChIP file is split into two pseudo-replicates, peaks
      are called on the file and on both pseudo-replicates, an IDR
      analysis is run between the pseudo-replicates, its results are
      plotted and the final peak set is created.

    argv -- full argument vector, program name included.  When it holds
            nothing but the program name, the usage message is printed and
            the process exits with status 1.

    Returns None.
    """
    # NOTE: print is always called with one parenthesised argument, which
    # produces the same output under the Python 2 print statement used by
    # the rest of the file.
    if len(argv) == 1:  # only the program name: no argument was given
        usageCpnr()
        sys.exit(1)

    # Start from the defaults, then let the command line override them.
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
     prefix, spp) = initParamCpnr()
    (outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
     prefix, spp) = readOptCpnr(
        argv[1:], outputdir, selectodir, bamfile, ctrlfile, thresh, pvalue,
        qvalue, qc, prefix, spp)
    checkRequiredCpnr(bamfile, ctrlfile)  # required options must be present
    if selectodir == 'false':  # no output directory was specified
        createOdir(outputdir)
    welcomeCpnr()
    if prefix == '':  # no prefix on the command line: use the default one
        prefix = 'CallPeaks_norep'
    parametersCpnr(outputdir, bamfile, ctrlfile, thresh, pvalue, qvalue, qc,
                   prefix, spp)  # summary of all parameters used
    running()

    tagdir = outputdir + "/tagAlignfiles/"  # every tagAlign file lives here

    print("")
    print("Step1 : Transformation from bam file to tagAlign file...")
    createOdir(outputdir + "/tagAlignfiles")
    for file_in in (bamfile, ctrlfile):
        toTagAlign(file_in, tagdir)
    # From here on work with the converted tagAlign files.
    chipfile = newName(bamfile, tagdir)
    ctrlfile = newName(ctrlfile, tagdir)
    print("Step1 : Transformation from bam file to tagAlign file achieved...")

    print("")
    print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks...")
    createOdir(outputdir + "/PeakCalling")
    if qc == 'ON' and spp == 'OFF':  # cross-correlation analysis is asked
        qualCheck(chipfile, outputdir + "/PeakCalling", prefix)
        # Completion message: the original repeated the start message here.
        print("Step2 : Cross-correlation by phantomPeaksQualTools before calling peaks achieved...")
        print("")
    else:
        print("skipped")
        print("")

    print("Step3 : PeakCalling using macs2...")
    if spp == 'OFF':
        peakCallMacs(chipfile, ctrlfile, outputdir, prefix, pvalue, qvalue,
                     thresh)
        print("Step3 : PeakCalling using macs2 achieved...")
        print("")
    else:
        print("skipped")
        print("")

    if spp == 'ON':
        print("Step4 : PeakCalling based on adaptated IDR analysis...")
        print("\tStep4a : Splitting file into pseudo-replicates...")
        splitFile(chipfile, tagdir)
        print("\tStep4a : Splitting file into pseudo-replicates achieved...")

        # Names of the two pseudo-replicate files of the ChIP sample.
        prefixchip = tagdir + getPrefix(chipfile)
        chip_pr1 = prefixchip + "_PR1.tagAlign.gz"
        chip_pr2 = prefixchip + "_PR2.tagAlign.gz"

        print("Step4b : Peak Calling for each files (replicates and pseudo replicates)...")
        # Same directory as in Step2; the call is kept as in the original flow.
        createOdir(outputdir + "/PeakCalling")
        for i in (chipfile, chip_pr1, chip_pr2):
            peakCall(i, ctrlfile, outputdir + "/PeakCalling/")  # peak calling with spp
        print("Step4b : Peak Calling for each files (replicates and pseudo replicates) achieved...")

        print("Step4c : IDR analysis...")
        createOdir(outputdir + "/IDR")
        consistency(chip_pr1, chip_pr2, ctrlfile, outputdir)
        idrthresh = 0.01  # fixed IDR threshold for this mode (not a command-line option here)
        np = countConsistentPeaks(chip_pr1, chip_pr2, outputdir, idrthresh)
        print("number of consistent peaks between the two pseudo replicates: " + str(np))
        print("Step4c : IDR analysis achieved...")

        print("Step4d : Plotting IDR results...")
        createOdir(outputdir + "/IDR/plots")
        plotResults2(outputdir, chip_pr1, chip_pr2)
        print("Step4d : Plotting IDR results achieved...")

        # Last sub-step of Step4 (it was mislabelled "Step5e").
        print("Step4e : Creating final sets of peaks...")
        createOdir(outputdir + "/finalsets")
        createFinalSets2(chipfile, ctrlfile, np, outputdir, prefix)
    goodbyeCp()  # end-of-analysis message
    return
Beispiel #34
0
def plotRocCurve(scorefile, outputdir):
    """Run the produceROC.r script on *scorefile*.

    scorefile -- file handed to the R script through its "-i=" option.
    outputdir -- directory part of the second script argument, which is
                 "<outputdir>/<prefix of scorefile>_ROC.png".

    The command goes through the shell so that $RCHIPpipe_PATH is expanded
    there; its exit status is not checked.  Returns None.
    """
    png_path = "".join([outputdir, "/", getPrefix(scorefile), "_ROC.png"])
    pieces = [
        "Rscript $RCHIPpipe_PATH/produceROC.r -i=",
        scorefile,
        " ",
        png_path,
    ]
    subprocess.call("".join(pieces), shell=True)
    return