Example #1
0
def _segmentWeightedCons(start, arrayDict, motifStart, motifEnd):
    """Average a fixedStep track over the motif [motifStart, motifEnd].

    start     -- indexable, non-empty sequence of segment start coordinates
                 (presumably ascending; TODO confirm against countWig)
    arrayDict -- maps a segment start to (xs, xvals, sums, ll); ``ll`` is
                 used as the segment length, i.e. the segment is taken to
                 span [ss, ss+ll-1]

    Every overlapping segment contributes its countWig.queryHist average,
    weighted by the share of motif positions it covers.  Returns that
    number, 0 when no segment was selected or matched, or the string 'NA'
    when the only segments examined do not overlap the motif.
    """
    motifLen = motifEnd - motifStart + 1
    # Segments whose [start[i], start[i+1]) interval holds a motif end point.
    startlist = [
        start[i] for i in range(len(start) - 1)
        if start[i] <= motifStart < start[i + 1]
        or start[i] <= motifEnd < start[i + 1]
    ]
    if motifEnd > start[-1]:
        startlist.append(start[-1])

    avg = 0
    lastIdx = len(startlist) - 1
    for i, ss in enumerate(startlist):
        xs, xvals, sums, ll = arrayDict[ss]
        segEnd = ss + ll - 1
        if motifStart < ss <= motifEnd <= segEnd:  # left out, right in
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0]
                    * (motifEnd - ss + 1) / motifLen)
        elif ss <= motifStart < motifEnd <= segEnd:  # motif inside segment
            avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
        elif motifStart < ss and segEnd < motifEnd:  # motif spans segment
            if avg == 'NA':
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, ss, segEnd)[0]
                    * ll / motifLen)
        elif ss <= motifStart <= segEnd < motifEnd:  # left in, right out
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, motifStart, segEnd)[0]
                    * (ss + ll - motifStart) / motifLen)
        elif segEnd < motifStart:  # segment ends before the motif (gap)
            if avg == 0:
                avg = 'NA'
        elif motifEnd < ss:
            print("this should not happen-- motifStart < motifEnd < ss ")
            if avg == 0:
                avg = 'NA'
    return avg


def updateCons(path, db, motifChrom='chr17'):
    """update conservation scores in gzipped fixedStep wiggle format

    path       -- directory searched for ``*.wigFix.gz`` files; the file
                  name is split on '.', the first part is the chromosome
                  and the parts between it and ``wigFix.gz`` form consName
    db         -- mapping of collection names to collections; the
                  collection ``"hg19" + motifChrom`` is read and updated
    motifChrom -- chromosome whose motif collection is scored

    For every document in the collection the average score over
    ``genomic_region.start`` .. ``genomic_region.end`` is stored under
    ``cons.<consName>`` unless it is 'NA'.  Always returns 0.
    """
    mcollection = db["hg19" + motifChrom]
    for infile in glob.glob(os.path.join(path, "*.wigFix.gz")):
        wigpath, wigfilename = os.path.split(infile)
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        if chrom != motifChrom:
            # The collection only holds motifChrom motifs; scoring them
            # against another chromosome's track would store wrong values.
            continue
        consName = '_'.join(nameParts[1:-2])
        print('updating ' + str(consName))
        # Cache file is per chromosome AND track so that runs for different
        # chromosomes never pick up each other's binary cache.
        bwFile = os.path.join(wigpath, chrom + '.' + consName + '.bw')
        if not os.path.isfile(bwFile):
            # The raw wiggle is only needed to build a missing cache.
            with gzip.open(infile) as wigFile:
                countWig.compressFixWig(wigFile, consName, bwFile)
        stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
        start = startDict[consName][chrom]
        arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)
        cursor = mcollection.find()
        print(mcollection.count())
        for test in cursor:
            region = test["genomic_region"]
            avg = _segmentWeightedCons(start, arrayDict, region["start"], region["end"])
            if avg != 'NA':
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"cons." + consName: avg}},
                                   upsert=True)
    return 0
Example #2
0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=20, method="Binom"):
    """calculate fos from discontinuous variableStep wiggle files
    with two method options: NSD/Binomial test

    path       -- directory searched for ``wgEncodeUwDgf*_cut<motifChrom>``
    db         -- mapping of collection names to collections; the
                  collection ``"hg19" + motifChrom`` is read and updated
    motifChrom -- chromosome suffix of the input files / collection name
    dgfCutoff  -- minimum (count - countCent) required to store a score
    method     -- "NSD" or "Binom"

    Documents whose fos exceeds 0.95 (and pass dgfCutoff) get ``dgf.fos``
    set.  Raises ValueError for an unsupported method.  Always returns 0.
    """
    if method not in ("NSD", "Binom"):
        # Previously an unknown method left ``fos`` unbound (or stale from
        # the previous record) inside the loop.
        raise ValueError("unsupported method %r (expected 'NSD' or 'Binom')" % (method,))

    mcollection = db["hg19" + motifChrom]
    print('updating fos ' + str(motifChrom))
    for infile in glob.glob(os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)):
        # The glob guarantees the name ends with motifChrom; cut at the LAST
        # occurrence so a directory containing the chromosome name is safe.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        expName = "fos"
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            # Only open the raw wiggle when the binary cache must be built.
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
        hist = None  # built lazily on the first motif
        cursor = mcollection.find({"tf_name": {"$in": ["IRF3", "MAFK", "NFYA", "SIN3A", "ZNF384"]}})
        for test in cursor:
            if hist is None:
                hist = countWig.buildVarHist(motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            # flank width scales with the motif length
            flankWin = round((motifEnd - motifStart + 1) * 1.75)
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
            countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
            countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(xs, xvals, sums, motifStart - 100, motifEnd + 100, varWindow=True)[2]
            if method == "NSD":
                # NOTE(review): if queryHist returns Python ints this is a
                # truncating division under Python 2 -- confirm.
                try:
                    fos = np.sqrt((count - countCent) / count) - np.sqrt(countCent / count)
                except ZeroDivisionError:
                    fos = 0
            else:  # "Binom"
                try:
                    probL = float(motifEnd - motifStart) / (motifEnd - flankL)
                    probR = float(motifEnd - motifStart) / (flankR - motifStart)
                    fos = min(1 - binom.cdf(countCent, countTotL, probL),
                              1 - binom.cdf(countCent, countTotR, probR))
                except ZeroDivisionError:
                    fos = 0

            if fos > 0.95 and count - countCent > dgfCutoff:
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"dgf.fos": fos}},
                                   upsert=True)
    return 0
Example #3
0
def updateCount(path, tfName):
    """update count data in discontinuous variableStep wiggle format

    path   -- directory searched for ``*.wig`` files
    tfName -- value matched against ``tf_name`` in the module-level
              ``mcollection``

    For every matching document the count over its motif region is stored
    in ``ct_info.accessibility_score["Dnase"]`` and the document is saved.
    Always returns 0.
    """
    for infile in glob.glob(os.path.join(path, "*.wig")):
        wigfilename, _ext = os.path.splitext(infile)
        ##depends on the data type and source
        expName = "Dnase"  ##or add an expName parser line
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        with open(infile, 'rt') as wigFile:
            countWig.compressVarWig(wigFile, expName, wigfilename)
        arrayDict = {}
        cursor = mcollection.find({"tf_name": tfName})
        for test in cursor:
            region = test["motif_genomic_regions_info"]
            motifChrom = region["chr"]
            motifStart = region["start"]
            motifEnd = region["end"]
            if motifChrom not in arrayDict:
                # The chromosome is only known per document, so its binary
                # file is loaded here.  The old code read motifChrom before
                # any assignment and failed with UnboundLocalError.
                # Assumes compressVarWig wrote <wigfilename><chrom>.bw --
                # TODO confirm against countWig.
                coordDict, valuesDict = countWig.getBinVarCoord(
                    wigfilename + motifChrom + '.bw', ctName)
                arrayDict[motifChrom] = countWig.buildHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = arrayDict[motifChrom]
            count = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
            test["ct_info"]["accessibility_score"][expName] = count
            mcollection.save(test)
    return 0
Example #4
0
def updateCount(path, db, motifChrom='chr17', window=100):
    """update count data in discontinuous variableStep wiggle format

    path       -- directory searched for ``wgEncodeUwDgf*_cut<motifChrom>``
    db         -- mapping of collection names to collections; the
                  collection ``"hg19" + motifChrom`` is read and updated
    motifChrom -- chromosome suffix of the input files / collection name
    window     -- number of positions added on both sides of the motif

    Stores the count under ``dgf.<ctName>`` for every document of the
    collection.  Always returns 0.
    """
    mcollection = db["hg19" + motifChrom]
    for infile in glob.glob(os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)):
        # Cut at the LAST occurrence of the chromosome name (the glob makes
        # it the file-name suffix) so directory names cannot interfere.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ##depends on the data type and source
        expName = "dgf"  ##or add an expName parser line
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            # The raw wiggle is only opened when the cache must be built,
            # and is closed again right away.
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
        hist = None  # built lazily on the first document
        cursor = mcollection.find()
        for test in cursor:
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            if hist is None:
                hist = countWig.buildVarHist(motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            count = countWig.queryHist(xs, xvals, sums, motifStart - window,
                                       motifEnd + window, varWindow=True)[0]
            mcollection.update({"_id": test["_id"]},
                               {"$set": {expName + "." + ctName: count}},
                               upsert=True)
    return 0
Example #5
0
def updateFOS(path, tfName, motifChrom, method="NSD", flankWin=35):
    """calculate fos from discontinuous variableStep wiggle files
    with two method options: NSD/Binomial test

    path       -- directory searched for ``*.wig`` files; the matching
                  ``<name><motifChrom>.bw`` file is what is actually read
    tfName     -- ``tf_name`` to select from the module-level ``mcollection``
    motifChrom -- chromosome to process
    method     -- "NSD" or "Binom"
    flankWin   -- flank width added on each side of the motif

    Prints a tab-separated line (chrom, start, end, fos) for every motif
    with fos > 0.95 and more than 18 counts outside the motif.  Raises
    ValueError for an unsupported method.  Always returns 0.
    """
    if method not in ("NSD", "Binom"):
        # An unknown method used to leave ``fos`` unbound inside the loop.
        raise ValueError("unsupported method %r (expected 'NSD' or 'Binom')" % (method,))

    for infile in glob.glob(os.path.join(path, "*.wig")):
        wigfilename, _ext = os.path.splitext(infile)
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        coordDict, valuesDict = countWig.getBinVarCoord(wigfilename + motifChrom + '.bw', ctName)
        hist = None  # built lazily on the first motif
        cursor = mcollection.find({"tf_name": tfName,
                                   "motif_score": {"$lt": 1e-4},
                                   "motif_genomic_regions_info.chr": motifChrom})
        for test in cursor:
            if hist is None:
                hist = countWig.buildVarHist(motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            motifStart = test["motif_genomic_regions_info"]["start"]
            motifEnd = test["motif_genomic_regions_info"]["end"]
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd)[2]
            countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR)[2]
            countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(xs, xvals, sums, motifStart - 100, motifEnd + 100)[2]
            if method == "NSD":
                # The original referenced an undefined ``countTot`` here
                # (NameError on the default method); the +/-100 window total
                # ``count`` is used instead.
                # NOTE(review): truncating division under Python 2 if
                # queryHist returns ints -- confirm.
                try:
                    fos = np.sqrt((count - countCent) / count) - np.sqrt(countCent / count)
                except ZeroDivisionError:
                    fos = 0
            else:  # "Binom"
                try:
                    probL = float(motifEnd - motifStart) / (motifEnd - flankL)
                    probR = float(motifEnd - motifStart) / (flankR - motifStart)
                    fos = min(1 - binom.cdf(countCent, countTotL, probL),
                              1 - binom.cdf(countCent, countTotR, probR))
                except ZeroDivisionError:
                    fos = 0
            if fos > 0.95 and count - countCent > 18:
                print(motifChrom + '\t' + str(motifStart) + '\t' + str(motifEnd) + '\t' + str(fos))
    return 0
Example #6
0
def getCons(path, tfName):
    """store average conservation scores from fixedStep wiggle files

    path   -- directory searched for ``*.wigFix`` files; the file name is
              split on '.', part 0 is the chromosome and parts 1-2 joined
              by '_' form consName
    tfName -- ``tf_name`` to select from the module-level ``mcollection``

    For each matching motif on the file's chromosome the average score is
    written to ``motif_cons_info.<consName>``.  Always returns 0.
    """
    for infile in glob.glob(os.path.join(path, "*.wigFix")):
        wigfilename = os.path.split(infile)[1]
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        consName = '_'.join(nameParts[1:3])
        with open(infile, 'rt') as wigFile:
            wig = csv.reader(wigFile, delimiter='\t')
            stepDict, startDict, valuesDict = countWig.getFixStart(wig, consName)
            start = startDict[consName][chrom]
            arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)
            cursor = mcollection.find({"tf_name": tfName,
                                       "motif_genomic_regions_info.chr": chrom})
            for test in cursor:
                region = test["motif_genomic_regions_info"]
                motifStart, motifEnd = region["start"], region["end"]
                motifLen = motifEnd - motifStart + 1
                avg = 0
                # ``<=`` on the left bound so a motif that begins exactly on
                # a segment start is attributed to that segment.
                startlist = [
                    start[i] for i in range(len(start) - 1)
                    if start[i] <= motifStart < start[i + 1]
                    or start[i] <= motifEnd < start[i + 1]
                ]
                if len(startlist) > 0:
                    print(startlist)
                startlist.append(start[-1])
                for segStart in startlist:
                    if avg != 0:
                        if motifEnd > segStart:
                            # Partial overlap: combine the previous fragment
                            # and this one as a length-weighted mean.  The
                            # whole sum is divided by the motif length;
                            # before, only the second term was divided.
                            xs, xvals, sums = arrayDict[segStart]
                            segAvg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                            avg = (avg * (segStart - motifStart)
                                   + segAvg * (motifEnd - segStart + 1)) / float(motifLen)
                        else:
                            break
                    else:
                        xs, xvals, sums = arrayDict[segStart]
                        avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                if avg > 0:
                    print(" ".join(str(v) for v in (avg, motifStart, motifEnd)))
                # Dotted path: only this track's field is set, so scores of
                # other tracks stored under motif_cons_info are kept.
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"motif_cons_info." + consName: avg}},
                                   upsert=True)
    return 0
Example #7
0
def getCount(path, tfName):
    """store Dnase counts over motif regions from variableStep wiggle files

    path   -- directory searched for ``*.wig`` files
    tfName -- ``tf_name`` to select from the module-level ``mcollection``

    For every matching document the count over its motif region is stored
    in ``ct_info.accessibility_score["Dnase"]`` and the document is saved.
    Always returns 0.
    """
    for infile in glob.glob(os.path.join(path, "*.wig")):
        wigfilename = os.path.split(infile)[1]
        ##depends on the data type and source
        methodName = "Dnase"
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        # The handle stays open for the whole per-file pass (as before) but
        # is now always closed, even when a lookup fails.
        with open(infile, 'rt') as wigFile:
            wig = csv.reader(wigFile, delimiter='\t')
            coordDict, valuesDict = countWig.getCoord(wig, ctName)
            arrayDict = {}  # chromosome -> (xs, xvals, sums), built on demand
            for test in mcollection.find({"tf_name": tfName}):
                region = test["motif_genomic_regions_info"]
                motifChrom = region["chr"]
                motifStart = region["start"]
                motifEnd = region["end"]
                if motifChrom not in arrayDict:
                    arrayDict[motifChrom] = countWig.buildHist(
                        motifChrom, coordDict, valuesDict, ctName)
                xs, xvals, sums = arrayDict[motifChrom]
                count = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                test["ct_info"]["accessibility_score"][methodName] = count
                mcollection.save(test)
    return 0
Example #8
0
def _cutProfile(hist, lo, hi):
    """Return per-position counts for positions lo .. hi-1 as array('d')."""
    xs, xvals, sums = hist
    return array("d", [
        countWig.queryHist(xs, xvals, sums, i, i + 1, varWindow=True)[2]
        for i in range(lo, hi)
    ])


def _scoreFootprint(countCentL, countCentR, fragP, fragN,
                    motifStart, motifEnd, flankL, flankR):
    """Binomial footprint test with a shuffle-derived cutoff.

    countCentL / countCentR -- motif counts tested against the left / right
                               window totals
    fragP / fragN           -- per-position counts of the left window
                               (flankL..motifEnd-1) and right window
                               (motifStart..flankR-1); shuffled IN PLACE

    Returns (fos, fosCutoff, fps): the product of the two binomial CDFs,
    the 5th smallest of 500 shuffled products, and 1.0 when fos is at or
    below that cutoff, else 0.0.
    """
    motifLen = motifEnd + 1 - motifStart
    centP = motifStart - flankL  # fragP[centP:] is the motif part
    centN = motifLen             # fragN[:centN] is the motif part
    totL = sum(fragP)
    totR = sum(fragN)
    try:
        pp = binom.cdf(countCentL, totL, float(motifLen) / (motifEnd + 1 - flankL))
        pn = binom.cdf(countCentR, totR, float(motifLen) / (flankR + 1 - motifStart))
        fos = pp * pn
    except ZeroDivisionError:
        fos = 1.0

    ##fdr correction
    fosArray = array("d")
    for _ in range(500):
        random.shuffle(fragP)
        random.shuffle(fragN)
        try:
            pp = binom.cdf(sum(fragP[centP:]), sum(fragP),
                           float(motifLen) / (motifEnd + 1 - flankL))
            pn = binom.cdf(sum(fragN[:centN]), sum(fragN),
                           float(motifLen) / (flankR + 1 - motifStart))
            fosArray.append(pp * pn)
        except ZeroDivisionError:
            fosArray.append(1.0)
    fosCutoff = np.sort(fosArray)[4]
    fps = 1.0 if fos <= fosCutoff else 0.0  # 1.0: profile, 0.0: no profile
    return fos, fosCutoff, fps


def updateFPS(infile, outpath, tfname= "CTCF", motifChrom="chr15", ctName="Gm12878", dgfCutoff=36, expName="atacseq"):
    """calculate footprint scores from discontinuous variableStep wiggle files

    infile     -- wiggle file; everything before its first ".wig" is the
                  prefix used for the binary ``<prefix><motifChrom>.bw`` file
    outpath    -- directory that must contain ``bedMotifs``; ``fpsMotifs``
                  is created in it when missing
    tfname, motifChrom, ctName -- select
                  ``bedMotifs/<tfname><motifChrom>.bed.gz`` (columns 2 and 3
                  are read as motif start/end) and name the output
                  ``fpsMotifs/<tfname><ctName><motifChrom>fps.txt.gz``
    dgfCutoff  -- minimum count outside the motif (within +/-100 positions)
                  for a motif to be tested
    expName    -- "atacseq" for the non-strand-specific call; anything else
                  uses the ``<shortname>_p.wig`` / ``_n.wig`` strand files

    One tab-separated row ``count, acces, fos, fosCutoff, fps`` is written
    per motif.  Exits the process with status 1 when ``bedMotifs`` is
    missing.  Returns 0.
    """
    #check if directories exists
    motifdir = os.path.join(outpath, "bedMotifs")
    fpsdir = os.path.join(outpath, "fpsMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        sys.exit(1)  # non-zero: this is an error exit
    if not os.path.isdir(fpsdir):
        os.mkdir(fpsdir)

    print(" ".join(['updating fps for ', str(ctName), str(motifChrom)]))

    # Escaped dot: the old pattern ".wig" also matched e.g. "gwig" inside
    # a directory name such as "bigwig".
    wigfilename = re.split(r"\.wig", infile)[0]
    stranded = expName != "atacseq"

    with gzip.open(os.path.join(motifdir, tfname + motifChrom + ".bed.gz"), 'r') as gcoordsfile:
        with gzip.open(os.path.join(fpsdir, tfname + ctName + motifChrom + 'fps.txt.gz'), 'w') as fpsfile:
            gcoords = csv.reader(gcoordsfile, delimiter='\t')
            writer = csv.writer(fpsfile, delimiter='\t')

            if not stranded:
                ##non-strand-specific fps for atac-seq
                bwFile = wigfilename + motifChrom + '.bw'
                if not os.path.isfile(bwFile):
                    with open(infile, 'rt') as wigFile:
                        countWig.compressVarWig(wigFile, ctName, wigfilename)
                coordDictp, valueDictp = countWig.getBinVarCoord(bwFile, ctName)
                coordDictn = valueDictn = None
                flankScale = 1.75
            else:
                #run strand-specific calls for DNase-Seq or DGF
                wigdir, wigbase = os.path.split(wigfilename)
                shortname = os.path.join(wigdir, re.split("_", wigbase)[0])
                bwFilep = shortname + '_p' + motifChrom + '.bw'
                bwFilen = shortname + '_n' + motifChrom + '.bw'
                # Strand wiggles are only opened when a cache is missing.
                if not os.path.isfile(bwFilep):
                    with open(shortname + "_p.wig", 'rt') as wigFilep:
                        countWig.compressVarWig(wigFilep, expName, shortname + '_p')
                if not os.path.isfile(bwFilen):
                    with open(shortname + "_n.wig", 'rt') as wigFilen:
                        countWig.compressVarWig(wigFilen, expName, shortname + '_n')
                coordDictp, valueDictp = countWig.getBinVarCoord(bwFilep, ctName)
                coordDictn, valueDictn = countWig.getBinVarCoord(bwFilen, ctName)
                flankScale = 2

            histp = None  # histograms are built lazily on the first motif
            histn = None
            for test in gcoords:
                if histp is None:
                    histp = countWig.buildVarHist(motifChrom, coordDictp, valueDictp, ctName)
                if stranded and histn is None:
                    histn = countWig.buildVarHist(motifChrom, coordDictn, valueDictn, ctName)

                motifStart, motifEnd = int(test[1]), int(test[2])
                flankWin = round((motifEnd - motifStart + 1) * flankScale)
                flankL = max(0, int(motifStart - flankWin))
                flankR = int(motifEnd + flankWin)

                xsp, xvalsp, sumsp = histp
                countCentp = countWig.queryHist(xsp, xvalsp, sumsp, motifStart, motifEnd)[2]
                if stranded:
                    xsn, xvalsn, sumsn = histn
                    countCentn = countWig.queryHist(xsn, xvalsn, sumsn, motifStart, motifEnd)[2]
                    countp = countWig.queryHist(xsp, xvalsp, sumsp, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                    countn = countWig.queryHist(xsn, xvalsn, sumsn, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                    count = countp + countn - countCentp - countCentn
                    rightHist = histn
                else:
                    countCentn = countCentp
                    count = countWig.queryHist(xsp, xvalsp, sumsp, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                    count = count - countCentp
                    rightHist = histp

                if count >= dgfCutoff:
                    acces = 1.0
                    fragP = _cutProfile(histp, flankL, motifEnd)
                    fragN = _cutProfile(rightHist, motifStart, flankR)
                    fos, fosCutoff, fps = _scoreFootprint(
                        countCentp, countCentn, fragP, fragN,
                        motifStart, motifEnd, flankL, flankR)
                else:
                    # too few cuts around the motif: not testable
                    fos = 1.0 + (1 / (count + 1.0))
                    fps = -1.0
                    fosCutoff = -1.0
                    acces = 0.0
                writer.writerow([count, acces, fos, fosCutoff, fps])
    return 0
Example #9
0
def _motifConsAverage(start, arrayDict, motifStart, motifEnd):
    """Average a fixedStep track over the motif [motifStart, motifEnd].

    start     -- indexable, non-empty sequence of segment start coordinates
                 (presumably ascending; TODO confirm against countWig)
    arrayDict -- maps a segment start to (xs, xvals, sums, ll); ``ll`` is
                 used as the segment length, i.e. the segment is taken to
                 span [ss, ss+ll-1]

    Every overlapping segment contributes its countWig.queryHist average,
    weighted by the share of motif positions it covers.  Returns that
    number, 0 when no segment was selected or matched, or the string 'NA'
    when the only segments examined do not overlap the motif.
    """
    motifLen = motifEnd - motifStart + 1
    # Segments whose [start[i], start[i+1]) interval holds a motif end point.
    startlist = [
        start[i] for i in range(len(start) - 1)
        if start[i] <= motifStart < start[i + 1]
        or start[i] <= motifEnd < start[i + 1]
    ]
    if motifEnd > start[-1]:
        startlist.append(start[-1])

    avg = 0
    lastIdx = len(startlist) - 1
    for i, ss in enumerate(startlist):
        xs, xvals, sums, ll = arrayDict[ss]
        segEnd = ss + ll - 1
        if motifStart < ss <= motifEnd <= segEnd:  # left out, right in
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0]
                    * (motifEnd - ss + 1) / motifLen)
        elif ss <= motifStart < motifEnd <= segEnd:  # motif inside segment
            avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
        elif motifStart < ss and segEnd < motifEnd:  # motif spans segment
            if avg == 'NA':
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, ss, segEnd)[0]
                    * ll / motifLen)
        elif ss <= motifStart <= segEnd < motifEnd:  # left in, right out
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += (countWig.queryHist(xs, xvals, sums, motifStart, segEnd)[0]
                    * (ss + ll - motifStart) / motifLen)
        elif segEnd < motifStart:  # segment ends before the motif (gap)
            if avg == 0:
                avg = 'NA'
        elif motifEnd < ss:
            print("Error: motifStart < motifEnd < ss ")
            if avg == 0:
                avg = 'NA'
    return avg


def updateCons(inpath, tfname, motifChrom, outpath):
    """update conservation scores in gzipped fixedStep wiggle format

    inpath     -- directory holding ``<motifChrom>.<consType>.wigFix.gz``
                  (and the ``.bw`` caches written next to them)
    tfname, motifChrom -- select
                  ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz``;
                  columns 2 and 3 are read as motif start/end
    outpath    -- must contain ``bedMotifs``; ``consMotifs`` is created in
                  it when missing

    Writes ``consMotifs/<tfname><motifChrom>cons.txt.gz`` with one
    tab-separated row per motif and one column per conservation track
    (value, 0 or 'NA').  Exits the process with status 1 when
    ``bedMotifs`` is missing.  Returns 0.
    """
    #check if directories exists
    motifdir = os.path.join(outpath, "bedMotifs")
    consdir = os.path.join(outpath, "consMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        sys.exit(1)  # non-zero: this is an error exit
    if not os.path.isdir(consdir):
        os.mkdir(consdir)

    consTypes = ["phastCons100way", "phastCons46way", "phastCons46way.placental", "phastCons46way.primates",
                 "phyloP100way", "phyloP46way", "phyloP46way.placental", "phyloP46way.primate"]

    # Both gzip handles are closed even when a track fails to load.
    with gzip.open(os.path.join(motifdir, tfname + motifChrom + ".bed.gz")) as gcoordsfile:
        with gzip.open(os.path.join(consdir, tfname + motifChrom + 'cons.txt.gz'), 'w') as consfile:
            writer = csv.writer(consfile, delimiter='\t')
            columns = []  # one tuple of per-motif scores per track

            for consType in consTypes:
                infile = os.path.join(inpath, motifChrom + "." + consType + ".wigFix.gz")
                wigpath, wigfilename = os.path.split(infile)
                nameParts = wigfilename.split('.')
                chrom = nameParts[0]
                consName = '_'.join(nameParts[1:-2])

                print('updating ' + str(consName))

                # re-read the motif list from the top for every track
                gcoordsfile.seek(0)
                gcoords = csv.reader(gcoordsfile, delimiter='\t')

                bwFile = os.path.join(wigpath, motifChrom + "." + consName + '.bw')
                if not os.path.isfile(bwFile):
                    # The raw wiggle is only needed to build a missing cache.
                    with gzip.open(infile) as wigFile:
                        countWig.compressFixWig(wigFile, consName, bwFile)
                stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
                start = startDict[consName][chrom]
                arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)

                scores = [
                    _motifConsAverage(start, arrayDict, int(test[1]), int(test[2]))
                    for test in gcoords
                ]
                columns.append(tuple(scores))

            # transpose: track columns -> one row per motif
            for row in zip(*columns):
                writer.writerow(list(row))

    return 0
Example #10
0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=11, method="Binom", stranded=False):
    """calculate fos from discontinuous variableStep wiggle files
    with two method options: NSD/Binomial test

    path       -- directory holding ``wgEncodeUwDnaseGm12878Aln_cut<chrom>``
                  and, for stranded runs, the ``..._p_cut.wig`` /
                  ``..._n_cut.wig`` files
    db         -- mapping of collection names to collections; the
                  collection ``"hg19" + motifChrom`` is read and updated
    motifChrom -- chromosome suffix of the input file / collection name
    dgfCutoff  -- minimum count outside the motif required to store a score
    method     -- "NSD" or "Binom" (only "Binom" when stranded)
    stranded   -- score the two strands separately and store the product
                  under ``dnase.fosStrand`` instead of ``dnase.fos``

    Raises ValueError for a method the chosen mode does not implement.
    Always returns 0.
    """
    # Unsupported combinations used to leave ``fos`` unbound in the loop.
    if stranded:
        if method != "Binom":
            raise ValueError("stranded mode only supports method 'Binom', got %r" % (method,))
    elif method not in ("NSD", "Binom"):
        raise ValueError("unsupported method %r (expected 'NSD' or 'Binom')" % (method,))

    mcollection = db["hg19" + motifChrom]
    print('updating fos ' + str(motifChrom))
    expName = "fos"
    for infile in glob.glob(os.path.join(path, "wgEncodeUwDnaseGm12878Aln_cut" + motifChrom)):
        # Literal cut at the LAST occurrence of the chromosome name (the
        # glob makes it the file-name suffix); no regex interpretation.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        if not stranded:
            bwFile = wigfilename + motifChrom + '.bw'
            if not os.path.isfile(bwFile):
                with open(infile, 'rt') as wigFile:
                    countWig.compressVarWig(wigFile, expName, wigfilename)
            coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
            hist = None  # built lazily on the first motif
            cursor = mcollection.find()
            for test in cursor:
                if hist is None:
                    hist = countWig.buildVarHist(motifChrom, coordDict, valuesDict, ctName)
                xs, xvals, sums = hist
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                count = countWig.queryHist(xs, xvals, sums, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                if method == "NSD":
                    # NOTE(review): truncating division under Python 2 if
                    # queryHist returns ints -- confirm.
                    try:
                        fos = np.sqrt((count - countCent) / count) - np.sqrt(countCent / count)
                    except ZeroDivisionError:
                        fos = 0
                else:  # "Binom"
                    try:
                        probL = float(motifEnd - motifStart) / (motifEnd - flankL)
                        probR = float(motifEnd - motifStart) / (flankR - motifStart)
                        fos = min(1 - binom.cdf(countCent, countTotL, probL),
                                  1 - binom.cdf(countCent, countTotR, probR))
                    except ZeroDivisionError:
                        fos = 0
                if fos > 0.95 and count - countCent > dgfCutoff:
                    mcollection.update({"_id": test["_id"]},
                                       {"$set": {"dnase.fos": fos}},
                                       upsert=True)
        else:
            infilep = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_p_cut.wig")
            infilen = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_n_cut.wig")
            bwFilep = wigfilename + '_p_cut' + motifChrom + '.bw'
            bwFilen = wigfilename + '_n_cut' + motifChrom + '.bw'
            # Strand wiggles are only opened when their cache is missing.
            if not os.path.isfile(bwFilep):
                with open(infilep, 'rt') as wigFilep:
                    countWig.compressVarWig(wigFilep, expName, wigfilename + '_p_cut')
            if not os.path.isfile(bwFilen):
                with open(infilen, 'rt') as wigFilen:
                    countWig.compressVarWig(wigFilen, expName, wigfilename + '_n_cut')
            coordDictp, valueDictp = countWig.getBinVarCoord(bwFilep, ctName)
            coordDictn, valueDictn = countWig.getBinVarCoord(bwFilen, ctName)
            histp = None  # built lazily on the first motif
            histn = None
            cursor = mcollection.find()
            for test in cursor:
                if histp is None:
                    histp = countWig.buildVarHist(motifChrom, coordDictp, valueDictp, ctName)
                if histn is None:
                    histn = countWig.buildVarHist(motifChrom, coordDictn, valueDictn, ctName)
                motifStart, motifEnd = test["genomic_region"]["start"], test["genomic_region"]["end"]
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                # plus strand: left flank + motif
                xs, xvals, sums = histp
                countTotLp = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countCentp = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                countp = countWig.queryHist(xs, xvals, sums, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                # minus strand: motif + right flank
                xs, xvals, sums = histn
                countTotRn = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCentn = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                countn = countWig.queryHist(xs, xvals, sums, motifStart - 100, motifEnd + 100, varWindow=True)[2]
                try:
                    pp = 1 - binom.cdf(countCentp, countTotLp, float(motifEnd - motifStart) / (motifEnd - flankL))
                    pn = 1 - binom.cdf(countCentn, countTotRn, float(motifEnd - motifStart) / (flankR - motifStart))
                    fos = pp * pn
                except ZeroDivisionError:
                    fos = 0
                if fos > 0.95 and countp + countn - countCentp - countCentn > dgfCutoff:
                    mcollection.update({"_id": test["_id"]},
                                       {"$set": {"dnase.fosStrand": fos}},
                                       upsert=True)
                    print(motifChrom + '\t' + str(motifStart) + '\t' + str(motifEnd) + '\t' + str(fos))
    return 0