Beispiel #1
0
def _consScoreForDbMotif(start, arrayDict, motifStart, motifEnd):
    """Combine per-fragment conservation values into one score for a motif.

    Args:
        start: sequence of fragment start coordinates for one chromosome
            (presumably ascending -- TODO confirm against countWig).
        arrayDict: maps a fragment start ``ss`` to ``(xs, xvals, sums, ll)``.
            ``ll`` is used here as the fragment length, i.e. the fragment is
            treated as covering ``ss .. ss + ll - 1``.
        motifStart: first position of the motif (inclusive).
        motifEnd: last position of the motif (inclusive).

    Returns:
        The accumulated score; ``0`` when no candidate fragment matched any
        branch; or the string ``'NA'`` when the candidate fragments examined
        did not touch the motif.
    """
    # Candidate fragments: those whose [start, next start) window contains
    # either end of the motif.
    startlist = [start[i] for i in range(len(start) - 1)
                 if start[i] <= motifStart < start[i + 1]
                 or start[i] <= motifEnd < start[i + 1]]
    # The last fragment has no successor, so it is handled separately.
    if motifEnd > start[-1]:
        startlist.append(start[-1])

    motifLen = motifEnd - motifStart + 1
    lastIdx = len(startlist) - 1
    avg = 0
    for i, ss in enumerate(startlist):
        xs, xvals, sums, ll = arrayDict[ss]
        fragEnd = ss + ll - 1
        # queryHist(...)[0] is presumably the mean score over the queried
        # interval -- TODO confirm; each partial overlap is weighted by the
        # share of the motif it covers.
        if motifStart < ss <= motifEnd <= fragEnd:
            # Motif starts left of the fragment and ends inside it.
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0] * (motifEnd - ss + 1) / motifLen
        elif ss <= motifStart < motifEnd <= fragEnd:
            # Motif lies entirely inside the fragment.
            avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
        elif motifStart < ss and fragEnd < motifEnd:
            # Fragment lies entirely inside the motif.
            if avg == 'NA':
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, ss, fragEnd)[0] * ll / motifLen
        elif ss <= motifStart <= fragEnd < motifEnd:
            # Motif starts inside the fragment and runs past its end.
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, motifStart, fragEnd)[0] * (ss + ll - motifStart) / motifLen
        elif fragEnd < motifStart:
            # Fragment ends before the motif begins: no data from it.
            if avg == 0:
                avg = 'NA'
        elif motifEnd < ss:
            print("this should not happen-- motifStart < motifEnd < ss ")
            if avg == 0:
                avg = 'NA'
    return avg


def updateCons(path, db, motifChrom='chr17'):
    """Update conservation scores from gzipped fixedStep wiggle files.

    For every ``*.wigFix.gz`` file in *path*, a score is computed for each
    document of the ``"hg19" + motifChrom`` collection and stored under
    ``cons.<consName>``. Documents whose score is ``'NA'`` are left untouched.

    Args:
        path: directory searched for ``*.wigFix.gz`` files; the ``.bw`` cache
            files are created next to them.
        db: mapping-like database handle, indexed by collection name.
        motifChrom: chromosome suffix of the collection name.

    Returns:
        Always ``0``.
    """
    mcollection = db["hg19" + motifChrom]
    for infile in glob.glob(os.path.join(path, "*.wigFix.gz")):
        wigpath, wigfilename = os.path.split(infile)
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        # Everything between the chromosome and the "wigFix.gz" suffix.
        consName = '_'.join(nameParts[1:-2])
        print('updating %s' % consName)
        with gzip.open(infile) as wigFile:
            # NOTE(review): the cache name carries no chromosome, so two wig
            # files in the same directory that differ only by chromosome
            # would share one .bw file -- confirm this cannot happen.
            bwFile = os.path.join(wigpath, consName + '.bw')
            if not os.path.isfile(bwFile):
                countWig.compressFixWig(wigFile, consName, bwFile)
            stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
            start = startDict[consName][chrom]
            arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)
            # Every document in the collection is scored against this file.
            cursor = mcollection.find()
            print(mcollection.count())
            for test in cursor:
                region = test["genomic_region"]
                avg = _consScoreForDbMotif(start, arrayDict, region["start"], region["end"])
                if avg != 'NA':
                    mcollection.update({"_id": test["_id"]},
                                       {"$set": {"cons." + consName: avg}},
                                       upsert=True)
    return 0
Beispiel #2
0
def getCons(path, tfName):
    """Store a conservation score per motif for every ``*.wigFix`` file.

    Each wig file in *path* is matched against the documents of the
    module-level ``mcollection`` whose ``tf_name`` is *tfName* and whose
    ``motif_genomic_regions_info.chr`` equals the chromosome taken from the
    file name. The score is written to ``motif_cons_info.<consName>``.

    Args:
        path: directory searched for ``*.wigFix`` files.
        tfName: transcription factor name used to filter the documents.

    Returns:
        Always ``0``.
    """
    for infile in glob.glob(os.path.join(path, "*.wigFix")):
        nameParts = os.path.basename(infile).split('.')
        chrom = nameParts[0]
        consName = '_'.join(nameParts[1:3])
        with open(infile, 'rt') as wigFile:
            wig = csv.reader(wigFile, delimiter='\t')
            stepDict, startDict, valuesDict = countWig.getFixStart(wig, consName)
            start = startDict[consName][chrom]
            arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)
            cursor = mcollection.find({"tf_name": tfName,
                                       "motif_genomic_regions_info.chr": chrom})
            for test in cursor:
                info = test["motif_genomic_regions_info"]
                motifStart, motifEnd = info["start"], info["end"]
                motifLen = motifEnd - motifStart + 1
                avg = 0
                # Fragments whose (start, next start) window strictly contains
                # either end of the motif.
                startlist = [start[i] for i in range(len(start) - 1)
                             if start[i] < motifStart < start[i + 1]
                             or start[i] < motifEnd < start[i + 1]]
                if len(startlist) > 0:
                    print(startlist)
                # The last fragment is always considered as a candidate.
                startlist.append(start[-1])
                for ss in startlist:
                    if avg != 0:
                        if motifEnd <= ss:
                            break
                        # Partial overlap: renormalise over both fragments.
                        # The whole weighted sum is divided by the motif
                        # length; previously only the second term was, so
                        # the result was not a weighted mean.
                        xs, xvals, sums = arrayDict[ss]
                        avg = (avg * (ss - motifStart)
                               + countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                               * (motifEnd - ss + 1)) / motifLen
                    else:
                        xs, xvals, sums = arrayDict[ss]
                        avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                if avg > 0:
                    print('%s %s %s' % (avg, motifStart, motifEnd))
                # A dotted path sets only this score; assigning a whole
                # {consName: avg} document to "motif_cons_info" would replace
                # the scores written for other wig files.
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"motif_cons_info." + consName: avg}},
                                   upsert=True)
    return 0
Beispiel #3
0
def _consScoreForBedMotif(start, arrayDict, motifStart, motifEnd):
    """Combine per-fragment conservation values into one score for a motif.

    Args:
        start: sequence of fragment start coordinates for one chromosome
            (presumably ascending -- TODO confirm against countWig).
        arrayDict: maps a fragment start ``ss`` to ``(xs, xvals, sums, ll)``.
            ``ll`` is used here as the fragment length, i.e. the fragment is
            treated as covering ``ss .. ss + ll - 1``.
        motifStart: first position of the motif (inclusive).
        motifEnd: last position of the motif (inclusive).

    Returns:
        The accumulated score; ``0`` when no candidate fragment matched any
        branch; or the string ``'NA'`` when the candidate fragments examined
        did not touch the motif.
    """
    # Candidate fragments: those whose [start, next start) window contains
    # either end of the motif.
    startlist = [start[i] for i in range(len(start) - 1)
                 if start[i] <= motifStart < start[i + 1]
                 or start[i] <= motifEnd < start[i + 1]]
    # The last fragment has no successor, so it is handled separately.
    if motifEnd > start[-1]:
        startlist.append(start[-1])

    motifLen = motifEnd - motifStart + 1
    lastIdx = len(startlist) - 1
    avg = 0
    for i, ss in enumerate(startlist):
        xs, xvals, sums, ll = arrayDict[ss]
        fragEnd = ss + ll - 1
        # queryHist(...)[0] is presumably the mean score over the queried
        # interval -- TODO confirm; each partial overlap is weighted by the
        # share of the motif it covers.
        if motifStart < ss <= motifEnd <= fragEnd:
            # Motif starts left of the fragment and ends inside it.
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0] * (motifEnd - ss + 1) / motifLen
        elif ss <= motifStart < motifEnd <= fragEnd:
            # Motif lies entirely inside the fragment.
            avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
        elif motifStart < ss and fragEnd < motifEnd:
            # Fragment lies entirely inside the motif.
            if avg == 'NA':
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, ss, fragEnd)[0] * ll / motifLen
        elif ss <= motifStart <= fragEnd < motifEnd:
            # Motif starts inside the fragment and runs past its end.
            if avg == 'NA' and i == lastIdx:
                avg = 0
            avg += countWig.queryHist(xs, xvals, sums, motifStart, fragEnd)[0] * (ss + ll - motifStart) / motifLen
        elif fragEnd < motifStart:
            # Fragment ends before the motif begins: no data from it.
            if avg == 0:
                avg = 'NA'
        elif motifEnd < ss:
            print("Error: motifStart < motifEnd < ss ")
            if avg == 0:
                avg = 'NA'
    return avg


def updateCons(inpath, tfname, motifChrom, outpath):
    """Write conservation scores for one TF's motifs on one chromosome.

    Reads motif coordinates from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` and, for each
    conservation track in a fixed list, the gzipped fixedStep wiggle file
    ``<inpath>/<motifChrom>.<track>.wigFix.gz``. One tab-separated row per
    motif, with one column per track, is written to
    ``<outpath>/consMotifs/<tfname><motifChrom>cons.txt.gz``.

    Args:
        inpath: directory holding the wig files; ``.bw`` cache files are
            created there as well.
        tfname: transcription factor name, used as a file-name prefix.
        motifChrom: chromosome name, used in the input and output file names.
        outpath: directory containing ``bedMotifs``; ``consMotifs`` is
            created inside it if missing.

    Returns:
        Always ``0``.

    Raises:
        SystemExit: with status 1 when ``<outpath>/bedMotifs`` is missing.
    """
    motifdir = os.path.join(outpath, "bedMotifs")
    consdir = os.path.join(outpath, "consMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
    if not os.path.isdir(consdir):
        os.mkdir(consdir)

    consTypes = ["phastCons100way", "phastCons46way", "phastCons46way.placental", "phastCons46way.primates",
                 "phyloP100way", "phyloP46way", "phyloP46way.placental", "phyloP46way.primate"]
    bedPath = os.path.join(motifdir, tfname + motifChrom + ".bed.gz")
    outPath = os.path.join(consdir, tfname + motifChrom + 'cons.txt.gz')

    # Context managers close both files even when a wig file is missing or
    # scoring raises part-way through.
    with gzip.open(bedPath) as gcoordsfile:
        with gzip.open(outPath, 'w') as consfile:
            writer = csv.writer(consfile, delimiter='\t')
            columns = []  # one tuple of scores per conservation track

            for consType in consTypes:
                infile = os.path.join(inpath, motifChrom + "." + consType + ".wigFix.gz")
                wigpath, wigfilename = os.path.split(infile)
                nameParts = wigfilename.split('.')
                chrom = nameParts[0]
                # Everything between the chromosome and "wigFix.gz".
                consName = '_'.join(nameParts[1:-2])

                print('updating %s' % consName)

                # Re-read the motif coordinates from the top for each track.
                gcoordsfile.seek(0)
                gcoords = csv.reader(gcoordsfile, delimiter='\t')

                with gzip.open(infile) as wigFile:
                    bwFile = os.path.join(wigpath, motifChrom + "." + consName + '.bw')
                    if not os.path.isfile(bwFile):
                        countWig.compressFixWig(wigFile, consName, bwFile)
                    stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
                    start = startDict[consName][chrom]
                    arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)

                    scores = []
                    for test in gcoords:
                        # Columns 1 and 2 of the bed row are used as the
                        # motif start and end.
                        scores.append(_consScoreForBedMotif(
                            start, arrayDict, int(test[1]), int(test[2])))
                    columns.append(tuple(scores))

            # Transpose: one output row per motif, one column per track.
            for row in zip(*columns):
                writer.writerow(list(row))

    return 0