Example #1
0
def updateOvlpRegions(inpath,outpath,motifChrom='chr17',window=0, stranded=False):
    """Exclude motif sites overlapping ENCODE blacklisted regions.

    Every ``*<motifChrom>.bed`` motif file found in ``outpath`` is filtered
    against every ``*Excludable.bed.gz`` region file found in ``inpath``.
    Motif rows for which no overlapping region is reported are copied,
    tab-delimited, to ``<outpath>/<expName>/<tfname><motifChrom>.bed``, where
    ``expName`` is the region file name up to its first ``.`` and ``tfname``
    is the motif file name up to ``motifChrom``.  The ``<outpath>/<expName>``
    directory is not created here and must already exist.

    Args:
        inpath: directory holding the ``*Excludable.bed.gz`` region files.
        outpath: directory holding the motif BED files; also the root of the
            filtered output.
        motifChrom: chromosome name used in the file names and passed to
            ``countBed.getMotifAnno``.
        window: subtracted from each motif start and added to each motif end
            before the lookup; also forwarded to ``countBed.getMotifAnno``.
        stranded: when true, a region only counts as an overlap if the second
            item of its value entry equals the last column of the motif row
            (the motif strand).

    Returns:
        0, always.
    """
    motifPattern = os.path.join(outpath, "*" + motifChrom + ".bed")
    exclPattern = os.path.join(inpath, "*Excludable.bed.gz")

    for infile in glob.glob(motifPattern):
        tfname = os.path.basename(infile).split(motifChrom)[0]

        for exclinfile in glob.glob(exclPattern):
            # e.g. wgEncodeDacMapabilityConsensusExcludable.bed.gz
            expName = os.path.basename(exclinfile).split('.')[0]
            outname = os.path.join(outpath+"/"+expName, tfname+motifChrom+'.bed')

            with gzip.open(exclinfile, 'rt') as bedFile:
                bed = csv.reader(bedFile, delimiter='\t')
                annoIntvlDict = countBed.getBed6Anno(bed, expName)
                intervalStartDict = countBed.sortStart(annoIntvlDict)
                intervalEndDict = countBed.sortEnd(annoIntvlDict)

                # The motif file is re-opened for every region file: a
                # csv.reader is a one-pass iterator, so sharing a single
                # reader would leave nothing to read after the first region
                # file.  The context managers also guarantee both handles
                # are closed, even on error.
                with open(infile, 'rt') as gcoordsfile, \
                        open(outname, 'wt') as exclfile:
                    gcoords = csv.reader(gcoordsfile, delimiter='\t')
                    writer = csv.writer(exclfile, delimiter='\t')
                    for test in gcoords:
                        if not test:
                            # blank line in the motif file: nothing to filter
                            continue
                        # NOTE(review): window is applied to the coordinates
                        # here and is also passed to getMotifAnno -- confirm
                        # it is not meant to be applied only once.
                        motifStart = int(test[1]) - window
                        motifEnd = int(test[2]) + window
                        regionList, valueList = countBed.getMotifAnno(
                            annoIntvlDict, intervalStartDict, intervalEndDict,
                            motifChrom, motifStart, motifEnd, window)
                        if stranded:
                            # valueList entries look like (name, strand);
                            # only same-strand regions count as overlaps.
                            motifStrand = test[-1]
                            overlaps = any(value[1] == motifStrand
                                           for value in valueList)
                        else:
                            overlaps = bool(regionList)
                        if not overlaps:
                            writer.writerow(test)
    return 0
Example #2
0
def updateChip(path,db,motifChrom="chr17",window=0):
    """Record, per ChIP region file, which stored motifs overlap a region.

    For every ``*.bed.gz`` file in ``path`` the regions are loaded through
    ``countBed.getBed4Anno`` and each motif document of the hard-coded
    transcription factors (IRF3, MAFK, NFYA, SIN3A, ZNF384) in the
    ``hg19<motifChrom>`` collection is looked up against them.  When the
    lookup returns any value, the second item of the first value is stored
    on the motif document under ``chip.<expName>``, where ``expName`` is the
    file name up to ``motifChrom``.

    Args:
        path: directory holding the ``*.bed.gz`` region files.
        db: database object indexable by collection name.
        motifChrom: chromosome name; selects the collection and is passed to
            ``countBed.getMotifAnno``.
        window: forwarded unchanged to ``countBed.getMotifAnno``.

    Returns:
        0, always.
    """
    mcollection = db["hg19" + motifChrom]
    # The query does not depend on the file being processed, so build it once.
    tfQuery = {"tf_name": {"$in": ["IRF3", "MAFK", "NFYA", "SIN3A", "ZNF384"]}}

    for infile in glob.glob(os.path.join(path, "*.bed.gz")):
        expName = os.path.basename(infile).split(motifChrom)[0]
        with gzip.open(infile, 'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed4Anno(bed, expName)
            intervalStartDict = countBed.sortStart(annoIntvlDict)
            intervalEndDict = countBed.sortEnd(annoIntvlDict)

            for test in mcollection.find(tfQuery):
                region = test["genomic_region"]
                regionList, valueList = countBed.getMotifAnno(
                    annoIntvlDict, intervalStartDict, intervalEndDict,
                    motifChrom, region["start"], region["end"], window)
                if valueList:
                    # Only the first overlapping value is kept.
                    # NOTE(review): Collection.update is deprecated in newer
                    # PyMongo releases -- switch to update_one once the
                    # installed driver version is confirmed.
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {"chip." + expName: valueList[0][1]}},
                        upsert=True)
    return 0
Example #3
0
def updateChip(path,db,motifChrom="chr17",window=0):
    """Record which stored CTCF motifs overlap the CTCF ChIP peak files.

    Looks in ``<path>/CTCF`` for files matching
    ``wgEncodeBroadHistoneGm12878CtcfStdPkForma*<motifChrom>.bed.gz``.  For
    each file the regions are loaded through ``countBed.getBed4Anno`` and
    every motif document with ``tf_name == "CTCF"`` in the
    ``hg19<motifChrom>`` collection is looked up against them.  When the
    lookup returns any value, the second item of the first value is stored
    on the motif document under ``chip.<expName>``, where ``expName`` is the
    file name up to ``motifChrom``.

    Args:
        path: directory containing one sub-directory per transcription
            factor.
        db: database object indexable by collection name.
        motifChrom: chromosome name; selects the collection and the files,
            and is passed to ``countBed.getMotifAnno``.
        window: forwarded unchanged to ``countBed.getMotifAnno``.

    Returns:
        0, always.
    """
    mcollection = db["hg19" + motifChrom]
    # Only CTCF is processed; the transcription factor list is hard-coded.
    for tf in ["CTCF"]:
        files = path+"/"+tf+"/wgEncodeBroadHistoneGm12878CtcfStdPkForma*"+motifChrom+".bed.gz"
        for infile in glob.glob(files):
            # Single-argument form prints the same text on Python 2 and 3.
            print("update %s" % infile)
            expName = os.path.basename(infile).split(motifChrom)[0]
            with gzip.open(infile, 'rt') as bedFile:
                bed = csv.reader(bedFile, delimiter='\t')
                annoIntvlDict = countBed.getBed4Anno(bed, expName)
                intervalStartDict = countBed.sortStart(annoIntvlDict)
                intervalEndDict = countBed.sortEnd(annoIntvlDict)

                for test in mcollection.find({"tf_name": tf}):
                    region = test["genomic_region"]
                    regionList, valueList = countBed.getMotifAnno(
                        annoIntvlDict, intervalStartDict, intervalEndDict,
                        motifChrom, region["start"], region["end"], window)
                    if valueList:
                        # Only the first overlapping value is kept.
                        # NOTE(review): Collection.update is deprecated in
                        # newer PyMongo releases -- switch to update_one once
                        # the installed driver version is confirmed.
                        mcollection.update(
                            {"_id": test["_id"]},
                            {"$set": {"chip." + expName: valueList[0][1]}},
                            upsert=True)
    return 0
Example #4
0
def updateOvlpRegions(path,db,motifChrom='chr17',window=0, stranded=True):
    """Store on each motif document the exon regions it overlaps.

    For every ``hg19_codingExonCanonical_*.bed.gz`` file in ``path`` the
    regions are loaded through ``countBed.getBed6Anno`` under the fixed name
    ``"exon"`` and every document of the ``hg19<motifChrom>`` collection is
    looked up against them.

    * ``stranded=True``: regions whose value entry carries the same strand
      as the motif are collected and, if there are any, stored under the
      ``exon`` field.
    * ``stranded=False``: every returned region is stored, if there are any,
      under ``map.exon``.

    Args:
        path: directory holding the exon BED files.
        db: database object indexable by collection name.
        motifChrom: chromosome name; selects the collection and is passed to
            ``countBed.getMotifAnno``.
        window: forwarded unchanged to ``countBed.getMotifAnno``.
        stranded: whether a region must match the motif strand to count.

    Returns:
        0, always.
    """
    mcollection = db["hg19" + motifChrom]
    for infile in glob.glob(os.path.join(path, "hg19_codingExonCanonical_*.bed.gz")):
        expName = "exon"
        # Single-argument form prints the same text on Python 2 and 3.
        print('updating %s' % expName)
        with gzip.open(infile, 'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed6Anno(bed, expName)
            intervalStartDict = countBed.sortStart(annoIntvlDict)
            intervalEndDict = countBed.sortEnd(annoIntvlDict)

            for test in mcollection.find():
                region = test["genomic_region"]
                regionList, valueList = countBed.getMotifAnno(
                    annoIntvlDict, intervalStartDict, intervalEndDict,
                    motifChrom, region["start"], region["end"], window)

                if stranded:
                    motifStrand = region["strand"]
                    # valueList entries look like (exon_name, strand).
                    # NOTE(review): when the two lists differ in length the
                    # region index is shifted back by one (so index 0 wraps
                    # to the last region) -- kept as-is; confirm against
                    # getMotifAnno's contract.
                    sameLength = len(regionList) == len(valueList)
                    exonList = []
                    for i, value in enumerate(valueList):
                        if value[1] == motifStrand:
                            exonList.append(
                                regionList[i] if sameLength else regionList[i - 1])
                    fieldName, hits = expName, exonList
                else:
                    fieldName, hits = "map." + expName, regionList

                if hits:
                    # NOTE(review): Collection.update is deprecated in newer
                    # PyMongo releases -- switch to update_one once the
                    # installed driver version is confirmed.
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {fieldName: hits}},
                        upsert=True)
    return 0
Example #5
0
def updateExcludedRegions(path,tfName,window):
    """Mark motifs with the ENCODE excluded regions they overlap.

    For every ``*.bed.gz`` file in ``path`` the regions are loaded through
    ``countBed.getBed6Anno`` and every motif document with
    ``tf_name == tfName`` is looked up against them.  Each visited document
    gets ``motif_mapability_info`` set to ``{"exclude": regionList}``, also
    when ``regionList`` is empty; non-empty results are additionally printed.

    ``mcollection`` is not a parameter: it must exist as a module-level name
    referring to the motif collection before this function is called.

    Args:
        path: directory holding the ``*.bed.gz`` region files.
        tfName: value of the ``tf_name`` field selecting the motifs.
        window: forwarded unchanged to ``countBed.getMotifAnno``.

    Returns:
        0, always.
    """
    for infile in glob.glob(os.path.join(path, "*.bed.gz")):
        expName = os.path.basename(infile).split('.')[0]
        with gzip.open(infile, 'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed6Anno(bed, expName)
            intervalDict = countBed.sortInterval(annoIntvlDict)

            for test in mcollection.find({"tf_name": tfName}):
                info = test["motif_genomic_regions_info"]
                motifChrom, motifStart, motifEnd = \
                    info["chr"], info["start"], info["end"]
                regionList, valueList = countBed.getMotifAnno(
                    annoIntvlDict, intervalDict,
                    motifChrom, motifStart, motifEnd, window)
                if regionList:
                    # Space-separated, identical on Python 2 and 3.
                    print("%s %s %s %s %s" % (regionList, valueList,
                                              motifChrom, motifStart, motifEnd))
                # NOTE(review): this replaces the whole motif_mapability_info
                # sub-document on every pass, so with several region files
                # only the last file's result survives -- confirm intended.
                # Collection.update is also deprecated in newer PyMongo
                # releases.
                mcollection.update(
                    {"_id": test["_id"]},
                    {"$set": {"motif_mapability_info": {"exclude": regionList}}},
                    upsert=True)
    return 0