def updateOvlpRegions(inpath,outpath,motifChrom='chr17',window=0, stranded=False):
    """Write motif BED files with sites overlapping excludable regions removed.

    Every ``*<motifChrom>.bed`` file found in ``outpath`` is filtered against
    every ``*Excludable.bed.gz`` file found in ``inpath``.  Rows that have no
    overlap are written, tab-delimited, to
    ``<outpath>/<expName>/<tfname><motifChrom>.bed`` where ``expName`` is the
    exclusion file name up to its first dot and ``tfname`` is the part of the
    motif file name that precedes ``motifChrom``.  The ``<outpath>/<expName>``
    directory is not created here.

    Parameters:
        inpath     -- directory holding the ``*Excludable.bed.gz`` files
        outpath    -- directory holding the motif BED files; also the root of
                      the output directories
        motifChrom -- chromosome name used in the file names and passed to
                      ``countBed.getMotifAnno``
        window     -- subtracted from the motif start, added to the motif end,
                      and also passed to ``countBed.getMotifAnno``
        stranded   -- when true, only annotations whose second value element
                      equals the last column of the motif row count as overlap

    Returns:
        0, always.
    """
    exclFiles = glob.glob(os.path.join(inpath, "*Excludable.bed.gz"))
    for infile in glob.glob(os.path.join(outpath, "*" + motifChrom + ".bed")):
        tfname = os.path.basename(infile).split(motifChrom)[0]
        with open(infile, 'rt') as gcoordsfile:
            for exclinfile in exclFiles:
                # e.g. wgEncodeDacMapabilityConsensusExcludable.bed.gz
                expName = os.path.basename(exclinfile).split('.')[0]
                with gzip.open(exclinfile, 'rt') as bedFile:
                    bed = csv.reader(bedFile, delimiter='\t')
                    annoIntvlDict = countBed.getBed6Anno(bed, expName)
                    intervalStartDict = countBed.sortStart(annoIntvlDict)
                    intervalEndDict = countBed.sortEnd(annoIntvlDict)
                # Rewind and build a fresh reader for each exclusion file:
                # a csv.reader is a one-shot iterator, so sharing one reader
                # would leave every exclusion file after the first with no
                # rows to filter.
                gcoordsfile.seek(0)
                gcoords = csv.reader(gcoordsfile, delimiter='\t')
                outname = os.path.join(outpath + "/" + expName,
                                       tfname + motifChrom + '.bed')
                with open(outname, 'wt') as exclfile:
                    writer = csv.writer(exclfile, delimiter='\t')
                    for test in gcoords:
                        motifStart = int(test[1]) - window
                        motifEnd = int(test[2]) + window
                        regionList, valueList = countBed.getMotifAnno(
                            annoIntvlDict, intervalStartDict, intervalEndDict,
                            motifChrom, motifStart, motifEnd, window)
                        if stranded:
                            # valueList entries look like (name, strand);
                            # only whether a same-strand hit exists matters.
                            motifStrand = test[-1]
                            overlaps = any(value[1] == motifStrand
                                           for value in valueList)
                        else:
                            overlaps = regionList != []
                        if not overlaps:
                            writer.writerow(test)
    return 0
def updateChip(path,db,motifChrom="chr17",window=0):
    """Store ChIP annotation values on motif documents of selected TFs.

    Each ``*.bed.gz`` file in ``path`` is loaded through
    ``countBed.getBed4Anno``; the experiment name is the part of the file
    name preceding ``motifChrom``.  For every document in the
    ``"hg19" + motifChrom`` collection whose ``tf_name`` is one of IRF3,
    MAFK, NFYA, SIN3A or ZNF384, the motif's genomic region is looked up
    with ``countBed.getMotifAnno``.  When values come back, the second
    element of the first value is stored under ``chip.<experiment name>``.

    Always returns 0.
    """
    collection = db["hg19" + motifChrom]
    for bedPath in glob.glob(os.path.join(path, "*.bed.gz")):
        expName = os.path.basename(bedPath).split(motifChrom)[0]
        with gzip.open(bedPath, 'rt') as handle:
            rows = csv.reader(handle, delimiter='\t')
            annoIntvlDict = countBed.getBed4Anno(rows, expName)
            startIndex = countBed.sortStart(annoIntvlDict)
            endIndex = countBed.sortEnd(annoIntvlDict)
        wantedTfs = {"tf_name": {"$in": ["IRF3", "MAFK", "NFYA", "SIN3A", "ZNF384"]}}
        for motif in collection.find(wantedTfs):
            region = motif["genomic_region"]
            _, values = countBed.getMotifAnno(
                annoIntvlDict, startIndex, endIndex,
                motifChrom, region["start"], region["end"], window)
            if values == []:
                # nothing overlaps this motif in the current experiment
                continue
            collection.update(
                {"_id": motif["_id"]},
                {"$set": {"chip." + expName: values[0][1]}},
                upsert=True)
    return 0
def updateChip(path,db,motifChrom="chr17",window=0):
    """Store CTCF peak annotation values on CTCF motif documents.

    For the single transcription factor "CTCF", every file matching
    ``<path>/CTCF/wgEncodeBroadHistoneGm12878CtcfStdPkForma*<motifChrom>.bed.gz``
    is loaded through ``countBed.getBed4Anno``; the experiment name is the
    part of the file name preceding ``motifChrom``.  Each document of the
    ``"hg19" + motifChrom`` collection with ``tf_name == "CTCF"`` is looked up
    with ``countBed.getMotifAnno``.  When values come back, the second
    element of the first value is stored under ``chip.<experiment name>``.

    Parameters:
        path       -- directory containing one sub-directory per TF
        db         -- mapping-like database handle indexed by collection name
        motifChrom -- chromosome name used in collection and file names
        window     -- passed through to ``countBed.getMotifAnno``

    Returns:
        0, always.
    """
    mcollection = db["hg19"+motifChrom]
    for tf in ["CTCF"]:
        files = path+"/"+tf+"/wgEncodeBroadHistoneGm12878CtcfStdPkForma*"+motifChrom+".bed.gz"
        for infile in glob.glob(files):
            # Single-argument call form prints the same text on Python 2
            # and Python 3; the bare print statement is a SyntaxError on 3.
            print("update %s" % infile)
            expName = os.path.basename(infile).split(motifChrom)[0]
            with gzip.open(infile,'rt') as bedFile:
                bed = csv.reader(bedFile, delimiter = '\t')
                annoIntvlDict = countBed.getBed4Anno(bed,expName)
                intervalStartDict = countBed.sortStart(annoIntvlDict)
                intervalEndDict = countBed.sortEnd(annoIntvlDict)
            cursor = mcollection.find({"tf_name": tf})
            for test in cursor:
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                regionList, valueList = countBed.getMotifAnno(
                    annoIntvlDict, intervalStartDict, intervalEndDict,
                    motifChrom, motifStart, motifEnd, window)
                if valueList != []:
                    mcollection.update({"_id": test["_id"]},
                                       {"$set": {"chip."+expName: valueList[0][1]}},
                                       upsert = True)
    return 0
def updateOvlpRegions(path,db,motifChrom='chr17',window=0, stranded=True):
    """Annotate motif documents with the coding exons they overlap.

    Every ``hg19_codingExonCanonical_*.bed.gz`` file in ``path`` is loaded
    through ``countBed.getBed6Anno`` under the fixed name "exon".  All
    documents of the ``"hg19" + motifChrom`` collection are then looked up
    with ``countBed.getMotifAnno``.

    * ``stranded`` true: regions whose value has a second element equal to
      the motif's ``genomic_region.strand`` are collected and, if any, stored
      under the field ``exon``.
    * ``stranded`` false: a non-empty region list is stored under
      ``map.exon``.

    Parameters:
        path       -- directory containing the exon BED files
        db         -- mapping-like database handle indexed by collection name
        motifChrom -- chromosome name used for the collection and the lookup
        window     -- passed through to ``countBed.getMotifAnno``
        stranded   -- select strand-aware matching (see above)

    Returns:
        0, always.
    """
    mcollection = db["hg19"+motifChrom]
    for infile in glob.glob(os.path.join(path,"hg19_codingExonCanonical_*.bed.gz")):
        expName = "exon"
        # Single-argument call form prints the same text on Python 2 and 3.
        print("updating %s" % expName)
        with gzip.open(infile,'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed6Anno(bed,expName)
            intervalStartDict = countBed.sortStart(annoIntvlDict)
            intervalEndDict = countBed.sortEnd(annoIntvlDict)
        cursor = mcollection.find()
        for test in cursor:
            region = test["genomic_region"]
            regionList, valueList = countBed.getMotifAnno(
                annoIntvlDict, intervalStartDict, intervalEndDict,
                motifChrom, region["start"], region["end"], window)
            if not stranded:
                if regionList != []:
                    mcollection.update({"_id": test["_id"]},
                                       {"$set": {"map."+expName: regionList}},
                                       upsert = True)
                continue
            motifStrand = region["strand"]
            exonList = []
            if valueList != []:
                # valueList entries look like (exon_name, strand).
                # NOTE(review): when the two lists differ in length the
                # previous index is used for the region; this is kept from
                # the original code -- confirm against countBed.getMotifAnno.
                sameLength = len(regionList) == len(valueList)
                for i, value in enumerate(valueList):
                    if value[1] == motifStrand:
                        if sameLength:
                            exonList.append(regionList[i])
                        else:
                            exonList.append(regionList[i-1])
            if exonList != []:
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {expName: exonList}},
                                   upsert = True)
    return 0
def updateExcludedRegions(path,tfName,window):
    """Mark motifs of one TF that overlap regions from BED files in ``path``.

    Every ``*.bed.gz`` file in ``path`` is loaded through
    ``countBed.getBed6Anno`` (experiment name = file name up to its first
    dot) and indexed with ``countBed.sortInterval``.  Each document with
    ``tf_name == tfName`` is looked up with ``countBed.getMotifAnno`` using
    the chr/start/end stored under ``motif_genomic_regions_info``.  When
    regions are returned, the document's ``motif_mapability_info`` field is
    set to ``{"exclude": regionList}`` and the hit is printed.

    NOTE(review): ``mcollection`` is not a parameter or local; it has to be
    provided by the enclosing module scope -- confirm it is defined there.
    NOTE(review): the ``$set`` targets the whole ``motif_mapability_info``
    field, so presumably any other keys stored under it are replaced, and a
    later BED file overwrites the result of an earlier one -- confirm this
    is intended.

    Parameters:
        path   -- directory containing the ``*.bed.gz`` files
        tfName -- value matched against the ``tf_name`` document field
        window -- passed through to ``countBed.getMotifAnno``

    Returns:
        0, always.
    """
    for infile in glob.glob(os.path.join(path,"*.bed.gz")):
        expName = os.path.basename(infile).split('.')[0]
        with gzip.open(infile,'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter = '\t')
            annoIntvlDict = countBed.getBed6Anno(bed,expName)
            intervalDict = countBed.sortInterval(annoIntvlDict)
        cursor = mcollection.find({"tf_name": tfName})
        for test in cursor:
            info = test["motif_genomic_regions_info"]
            motifChrom, motifStart, motifEnd = info["chr"], info["start"], info["end"]
            regionList, valueList = countBed.getMotifAnno(
                annoIntvlDict, intervalDict,
                motifChrom, motifStart, motifEnd, window)
            if regionList != []:
                # Single-argument call form prints the same text on
                # Python 2 and 3; the bare print statement fails on 3.
                print("%s %s %s %s %s" % (regionList, valueList,
                                          motifChrom, motifStart, motifEnd))
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"motif_mapability_info":{"exclude": regionList}}},
                                   upsert = True)
    return 0