def run(directory, bidirectionalfilepath, homedir):
    """Write bidirectional/ChIP/motif overlap results for every FIMO directory.

    Each directory returned by Functions.fimo_directories(directory) is
    passed to Functions.venn_d3 together with the bidirectional hits file and
    the ConsolidatedPeaks.merge.bed file located two directory levels above
    it. Results are grouped by TF and written to BidirChIpMotifOverlaps.txt.

    Args:
        directory: Root directory handed to Functions.fimo_directories.
        bidirectionalfilepath: Path to the bidirectional hits bed file.
        homedir: Directory used to locate the output folder; the report is
            written to <homedir>/../files.

    Side effects:
        Changes the working directory to <homedir>/../files and leaves it
        there.
    """
    counts = dict()
    for fimo in Functions.fimo_directories(directory):
        #TF name and FIMO folder name are read from fixed positions in the path
        pathparts = fimo.split('/')
        TF = pathparts[5]
        fimoname = pathparts[9]
        fimofile = fimo + "/fimo.cut.rmdup.ord.merge.bed"
        chipfile = Functions.parent_dir(Functions.parent_dir(fimo)) + '/ConsolidatedPeaks.merge.bed'
        vennlist = Functions.venn_d3(bidirectionalfilepath, chipfile, fimofile)
        counts.setdefault(TF, []).append([fimoname, vennlist])

    os.chdir(homedir)
    os.chdir('..')
    os.chdir("./files")
    #The with-block guarantees the report is flushed and closed, even on error
    with open("BidirChIpMotifOverlaps.txt", 'w') as outfile:
        outfile.write("TF\nMotif#\nBidir\tChip\tMotif\tBC\tCB\tBM\tMB\tCM\tMC\tBCM\tCBM\tMBC\n")
        for key in counts:
            #One line with the TF name ...
            outfile.write(key)
            outfile.write("\n")
            for item in counts[key]:
                #... then, per FIMO folder, its name and a tab-separated value line
                outfile.write(item[0])
                outfile.write("\n")
                #NOTE(review): write() needs strings, so venn_d3 presumably
                #returns string entries - confirm
                for value in item[1]:
                    outfile.write(value)
                    outfile.write("\t")
                outfile.write("\n")
def run():
    """Run every pipeline stage in order, printing a progress message per stage.

    Stages: optional cleaning, peak consolidation, bed-to-fasta conversion,
    MEME/FIMO/TOMTOM, FIMO output fix-up, motif distance calculation and
    overlap counting. Input files are looked up in the "files" folder next to
    the parent of this script's directory.

    NOTE(review): the original indentation was lost; it is assumed that only
    the cleaning step depends on ``boolean`` - confirm.
    NOTE(review): ``boolean``, ``directory`` and ``referencefilepath`` are not
    assigned in this function, so they must exist at module level - confirm.
    """
    #Directory that contains this script
    script_path = os.path.realpath(__file__)
    homedir = os.path.dirname(script_path)

    #NOTE(review): the reference genome path was originally set here as
    #Functions.parent_dir(homedir) + '/files/hg19_whole_genome.fa', but that
    #assignment is commented out while referencefilepath is still used below

    #Bidirectional hits file (must be in files folder)
    bidirectionalfilepath = Functions.parent_dir(homedir) + '/files/bidirectional_hits.merge.bed'

    #Motif database for tomtom (must be in files folder)
    tomtomdir = Functions.parent_dir(homedir) + '/files/HOCOMOCOv9_AD_MEME.txt'

    if boolean == True:
        print("Cleaning directory...")
        #Removes all files and folders in given directory/TF/peak_files
        cl.run(directory)

    print("running main\npreparing files for MEME...")
    #Bedtools intersect on all *.bed* files, then bedtools merge so that
    #intervals do not overlap
    rc.run(directory)
    #ConsolidatedPeak.merge.bed -> ConsolidatedPeak.merge.fasta
    b2f.run(directory, referencefilepath)

    print("done\nrunning MEME...")
    #MEME, FIMO and TOMTOM on all ConsolidatedPeak.merge.fasta
    meme.run(directory, 10000000, 10000000, tomtomdir)

    print("done\nfixing FIMO files...")
    #Removes duplicates, orders, and drops the first column of FIMO output files
    ff.run(directory)

    print("done\ngetting motif distances to i...")
    #Motif distance to bidir center for each motif of each TF
    dist.run(directory, bidirectionalfilepath, homedir)

    print("done\ngenerating overlap numbers...")
    #Site overlap between bidir, ChIP, and FIMO sites
    so.run(directory, bidirectionalfilepath, homedir)

    print("done")
for directory in Functions.TFIT_fimo_directories(TFITDir): for bidirfile in os.listdir(directory): if 'GSM' not in bidirfile and 'SRR' in bidirfile: TFITDict[bidirfile] = list() file1 = open(directory + '/' + bidirfile) file1.readline() for line in file1: if len(line.strip().split()[0:2]) == 2: TF,pval = line.strip().split()[0:2] TFITDict[bidirfile].append((TF,pval)) return TFITDict if __name__ == "__main__": #Specify directory TFITDir = '/scratch/Users/joru1876/TFIT' TFITDict = run(TFITDir) os.chdir(Functions.parent_dir(TFITDir)) outfile = open('ExpTFMatrixMasterFile.txt','w') for exp in TFITDict: outfile.write(exp) outfile.write('\t') for val in TFITDict[exp]: outfile.write(val[0]) outfile.write(',') outfile.write(val[1]) outfile.write(',') outfile.write('\n')
TFlist = TF.split('/') FIMOTFDict[TFlist[len(TFlist)-1][0:TFlist[len(TFlist)-1].index('_')]] = TF ChipDirList = Functions.chip_bedgraph_directories(ChipDir) for directory in ChipDirList: directorylist = directory.split('/') TF = directorylist[len(directorylist)-2] if TF in FIMOTFDict: print TF ChipFile = directory + '/' + [i for i in os.listdir(directory) if 'ENC' in i][0] print ChipFile,BidirFile print FIMOTFDict[TF] + '/fimo.txt' BackgroundDict, FnoBDict, FandBDict = run(BidirFile,ChipFile,FIMOTFDict[TF] + '/fimo.txt') #if not os.path.exists(Functions.parent_dir(directory) + '/ChIPMotifValidator_out'): os.mkdir(Functions.parent_dir(directory) + '/ChIPMotifValidator_out') outfile1 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/'+'Background100.txt','w') for chrom in BackgroundDict: outfile1.write(chrom) outfile1.write('\t') outfile1.write(BackgroundDict[chrom]) outfile1.write('\n') outfile2 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/' + 'FnoB100.txt','w') for chrom in FnoBDict: outfile2.write(chrom) outfile2.write('\t') outfile2.write(FnoBDict[chrom]) outfile2.write('\n') outfile3 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/' + 'FandB100.txt','w') for chrom in FandBDict: outfile3.write(chrom)
#Map TF name -> FIMO directory; the name is the last path component up to its first '_'
FIMOTFDict = dict()
for TF in Functions.HOCOMOCO_fimo_directories(FimoDir):
    TFlist = TF.split('/')
    fimofolder = TFlist[len(TFlist)-1]
    FIMOTFDict[fimofolder[0:fimofolder.index('_')]] = TF
ChipDirList = Functions.chip_peak_directories(ChipDir)
for directory in ChipDirList:
    directorylist = directory.split('/')
    #TF name is the second-to-last path component
    TF = directorylist[len(directorylist)-2]
    if TF in FIMOTFDict:
        print(TF)
        #First entry in the directory whose name contains 'ENC'
        ChipFile = directory + '/' + [i for i in os.listdir(directory) if 'ENC' in i][0]
        BackgroundDict, FnoBDict, FandBDict = run(BidirFile,ChipFile,FIMOTFDict[TF] + '/fimo.txt')
        #Reports go to a folder next to the peak directory; created on first use
        validator_dir = Functions.parent_dir(directory) + '/ChIPPeakMotifValidator_out'
        if not os.path.exists(validator_dir):
            os.mkdir(validator_dir)
        os.chdir(validator_dir)
        #with-blocks close each report instead of leaking handles per directory
        with open('Background.txt','w') as outfile1:
            for chrom in BackgroundDict:
                outfile1.write(chrom)
                outfile1.write('\t')
                outfile1.write(BackgroundDict[chrom])
                outfile1.write('\n')
        with open('FnoB.txt','w') as outfile2:
            for chrom in FnoBDict:
                outfile2.write(chrom)
                outfile2.write('\t')
                outfile2.write(FnoBDict[chrom])
                outfile2.write('\n')
        #Left as a plain open(): the writes to this file continue past this excerpt
        outfile3 = open('FandB.txt','w')
for TF in os.listdir(rootdirectory): if os.path.exists(rootdirectory + "/" + TF + "/peak_files/MEME"): os.chdir(rootdirectory + "/" + TF + "/peak_files/MEME") FileList = [item for item in os.listdir(os.getcwd()) if 'fimo_out' in item] for fimofolder in FileList: directorylist.append(rootdirectory + "/" + TF + "/peak_files/MEME/" + fimofolder) return directorylist directory = '/projects/dowellLab/ENCODE/HCT116' bidirectionalfilepath = '/Users/joru1876/ENCODEBidirectional/bidirectional_hits.merge.bed' homedir = '/Users/joru1876/test' fimodirectorylist = fimo_directories(directory) counts = dict() for fimo in fimodirectorylist: if os.path.exists(Functions.parent_dir(Functions.parent_dir(fimo)) + '/Consolidatedpeaks.ord.merge.bed'): TF = fimo.split('/')[5] if TF not in counts: counts[TF] = [] fimoname = fimo.split('/')[8] fimofile = fimo + "/fimo.rmdup.ord.cut.merge.bed" chipfile = Functions.parent_dir(Functions.parent_dir(fimo)) + '/Consolidatedpeaks.ord.merge.bed' vennlist = Functions.venn_d3(bidirectionalfilepath, False, chipfile, False, fimofile, True) counts[TF].append([fimoname, vennlist]) os.chdir(homedir) outfile = open("BidirChIpMotifOverlaps.txt",'w') outfile.write("TF\nMotif#\nBidir\tChip\tMotif\tBC\tCB\tBM\tMB\tCM\tMC\tBCM\tCBM\tMBC\n") for key in counts: outfile.write(key) outfile.write("\n")
fimodir = '/Users/joru1876/HOCOMOCODatabaseFIMO/FIMO_OUT'
bidirDir = '/projects/dowellLab/TFIT'
for exp in os.listdir(bidirDir):
    print(exp)
    if exp != 'genome_files':
        expdir = bidirDir + '/' + exp
        if os.path.exists(expdir + '/EMG_out_files'):
            bidirfileDir = expdir + '/EMG_out_files'
            bidirfiles = [bidirfileDir + '/' + bidir for bidir in os.listdir(bidirfileDir) if 'bidirectional_hits' in bidir]
        elif os.path.isdir(expdir):
            #Fixed: this branch used to list bidirfileDir, which is unset or
            #left over from an earlier experiment here, and built paths
            #without the experiment folder; it now lists the experiment
            #directory itself
            bidirfiles = [expdir + '/' + bidir for bidir in os.listdir(expdir) if 'bidirectional_hits' in bidir]
        else:
            #Plain files directly under bidirDir have nothing to process
            bidirfiles = []
        for bidirfile in bidirfiles:
            print(bidirfile)
            #Results go to <experiment>/FIMO_OUT in both layouts
            if 'EMG_out_files' in bidirfile:
                outfiledir = Functions.parent_dir(Functions.parent_dir(bidirfile))
            else:
                outfiledir = Functions.parent_dir(bidirfile)
            if not os.path.exists(outfiledir + '/FIMO_OUT'):
                os.mkdir(outfiledir + '/FIMO_OUT')
            distances = run(bidirfile, fimodir)
            #Sort the (key, value) pairs by value
            sorted_distances = sorted(distances.items(), key=itemgetter(1))
            #Fixed: the file name was taken from path component 6, which only
            #exists for files inside EMG_out_files; basename works at any depth
            bidirname = os.path.basename(bidirfile)
            #Left as a plain open(): the remaining writes continue past this excerpt
            outfile = open(outfiledir + '/FIMO_OUT/' + bidirname[0:bidirname.index('.')] + '.txt', 'w')
            outfile.write("TF\tUniform p-val\tCentered(0) p-val\tBimodality (1=True)\tDistance List")
            outfile.write("\n")
            for item in sorted_distances:
                outfile.write(str(item[0]))
                outfile.write("\t")
                outfile.write(str(item[1][0]))
return bidirdict if __name__ == "__main__": ##Returns fasta file with sequences within windowsize of i #Specify windowsize: windowsize = 6 #Specify TFIT directory TFIT = '/scratch/Shares/dowell/TFIT' #Specify reference fasta file referencefilepath = '/scratch/Shares/dowell/pubgro/genomefiles/human/hg19/hg19ucsc/hg19_all.fa' for directory in Functions.TFIT_EMG_OUT_directories(TFIT): for bidirfile in os.listdir(directory): if 'bed' in bidirfile: print bidirfile bidirdict = run(directory + '/' + bidirfile, windowsize) if not os.path.exists(Functions.parent_dir(directory) + '/WindowSeq_out'): os.mkdir(Functions.parent_dir(directory) + '/WindowSeq_out') os.chdir(Functions.parent_dir(directory) + '/WindowSeq_out') outfile = open(bidirfile[0:bidirfile.index('.')] + '.wseq.bed','w') for chrom in bidirdict: for tup in bidirdict[chrom]: start, stop = tup outfile.write(chrom) outfile.write('\t') outfile.write(str(start)) outfile.write('\t') outfile.write(str(stop)) outfile.write('\n') os.system("bedtools getfasta -fi " + referencefilepath + " -bed " + bidirfile[0:bidirfile.index('.')] + ".wseq.bed -fo " + bidirfile[0:bidirfile.index('.')] + '.wseq.fasta')