def run(directory, bidirectionalfilepath, homedir):
    """Tabulate bidirectional / ChIP / motif site overlaps for every FIMO run.

    For each FIMO output directory returned by
    ``Functions.fimo_directories(directory)`` the overlap values produced by
    ``Functions.venn_d3`` are grouped by transcription factor and written to
    ``BidirChIpMotifOverlaps.txt``.

    Args:
        directory: Root directory handed to ``Functions.fimo_directories``.
        bidirectionalfilepath: Path to the bidirectional-hits BED file.
        homedir: Directory whose sibling ``files`` folder receives the report.

    Returns:
        None. The result is the report file.

    Side effects:
        Changes the process working directory to ``<homedir>/../files`` and
        leaves it there (unchanged from the previous behaviour).
    """
    counts = dict()
    for fimo in Functions.fimo_directories(directory):
        # NOTE(review): the TF name and the motif name are read from fixed
        # positions (5 and 9) of the '/'-separated path, so this only works
        # for one specific directory depth -- confirm against the caller.
        parts = fimo.split('/')
        TF = parts[5]
        fimoname = parts[9]
        fimofile = fimo + "/fimo.cut.rmdup.ord.merge.bed"
        # The consolidated ChIP peaks live two levels above the FIMO folder.
        chipfile = Functions.parent_dir(Functions.parent_dir(fimo)) + '/ConsolidatedPeaks.merge.bed'
        vennlist = Functions.venn_d3(bidirectionalfilepath, chipfile, fimofile)
        counts.setdefault(TF, []).append([fimoname, vennlist])

    os.chdir(homedir)
    os.chdir('..')
    os.chdir("./files")
    # 'with' guarantees the report is flushed and closed, even on error;
    # previously the handle was never closed.
    with open("BidirChIpMotifOverlaps.txt", 'w') as outfile:
        outfile.write("TF\nMotif#\nBidir\tChip\tMotif\tBC\tCB\tBM\tMB\tCM\tMC\tBCM\tCBM\tMBC\n")
        for key in counts:
            outfile.write(key)
            outfile.write("\n")
            for fimoname, vennlist in counts[key]:
                outfile.write(fimoname)
                outfile.write("\n")
                # str() leaves string values untouched and keeps numeric
                # overlap counts from raising TypeError in write().
                # Every value is followed by a tab, as before.
                outfile.write("".join(str(value) + "\t" for value in vennlist))
                outfile.write("\n")
예제 #2
0
def run():
    # Runs a MEME/FIMO processing pipeline step by step, then collects
    # (TF, p-value) pairs from TFIT result files into TFITDict and returns it.
    #
    # NOTE(review): this body looks like two unrelated snippets joined
    # together. Several names it reads are never bound inside the function
    # (boolean, referencefilepath, TFITDict, TFITDir) and must come from module
    # scope if this is to run at all. `directory` is also the target of the
    # `for` loop further down, which makes it a local variable for the whole
    # function -- the earlier reads of `directory` would therefore raise
    # UnboundLocalError. Confirm against the original sources.
    #Set home directory
    homedir = os.path.dirname(os.path.realpath(__file__))
    #Get full path to reference genome file (must be in files folder)
    #referencefilepath = Functions.parent_dir(homedir) + '/files/hg19_whole_genome.fa'
    #Get full path to bidirectional hits file (must be in files folder)
    bidirectionalfilepath = Functions.parent_dir(homedir) + '/files/bidirectional_hits.merge.bed'
    #Get full path to motif database for tomtom (must be in files folder)
    tomtomdir = Functions.parent_dir(homedir) + '/files/HOCOMOCOv9_AD_MEME.txt'
    # NOTE(review): `boolean` is not defined in this function.
    if boolean == True:
        print "Cleaning directory..."
        #Deletes all files and folders in given directory/TF/peak_files
        cl.run(directory)
    print "running main\npreparing files for MEME..."
    #Bedtools intersect on all *.bed* files , then bedtools merge to ensure non-overlapping intervals
    rc.run(directory)
    #Converts ConsolidatedPeak.merge.bed to ConsolidatedPeak.merge.fasta
    # NOTE(review): the only assignment of referencefilepath in this function
    # is the commented-out line near the top.
    b2f.run(directory, referencefilepath)
    print "done\nrunning MEME..."
    #Runs MEME, FIMO, and TOMTOM on all ConsolidatedPeak.merge.fasta
    meme.run(directory, 10000000, 10000000, tomtomdir)
    print "done\nfixing FIMO files..."
    #Removes duplicates, orders, and eliminates first column of FIMO output files
    ff.run(directory)
    print "done\ngetting motif distances to i..."
    #Calculates motif distance to bidir center for each motif of each TF
    dist.run(directory, bidirectionalfilepath, homedir)
    print "done\ngenerating overlap numbers..."
    #Determines site overlap between bidir, ChIP, and FIMO sites
    so.run(directory, bidirectionalfilepath, homedir)
    print "done"
    
    # Collect one list of (TF, pval) tuples per result file, keyed by file name.
    for directory in Functions.TFIT_fimo_directories(TFITDir):
        for bidirfile in os.listdir(directory):
            # Only files whose name contains 'SRR' but not 'GSM' are read.
            if 'GSM' not in bidirfile and 'SRR' in bidirfile:
                TFITDict[bidirfile] = list()
                # NOTE(review): this handle is never closed explicitly.
                file1 = open(directory + '/' + bidirfile)
                # Skip the first line (presumably a header -- confirm).
                file1.readline()
                for line in file1:
                    # Keep only lines with at least two whitespace-separated fields.
                    if len(line.strip().split()[0:2]) == 2:
                        TF,pval = line.strip().split()[0:2]
                        TFITDict[bidirfile].append((TF,pval))
        
    
    return TFITDict
    
if __name__ == "__main__":
    # Script entry point: writes one line per experiment to
    # ExpTFMatrixMasterFile.txt in the parent directory of TFITDir.
    # Line format: <experiment>\t<TF>,<pval>,<TF>,<pval>,...,\n
    # (every value, including the last, is followed by a comma).
    #Specify directory
    TFITDir = '/scratch/Users/joru1876/TFIT'
    
    # NOTE(review): run is called with one argument here; the run() definition
    # directly above this block takes none -- confirm which run this script
    # was written against.
    TFITDict = run(TFITDir)
    
    os.chdir(Functions.parent_dir(TFITDir))
    # NOTE(review): outfile is never closed explicitly.
    outfile = open('ExpTFMatrixMasterFile.txt','w')
    for exp in TFITDict:
        outfile.write(exp)
        outfile.write('\t')
        for val in TFITDict[exp]:
            outfile.write(val[0])
            outfile.write(',')
            outfile.write(val[1])
            outfile.write(',')
        outfile.write('\n')
        # NOTE(review): the next two lines read TF and FIMOTFDict, neither of
        # which is assigned anywhere earlier in this block; they look like the
        # body of a different loop pasted here. The key they build is the last
        # '/'-separated component of TF, cut at its first underscore.
        TFlist = TF.split('/')
        FIMOTFDict[TFlist[len(TFlist)-1][0:TFlist[len(TFlist)-1].index('_')]] = TF
    
    # For every ChIP bedgraph directory whose TF has an entry in FIMOTFDict,
    # call run(...) and dump the three returned dictionaries into
    # <parent of directory>/ChIPMotifValidator_out/.
    # NOTE(review): ChipDir and BidirFile are not assigned anywhere in view,
    # and no run definition in view takes three arguments and returns a
    # 3-tuple -- presumably this belongs to a different module; confirm.
    ChipDirList = Functions.chip_bedgraph_directories(ChipDir)
    for directory in ChipDirList:
        directorylist = directory.split('/')
        # TF name is the second-to-last component of the directory path.
        TF = directorylist[len(directorylist)-2]
        if TF in FIMOTFDict:
	    print TF
            # First directory entry whose name contains 'ENC'; raises
            # IndexError when the directory has no such entry.
            ChipFile = directory + '/' + [i for i in os.listdir(directory) if 'ENC' in i][0]
            print ChipFile,BidirFile
            print FIMOTFDict[TF] + '/fimo.txt'
            BackgroundDict, FnoBDict, FandBDict = run(BidirFile,ChipFile,FIMOTFDict[TF] + '/fimo.txt')
            
            # NOTE(review): with the existence check commented out, os.mkdir
            # raises OSError when the output directory already exists.
            #if not os.path.exists(Functions.parent_dir(directory) + '/ChIPMotifValidator_out'):
            os.mkdir(Functions.parent_dir(directory) + '/ChIPMotifValidator_out')
            # One "<key>\t<value>\n" line per dictionary entry.
            # NOTE(review): none of the three output files is closed explicitly.
            outfile1 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/'+'Background100.txt','w')
            for chrom in BackgroundDict:
                outfile1.write(chrom)
                outfile1.write('\t')
                outfile1.write(BackgroundDict[chrom])
                outfile1.write('\n')
            outfile2 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/' + 'FnoB100.txt','w')
            for chrom in FnoBDict:
                outfile2.write(chrom)
                outfile2.write('\t')
                outfile2.write(FnoBDict[chrom])
                outfile2.write('\n')
            outfile3 = open(Functions.parent_dir(directory) + '/ChIPMotifValidator_out/' + 'FandB100.txt','w')
            # NOTE(review): unlike the two loops above, this one writes only
            # the key -- no tab, value or newline. It looks truncated.
            for chrom in FandBDict:
                outfile3.write(chrom)
    
    # Build a TF-name -> FIMO directory map, then for every ChIP peak
    # directory whose TF is in that map call run(...) and write the returned
    # dictionaries into <parent of directory>/ChIPPeakMotifValidator_out/.
    # NOTE(review): FimoDir, ChipDir and BidirFile are not assigned anywhere
    # in view, and the three-argument run used here is not visible either.
    FIMOTFDict = dict()
    for TF in Functions.HOCOMOCO_fimo_directories(FimoDir):
        TFlist = TF.split('/')
        # Key: last path component up to (not including) its first underscore.
        FIMOTFDict[TFlist[len(TFlist)-1][0:TFlist[len(TFlist)-1].index('_')]] = TF
    
    ChipDirList = Functions.chip_peak_directories(ChipDir)
    for directory in ChipDirList:
        directorylist = directory.split('/')
        # TF name is the second-to-last component of the directory path.
        TF = directorylist[len(directorylist)-2]
        if TF in FIMOTFDict:
	    print TF
            # First directory entry whose name contains 'ENC'; raises
            # IndexError when the directory has no such entry.
            ChipFile = directory + '/' + [i for i in os.listdir(directory) if 'ENC' in i][0]
            BackgroundDict, FnoBDict, FandBDict = run(BidirFile,ChipFile,FIMOTFDict[TF] + '/fimo.txt')
            
            if not os.path.exists(Functions.parent_dir(directory) + '/ChIPPeakMotifValidator_out'):
                os.mkdir(Functions.parent_dir(directory) + '/ChIPPeakMotifValidator_out')
            # The working directory is switched so the files below can be
            # opened by bare name.
            os.chdir(Functions.parent_dir(directory) + '/ChIPPeakMotifValidator_out')
            # One "<key>\t<value>\n" line per dictionary entry.
            # NOTE(review): none of the output files is closed explicitly.
            outfile1 = open('Background.txt','w')
            for chrom in BackgroundDict:
                outfile1.write(chrom)
                outfile1.write('\t')
                outfile1.write(BackgroundDict[chrom])
                outfile1.write('\n')
            outfile2 = open('FnoB.txt','w')
            for chrom in FnoBDict:
                outfile2.write(chrom)
                outfile2.write('\t')
                outfile2.write(FnoBDict[chrom])
                outfile2.write('\n')
            # NOTE(review): FandB.txt is opened (and so created or emptied)
            # but nothing is written to it in the visible code -- the snippet
            # appears to be cut off here.
            outfile3 = open('FandB.txt','w')
     for TF in os.listdir(rootdirectory):
         if os.path.exists(rootdirectory + "/" + TF + "/peak_files/MEME"):
             os.chdir(rootdirectory + "/" + TF + "/peak_files/MEME")
             FileList = [item for item in os.listdir(os.getcwd()) if 'fimo_out' in item]
             for fimofolder in FileList:
                 directorylist.append(rootdirectory + "/" + TF + "/peak_files/MEME/" + fimofolder)
     
     return directorylist
 
 # Stand-alone variant of the overlap report: hard-coded paths, results
 # written to BidirChIpMotifOverlaps.txt inside homedir.
 directory = '/projects/dowellLab/ENCODE/HCT116'
 bidirectionalfilepath = '/Users/joru1876/ENCODEBidirectional/bidirectional_hits.merge.bed'
 homedir = '/Users/joru1876/test'
 fimodirectorylist = fimo_directories(directory)
 counts = dict()
 for fimo in fimodirectorylist:
     # Only FIMO folders whose grandparent directory holds the consolidated
     # peak file are processed.
     if os.path.exists(Functions.parent_dir(Functions.parent_dir(fimo)) + '/Consolidatedpeaks.ord.merge.bed'):
         # NOTE(review): TF and motif names come from fixed positions (5 and
         # 8) of the '/'-separated path, so this depends on directory depth.
         TF = fimo.split('/')[5]
         if TF not in counts:
             counts[TF] = []
         fimoname = fimo.split('/')[8]
         fimofile = fimo + "/fimo.rmdup.ord.cut.merge.bed"
         chipfile = Functions.parent_dir(Functions.parent_dir(fimo)) + '/Consolidatedpeaks.ord.merge.bed'
         # Each file argument is followed by a boolean flag; the flag's
         # meaning is defined by Functions.venn_d3 (not visible here).
         vennlist = Functions.venn_d3(bidirectionalfilepath, False, chipfile, False, fimofile, True) 
         counts[TF].append([fimoname, vennlist])
 
 os.chdir(homedir)
 # NOTE(review): outfile is never closed explicitly.
 outfile = open("BidirChIpMotifOverlaps.txt",'w')
 outfile.write("TF\nMotif#\nBidir\tChip\tMotif\tBC\tCB\tBM\tMB\tCM\tMC\tBCM\tCBM\tMBC\n")
 # NOTE(review): only the TF names are written; the per-motif rows collected
 # in counts are never emitted in the visible code, so this loop looks
 # truncated.
 for key in counts:
     outfile.write(key)
     outfile.write("\n")
 # For every experiment folder under bidirDir, find its bidirectional-hits
 # files, call run(bidirfile, fimodir) on each and write the result, sorted
 # by value, to <outfiledir>/FIMO_OUT/<name>.txt.
 fimodir = '/Users/joru1876/HOCOMOCODatabaseFIMO/FIMO_OUT'
 bidirDir = '/projects/dowellLab/TFIT'
 
 for exp in os.listdir(bidirDir):
     print exp
     if exp != 'genome_files':
         if os.path.exists(bidirDir + '/' + exp + '/EMG_out_files'):
             bidirfileDir = bidirDir + '/' + exp + '/EMG_out_files'
             bidirfiles = [bidirfileDir + '/' + bidir for bidir in os.listdir(bidirfileDir) if 'bidirectional_hits' in bidir]
         else:
             # NOTE(review): bidirfileDir is only assigned in the branch
             # above. Here it is either unset (NameError) or left over from
             # an earlier experiment, and the resulting paths are joined onto
             # bidirDir rather than that experiment's folder. Probably meant
             # to list bidirDir + '/' + exp -- confirm.
             bidirfiles = [bidirDir + '/' + bidir for bidir in os.listdir(bidirfileDir) if 'bidirectional_hits' in bidir]
 
         for bidirfile in bidirfiles:
             print bidirfile
             # Output goes next to the experiment folder: two levels up for
             # files inside EMG_out_files, otherwise one level up.
             if 'EMG_out_files' in bidirfile:
                 outfiledir = Functions.parent_dir(Functions.parent_dir(bidirfile))
             else:
                 outfiledir = Functions.parent_dir(bidirfile)
             if not os.path.exists(outfiledir + '/FIMO_OUT'):
                 os.mkdir(outfiledir + '/FIMO_OUT')
                 
             
             # The two-argument run used here is not visible in this file;
             # its result is used as a mapping and its items are sorted by
             # value.
             distances = run(bidirfile, fimodir)
             sorted_distances = sorted(distances.items(), key=itemgetter(1))
             # File name: 7th '/'-separated component of the path, cut at its
             # first '.'. NOTE(review): depends on a fixed directory depth;
             # the handle is never closed explicitly.
             outfile = open(outfiledir + '/FIMO_OUT/' + bidirfile.split('/')[6][0:bidirfile.split('/')[6].index('.')] + '.txt', 'w')
             outfile.write("TF\tUniform p-val\tCentered(0) p-val\tBimodality (1=True)\tDistance List")
             outfile.write("\n")
             # NOTE(review): only the first two header columns are written
             # per row, with no newline -- the snippet appears to be cut off.
             for item in sorted_distances:
                 outfile.write(str(item[0]))
                 outfile.write("\t")
                 outfile.write(str(item[1][0]))
    
    return bidirdict
    
if __name__ == "__main__":
    # Script entry point. For every BED file in each TFIT EMG output
    # directory: compute windowed intervals with run(), write them to
    # <name>.wseq.bed under <parent>/WindowSeq_out, then extract the matching
    # sequences into <name>.wseq.fasta with `bedtools getfasta`.
    # Imported locally so this block does not depend on the file-level imports.
    import subprocess

    ##Returns fasta file with sequences within windowsize of i
    #Specify windowsize:
    windowsize = 6
    #Specify TFIT directory
    TFIT = '/scratch/Shares/dowell/TFIT'
    #Specify reference fasta file
    referencefilepath = '/scratch/Shares/dowell/pubgro/genomefiles/human/hg19/hg19ucsc/hg19_all.fa'

    for directory in Functions.TFIT_EMG_OUT_directories(TFIT):
        for bidirfile in os.listdir(directory):
            if 'bed' not in bidirfile:
                continue
            print(bidirfile)
            bidirdict = run(directory + '/' + bidirfile, windowsize)

            outdir = Functions.parent_dir(directory) + '/WindowSeq_out'
            if not os.path.exists(outdir):
                os.mkdir(outdir)
            # Output files are addressed by bare name relative to outdir.
            os.chdir(outdir)

            # Everything before the first '.'; unlike str.index this does not
            # raise when the file name contains no dot.
            basename = bidirfile.split('.', 1)[0]
            bedname = basename + '.wseq.bed'
            fastaname = basename + '.wseq.fasta'

            # The BED file must be closed (flushed to disk) before bedtools
            # reads it; previously bedtools ran while the handle was still
            # open and could see a partial or empty file.
            with open(bedname, 'w') as outfile:
                for chrom in bidirdict:
                    for start, stop in bidirdict[chrom]:
                        outfile.write(chrom + '\t' + str(start) + '\t' + str(stop) + '\n')

            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through verbatim.
            # A missing bedtools executable now raises OSError instead of
            # being silently ignored.
            status = subprocess.call([
                "bedtools", "getfasta",
                "-fi", referencefilepath,
                "-bed", bedname,
                "-fo", fastaname,
            ])
            if status != 0:
                print("bedtools getfasta failed (exit status " + str(status) + ") for " + bedname)