def findGeneTargets(genome, regionsTn, upFlankSize, downFlankSize, galaxyFn):
        '''Print an HTML report of the Ensembl genes hit by the supplied regions.

        A GeneIntersection is built for the regions track regionsTn against the
        'Ensembl' reference, with the reference track expanded by upFlankSize and
        downFlankSize. The report is printed to standard out and links to
        run-specific files: the gene ID list, a bed-file of the intersected gene
        regions, and an HTML page that groups the intersected genes by number of
        occurrences (highest count first), each gene linking to its Ensembl
        summary page.

        Only genome 'hg18' is accepted (enforced by an assertion).
        '''
        assert genome == 'hg18'

        geneIntersection = GeneIntersection(genome, 'Ensembl', regionsTn, galaxyFn)
        geneIntersection.expandReferenceTrack(upFlankSize, downFlankSize)
        expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else ''
        # The reference name is now followed by a space ("Ensembl gene regions").
        print('<p>There are %i Ensemble genes being targets of your selected regions, based on intersecting your supplied regions with%s %s gene regions.</p>' % (geneIntersection.getNumberOfIntersectedBins(), expansionStr, 'Ensembl'))

        idFileNamer = geneIntersection.getGeneIdStaticFileWithContent()
        print(' '.join(['<p>', idFileNamer.getLink('Download list'), ' of all Ensemble IDs with 1 or more hits.</p>']))

        regFileNamer = geneIntersection.getIntersectedRegionsStaticFileWithContent()
        print(' '.join(['<p>', regFileNamer.getLink('Download bed-file'), ' of all Ensembl gene regions with 1 or more hits.</p>']))

        targetBins = geneIntersection.getIntersectedReferenceBins()
        res = geneIntersection.getIntersectionResult()
        resDictKey = geneIntersection.getUniqueResDictKey()

        # Group the bins by occurrence count in a single pass (keeping their
        # original order) instead of rescanning all bins for every count.
        binsByNumOccurrences = {}
        for refBin in targetBins:
            binsByNumOccurrences.setdefault(res[refBin][resDictKey], []).append(refBin)

        byNumOccurrencesStaticFile = GalaxyRunSpecificFile(['genes_by_num_occurrences.html'], galaxyFn)
        f = byNumOccurrencesStaticFile.getFile()
        try:
            for numOccurrences in sorted(binsByNumOccurrences, reverse=True):
                # The gene ID is the part of the bin value before the first '|'.
                geneIds = [str(refBin.val).split('|')[0] for refBin in binsByNumOccurrences[numOccurrences]]
                f.write('Gene regions having %i occurrences:<br>' % numOccurrences + '<br>' + os.linesep)
                f.write(', '.join(['<a href=http://www.ensembl.org/Homo_sapiens/Gene/Summary?g=' + geneId + '>' + geneId + '</a>' for geneId in geneIds]) + '<br><br>' + os.linesep)
        finally:
            # Close the file even if writing fails, so the handle is not leaked.
            f.close()

        # The paragraph is now opened with <p> (it previously began with a stray </p>).
        print(' '.join(['<p>Inspect list of all intersected genes (by ID), ', byNumOccurrencesStaticFile.getLink('ordered by number of occurrences') + ' inside, and with links to gene descriptions.<br></p>']))
    def convertToGTrack(self, filePath, fileFormat, galaxyFn, fastaFilePath=None, normalizeValues=False):
        '''Convert a prediction file to GTrack and return the disk path of the result.

        The output is a run-specific file named '<basename of filePath>modified.gtrack'.
        fileFormat selects the converter: 'weeder', 'meme', 'glimmer', 'prodigal',
        'genemark', 'blasthit' or 'ymf' (which additionally reads fastaFilePath).
        With fileFormat 'gtrack' and normalizeValues set, the values of the
        existing GTrack file are normalized instead. For any other combination
        nothing is written and the path of the empty output file is returned.

        All files opened here are closed before returning, so the output is
        flushed to disk by the time the caller receives its path.
        '''
        predictionFile = open(filePath, 'r')
        try:
            out = GalaxyRunSpecificFile(['%smodified.gtrack' % filePath.split('/')[-1]], galaxyFn)
            gtrackFile = out.getFile('w')
            try:
                if fileFormat == 'weeder':
                    self._convertFromWeederToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'meme':
                    self._convertFromMemeToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'glimmer':
                    self._convertFromGlimmerToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'prodigal':
                    self._convertFromProdigalToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'genemark':
                    self._convertFromGenemarkToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'blasthit':
                    self._convertFromBlastToGTrack(predictionFile, gtrackFile)
                elif fileFormat == 'ymf':
                    fastaFile = open(fastaFilePath, 'r')
                    try:
                        self._convertFromYMFToGTrack(fastaFile, predictionFile, gtrackFile)
                    finally:
                        fastaFile.close()
                elif fileFormat == 'gtrack' and normalizeValues == True:
                    # Normalization works from the path, not the open prediction file.
                    self._normalizeGTrackValues(filePath, gtrackFile)
            finally:
                # Closing an already-closed file is a no-op, so this is safe
                # even if a converter closed the handle itself.
                gtrackFile.close()
        finally:
            predictionFile.close()
        return out.getDiskPath(True)
 def handlePairDistance(self, genome, tracks, track_names, clusterMethod, extra_option):
     '''Cluster tracks on their pairwise distances and print a link to the figure.

     The distance feature is read from self.params ('pair_feature' and
     'pair_feature+'). If no 'pair_feature' is present, a message is printed
     and nothing else is done. Otherwise a distance matrix is built with
     self.constructDistMatrix and handed to R; when clusterMethod is
     'Hierarchical clustering' and extra_option is not '--select--', a
     dendrogram is plotted into a run-specific pdf.
     '''
     from gold.application.RSetup import r
     if not self.params.has_key("pair_feature"):
         print('A feature must be selected in order to compute the distance between tracks.')
         return

     feature = self.params.get('pair_feature')
     extra_feature = self.params.get('pair_feature+') #must be different from the text --select--
     d_matrix = self.constructDistMatrix(genome, tracks, feature, extra_feature)
     figure = GalaxyRunSpecificFile(['cluster_trakcs_result_figure.pdf'], self.jobFile) #this figure is runspecific and is put in the directory

     figurepath = figure.getDiskPath(True)
     r.pdf(figurepath, 8, 8)
     try:
         r.assign('track_names', track_names)
         r.assign('d_matrix', d_matrix)
         r('row.names(d_matrix) <- track_names')

         r('d <- as.dist(d_matrix)')
         if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
             r.assign('extra_option', extra_option)
             r('hr <- hclust(d, method=extra_option, members=NULL)')
             r('plot(hr, ylab="Distance", hang=-1)')
     finally:
         # Always close the pdf device opened above, also when clustering or
         # plotting fails, so no graphics device is left open in the R session.
         r('dev.off()')
     print(figure.getLink('clustering results figure<br>'))
 def _createTrackFromDownloadedFile(self, filepath):
     '''Register a downloaded file as a run-specific file and return its track name.

     The file type is determined by self._findFileType(filepath):
       - None: an Exception is raised, as the format is not valid.
       - 'directory': nothing is copied and None is returned.
       - 'tar': a target directory is created and the archive is extracted
         with self._extractZipFile, whose return value is returned.
       - anything else: the file is copied to the run-specific location and
         'galaxy:<fileType>:<disk path>:None' is returned.
     '''
     import subprocess

     filename = filepath.split('/')[-1]
     galaxyFile = GalaxyRunSpecificFile([filename], self._galaxyFn)

     # Find the file type of downloaded file
     fileType = self._findFileType(filepath)

     if fileType is None:
         # If invalid file type, raise an exception
         raise Exception('%s is not in a valid format' % filepath)
     if fileType == 'directory':
         return None
     if fileType == 'tar':
         # If it is a tar archive, create the target directory and extract it.
         # Arguments are passed as a list (no shell), so paths containing
         # spaces or shell metacharacters are handled literally.
         subprocess.call(['mkdir', galaxyFile.getDiskPath(True).split('.')[0]])
         return self._extractZipFile(filepath)

     # If it is a valid track file, copy it and create a track name.
     # As before, a failing copy is not treated as an error here.
     subprocess.call(['cp', filepath, galaxyFile.getDiskPath(True)])
     return 'galaxy:%s:%s:None' % (fileType, galaxyFile.getDiskPath())
 def convertToGTrack(self, trackName, regionTrackName=None, gtconverter=None, normalizeValues=False):
     '''Convert the file behind a colon-separated track name to GTrack.

     trackName has the form '<a>:<format>:<path>:<d>'. Unless the format is
     already 'gtrack' and normalizeValues is not set, the file at <path> is
     converted through gtconverter (a new GTrackConverter if none is given)
     and the track name is rebuilt with format 'gtrack' and the converted
     path. For format 'ymf', the track regionTrackName is first retrieved
     into a run-specific 'tmp.fasta' and passed to the converter.

     Returns a tuple (track name, original file format).
     '''
     nameParts = trackName.split(':')
     fileFormat = nameParts[1]

     if gtconverter == None:
         gtconverter = GTrackConverter()

     # Nothing to do for an existing GTrack file that needs no normalization
     if fileFormat == 'gtrack' and not (normalizeValues == True):
         return trackName, fileFormat

     sourcePath = nameParts[2]
     if fileFormat == 'ymf':
         # This format requires the original fasta sequence: retrieve it first
         fastaFile = GalaxyRunSpecificFile(['tmp.fasta'], self._galaxyFn)
         self.retrieveTrack(regionTrackName, fastaFile.getDiskPath(True))
         convertedPath = gtconverter.convertToGTrack(
             sourcePath, fileFormat, self._galaxyFn, fastaFilePath=fastaFile.getDiskPath())
     else:
         convertedPath = gtconverter.convertToGTrack(
             sourcePath, fileFormat, self._galaxyFn, normalizeValues=normalizeValues)

     convertedTrackName = '%s:%s:%s:%s' % (nameParts[0], 'gtrack', convertedPath, nameParts[3])
     return convertedTrackName, fileFormat
 def execute(choices, galaxyFn=None, username=''):
     '''Is called when execute-button is pushed by web-user.
     Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
     If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
     choices is a list of selections made by web-user in each options box.

     choices[2] selects the mapping to dump: 'Transfac TF ids',
     'Transfac TF readable names' (both read from shelves) or
     'HGNC gene symbols' (read from a whitespace-separated text file). Any
     other value raises an Exception. Each entry is written as 'key:value',
     with list/tuple values joined by commas, either to standard out
     (galaxyFn is None) or to the run-specific file 'mapping.txt', for which
     a link is printed.
     '''
     import sys

     mappingChoice = choices[2]
     if mappingChoice == 'Transfac TF ids':
         mapping = safeshelve.open(Tool1.MAPPING_SHELVES_PATH + os.sep + 'pwm2TFids.shelf')
     elif mappingChoice == 'Transfac TF readable names':
         mapping = safeshelve.open(Tool1.MAPPING_SHELVES_PATH + os.sep + 'pwm2TFnamesNew.shelf')
     elif mappingChoice == 'HGNC gene symbols':
         mappingFile = open(Tool1.MAPPING_SHELVES_PATH + os.sep + 'PWM_to_HGNC.txt')
         try:
             mapping = dict([line.strip().split() for line in mappingFile])
         finally:
             mappingFile.close()
     else:
         raise Exception(mappingChoice)

     def formatEntry(key):
         # Values are formatted on a local copy: assigning the joined string
         # back into the mapping would write it into the shelf. String values
         # (as in the HGNC mapping) are used as they are instead of having
         # their characters joined by commas.
         value = mapping[key]
         if isinstance(value, (list, tuple)):
             value = ','.join(value)
         return key + ':' + value + os.linesep

     try:
         if galaxyFn is None:
             for key in sorted(mapping.keys()):
                 sys.stdout.write(formatEntry(key))
         else:
             mappingStaticFile = GalaxyRunSpecificFile(['mapping.txt'], galaxyFn)
             f = mappingStaticFile.getFile()
             try:
                 for key in sorted(mapping.keys()):
                     f.write(formatEntry(key))
             finally:
                 f.close()
             print(mappingStaticFile.getLink('View/download mapping'))
     finally:
         # Shelves are closed when done; the plain dict used for the HGNC
         # mapping has no close() and is skipped.
         closeMapping = getattr(mapping, 'close', None)
         if closeMapping is not None:
             closeMapping()
 def collectParamsIntoFile(self):
     '''Write the parameters of this run to an HTML file and print a link to it.

     Every key/value pair of self.params becomes one item of an ordered list
     in the run-specific file 'run_parameters.html'.
     '''
     parameters = GalaxyRunSpecificFile(['run_parameters.html'], self.jobFile)
     p_output = open(parameters.getDiskPath(True), 'w')
     try:
         p_output.write('<html><body>\n')
         p_output.write('<ol>\n')
         for key in self.params.keys():
             p_output.write('<li>%s:%s </li>\n' % (key, self.params[key]))
         # Close the list before closing the document (the end tag was missing).
         p_output.write('</ol>\n')
         p_output.write('</body></html>\n')
     finally:
         p_output.close()
     print(parameters.getLink('Parameters of this run'))
 def storePickledResults(self):
     '''Pickle shallow copies of the results to the run-specific 'results.pickle'.

     Each result in self._resultsList is copied and the copy's _analysis
     attribute is set to None before pickling; the original results are left
     untouched. Any exception is logged through logException rather than
     propagated.
     '''
     try:
         from cPickle import dump
         from copy import copy
         pickleStaticFile = GalaxyRunSpecificFile(['results.pickle'], self._galaxyFn)
         pickleList = [copy(res) for res in self._resultsList]
         for res in pickleList:
             res._analysis = None
         pickleFile = pickleStaticFile.getFile()
         try:
             dump(pickleList, pickleFile)
         finally:
             # Close explicitly so the pickle is flushed and the handle released.
             pickleFile.close()
     except Exception as e:
         logException(e, message='Not able to pickle results object')
 def retrieveBenchmarkSuiteAsZipFile(self, trackNames):
     '''Retrieve the given tracks as fasta files, archive them and return a download link.

     For every track name, the base name (up to its first '.') of the path in
     the third colon-separated field names a fasta file under the
     run-specific 'BenchmarkSuite/' directory, into which the track is
     retrieved. That directory is then packed into 'BenchmarkSuite.tar.gz'
     next to it, and a link to the archive is returned.
     '''
     import subprocess

     zipFile = GalaxyRunSpecificFile(['BenchmarkSuite.tar.gz'], self._galaxyFn)
     archivePath = zipFile.getDiskPath(True)
     # Directory part of the archive path, including the trailing '/'
     runDir = archivePath[:archivePath.rfind('/') + 1]

     # For every trackName, retrieve the track and store it in the suite directory
     for trackName in trackNames:
         fileName = trackName.split(':')[2].split('/')[-1]
         fastaFileName = GalaxyRunSpecificFile(['BenchmarkSuite/%s.fasta' % fileName.split('.')[0]], self._galaxyFn)
         self.retrieveTrack(trackName, fastaFileName.getDiskPath(True))

     # And finally create the archive. Arguments are passed as a list (no
     # shell), so paths containing spaces or shell metacharacters work.
     subprocess.call(['tar', '-P', '-czvf', runDir + 'BenchmarkSuite.tar.gz', runDir + 'BenchmarkSuite/'])

     return zipFile.getLink("Download benchmark suite")
 def _createTrackFromFileName(self, filename):
     '''Register a file from the current working directory and return its track name.

     Only two file name shapes are accepted:
       - '<name>.tar.gz': a target directory is created and the archive is
         extracted with self._extractZipFile, whose return value is returned.
       - '<name>.gtrack': the file is copied from the current working
         directory to the run-specific location and
         'galaxy:gtrack:<disk path>:None' is returned.
     Any other name (including names with additional dots) raises an Exception.
     '''
     import subprocess

     filedata = filename.split('.')
     galaxyFile = GalaxyRunSpecificFile([filename], self._galaxyFn)

     if filedata[1:] == ['tar', 'gz']:
         # Arguments are passed as a list (no shell), so paths containing
         # spaces or shell metacharacters are handled literally.
         subprocess.call(['mkdir', galaxyFile.getDiskPath(True).split('.')[0]])
         return self._extractZipFile(filename)

     if filedata[1:] == ['gtrack']:
         currentPath = '%s/%s' % (os.getcwd(), filename)
         subprocess.call(['cp', currentPath, galaxyFile.getDiskPath(True)])
         return 'galaxy:gtrack:%s:None' % galaxyFile.getDiskPath(True)

     raise Exception('%s is not in a valid format' % filename)
    def execute(cls, choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.

        Here choices holds the genome, the nmer and the region specification.
        A CountPointStat analysis of the nmer track is run over the analysis
        region with a bin size from cls._calcBinSize, and links to the
        resulting plot and bedgraph are printed together with a link to a new
        analysis where the bin size can be set manually.
        '''
        from gold.result.HtmlCore import HtmlCore
        from quick.application.GalaxyInterface import GalaxyInterface
        from quick.util.GenomeInfo import GenomeInfo

        genome = choices[0]
        nmer = choices[1].lower()
        regSpec = choices[2]
        analysisRegions = parseRegSpec(regSpec, genome)

        binSize = cls._calcBinSize(nmer, analysisRegions)
        if binSize is None:
            binSpec = '*'
        else:
            binSpec = str(binSize)
        numBins = len(AutoBinner(analysisRegions, binSize))

        nmerTrackName = GenomeInfo.getPropertyTrackName(genome, 'nmer') + [str(len(nmer)) + '-mers', nmer]

        # Output of the analysis run itself is wrapped in a 'debug' style block
        print(str(HtmlCore().styleInfoBegin(styleClass='debug')))
        GalaxyInterface.run(nmerTrackName, [''], 'Counts: The number of track1-points -> CountPointStat',
                            regSpec, binSpec, genome, galaxyFn)
        print(str(HtmlCore().styleInfoEnd()))

        plotFileNamer = GalaxyRunSpecificFile(['0', 'CountPointStat_Result_gwplot.pdf'], galaxyFn)
        textualDataFileNamer = GalaxyRunSpecificFile(['0', 'CountPointStat_Result.bedgraph'], galaxyFn)

        core = HtmlCore()
        plotLink = plotFileNamer.getLink('plot')
        textualDataLink = textualDataFileNamer.getLink('textual data')
        core.paragraph('Inspect nmer frequency variation as a %s or as underlying %s.</p>' % (plotLink, textualDataLink))
        core.divider()
        core.paragraph('The occurrence frequency of your specified nmer ("%s") has been computed along the genome, within your specified analysis region ("%s").' % (nmer, regSpec))
        core.paragraph('The analysis region was divided into %i bins, based on calculations trying to find appropriate bin size (get enough data per bin and restrict maximum number of bins).' % numBins)

        # Same track name with its second-to-last element ('<k>-mers') left out
        linkTrackName = nmerTrackName[0:-2] + nmerTrackName[-1:]
        preSelectedAnalysisUrl = createHyperBrowserURL(genome, linkTrackName, [''], analysis='Counts',
                                                       method='auto', region=regSpec, binsize=binSpec)
        core.divider()
        newAnalysisLink = str(HtmlCore().link('new analysis', preSelectedAnalysisUrl))
        core.paragraph('If you do not find the inferred bin size to be appropriate, you can set this manually in a ' + newAnalysisLink + '.')
        print(str(core))
    def printClusterPlots(cls, correlationMatrix, linkageMatrix, galaxyFn, distanceMeasure, labels, htmlCore):
        '''Plot a heatmap of correlationMatrix and a dendrogram of linkageMatrix.

        Both plots are written to run-specific pdf files under 'Image' (file
        names prefixed with distanceMeasure) and are added to htmlCore as an
        embedded image followed by a link to the pdf.

        The heatmap's upper bound is at least 1. If the smallest value is
        negative or NaN, the lower bound is at most -1 and the "RdBu_r" colour
        map is used; otherwise the lower bound is at most 0 and no colour map
        is passed.
        '''
        from numpy import amax, amin, isnan

        maxVal = amax(correlationMatrix)
        minVal = amin(correlationMatrix)

        seabornFile = GalaxyRunSpecificFile(['Image', distanceMeasure + 'seabornHeatmap.pdf'], galaxyFn)
        dendrogramFile = GalaxyRunSpecificFile(['Image', distanceMeasure + 'dendrogram.pdf'], galaxyFn)

        heatmapOptions = {
            'max': maxVal if maxVal >= 1 else 1,
            'fileName': seabornFile,
        }
        if minVal < 0 or isnan(minVal):
            heatmapOptions['min'] = minVal if minVal <= -1 else -1
            heatmapOptions['cmap'] = "RdBu_r"
        else:
            heatmapOptions['min'] = minVal if minVal <= 0 else 0
        MatplotlibPlots.seabornHeatmapPlot(correlationMatrix, labels, **heatmapOptions)

        MatplotlibPlots.dendrogramClusteringPlot(linkageMatrix, labels, dendrogramFile)

        plotSections = (
            (seabornFile, 'PDF of similarity matrix'),
            (dendrogramFile, 'PDF of dendrogram'),
        )
        for plotFile, linkText in plotSections:
            htmlCore.line(plotFile.getEmbeddedImage())
            htmlCore.link(linkText, plotFile.getURL())
Example #13
0
    def analyzeNumRejectedDistribution(maxNumSamples,h, fdrThreshold, totalNumTests, totalNumH1Tests, numReplications,a,b, galaxyFn=None):
        '''Compare the number of rejections under four sampling schemes and plot them.

        A single small run (one test, one replication) is timed first and
        scaled by totalNumTests * numReplications to print a run time estimate.
        Experiment._analyzeNumRejectedDistribution is then run for the schemes
        'Basic', 'Sequential', 'McFdr Simultanous' and 'McFdr Individual', with
        MultipleTestCollection.SIMULTANOUS_FDR_STOPPING_CRITERION set for each
        scheme (it is left at the value of the last scheme). The results are
        drawn as lines in the run-specific 'numRej.png', and a link to the plot
        is printed.
        '''
        # Estimate time use from one minimal run
        print('(estimating run time..)')
        timingStart = time.time()
        Experiment._analyzeNumRejectedDistribution(maxNumSamples, None, None, fdrThreshold, 1, 1, 1, a, b, galaxyFn)
        minimalRunSeconds = time.time() - timingStart
        estimatedSeconds = minimalRunSeconds * totalNumTests * numReplications
        print('Estimated running time: around %i seconds. (%.1f hours)' % (estimatedSeconds, estimatedSeconds / 3600.0))

        # (label, second argument, third argument, simultaneous FDR stopping criterion)
        schemes = (
            ('Basic', None, None, True),
            ('Sequential', h, None, True),
            ('McFdr Simultanous', h, fdrThreshold, True),
            ('McFdr Individual', h, fdrThreshold, False),
        )

        numRej = []
        texts = []
        for text, secondArg, thirdArg, simult in schemes:
            print(' '.join([text, ':']))
            MultipleTestCollection.SIMULTANOUS_FDR_STOPPING_CRITERION = simult
            rejected = Experiment._analyzeNumRejectedDistribution(
                maxNumSamples, secondArg, thirdArg, fdrThreshold, totalNumTests,
                totalNumH1Tests, numReplications, a, b, galaxyFn)
            numRej.append(rejected)
            texts.append(text)

        plotStaticFile = GalaxyRunSpecificFile(['numRej.png'], galaxyFn)
        xValues = range(len(numRej[0]))
        plotStaticFile.plotRLines(xValues, numRej, xlab='Sorted simulations', ylab='num Rejected', legend=texts)
        print(plotStaticFile.getLink('Cumulative distribution'))
 def executePairDistance(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, extra_feature, galaxyFn, regSpec, binSpec):
     '''Cluster tracks on their pairwise distances and print a link to the figure.

     Does nothing if feature is None. Otherwise a symmetric distance matrix
     with a zero diagonal is filled from ClusteringExecution.computeDistance
     for every pair of tracks, transformed according to extra_feature:
       - "1 minus the ratio": 1 - value
       - "1 over the ratio":  1 / value (a value of 0 raises ZeroDivisionError)
       - anything else:       the value itself
     The matrix is handed to R; when clusterMethod is 'Hierarchical clustering'
     and extra_option is not '--select--', a dendrogram is plotted into a
     run-specific pdf.
     '''
     from gold.application.RSetup import r
     if feature is None:
         return

     numTracks = len(tracks)
     d_matrix = zeros((numTracks, numTracks))
     for i in range(numTracks):
         for j in range(i + 1, numTracks):
             distance = ClusteringExecution.computeDistance(genome, tracks[i], tracks[j], feature, regSpec, binSpec)
             if extra_feature == "1 minus the ratio":
                 distance = 1 - distance
             elif extra_feature == "1 over the ratio":
                 # Float division, so an integer value is not truncated
                 # under Python 2 division semantics.
                 distance = 1.0 / distance
             d_matrix[i, j] = d_matrix[j, i] = distance

     figure = GalaxyRunSpecificFile(['cluster_trakcs_result_figure.pdf'], galaxyFn) #this figure is runspecific and is put in the directory

     figurepath = figure.getDiskPath(True)
     r.pdf(figurepath, 8, 8)
     try:
         r.assign('track_names', track_names)
         r.assign('d_matrix', d_matrix)
         r('row.names(d_matrix) <- track_names')

         r('d <- as.dist(d_matrix)')
         if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
             r.assign('extra_option', extra_option)
             r('hr <- hclust(d, method=extra_option, members=NULL)')
             r('plot(hr, ylab="Distance", hang=-1)')
     finally:
         # Always close the pdf device opened above, also when clustering or
         # plotting fails, so no graphics device is left open in the R session.
         r('dev.off()')
     print(figure.getLink('clustering results figure<br>'))
    def getLDDistancesOfMultipleRsquares(cls, ldGraphTrack, rSquareThresholds, galaxyFn, htmlCore):
        '''Plot the LD-pair distance distribution for every selected r-square threshold.

        rSquareThresholds maps each threshold to a flag; only thresholds whose
        flag is set are processed. For each of them the distances are found
        with cls.findAllDistancesInLD and standardized, and one line per
        threshold is drawn into the run-specific 'multipleLines.pdf', which is
        then added to htmlCore.

        If no bins are available (for instance when no threshold is set),
        nothing is plotted and nothing is added to htmlCore.
        '''
        graph = LDExpansions.createRSquareGraph(ldGraphTrack, 0)
        positions = LDExpansions.createPositionDict(ldGraphTrack)
        ldDistances = []
        rSquareLabels = []
        bins = []

        for rSquare, isSet in rSquareThresholds.items():
            if not isSet:
                continue

            rSquareLabels.append(rSquare)
            distances = cls.findAllDistancesInLD(graph, positions, float(rSquare), htmlCore)
            # NOTE(review): the bins of the last processed threshold are used
            # as x-values for all lines - presumably they are the same for
            # every threshold; confirm against standardizeLineGraph.
            bins, ldPairCount = cls.standardizeLineGraph(distances)
            ldDistances.append(ldPairCount)

        # Check for missing data before plotting, so that no plot of empty
        # data is attempted.
        if len(bins) == 0:
            return

        graphFile = GalaxyRunSpecificFile(['multipleLines.pdf'], galaxyFn)

        MatplotlibPlots.multipleLineGraph(
            bins[1:],
            ldDistances,
            rSquareLabels,
            graphFile,
            'Distance',
            'LD-pair count'
        )

        htmlCore.divider(True)
        htmlCore.header('Distribution of distances between LD pairs with different thresholds of r<sup>2</sup>')
        htmlCore.line(graphFile.getEmbeddedImage())
        htmlCore.link('PDF of distances between tracks here', graphFile.getURL())
    def execute(choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.

        Here choices holds the history elements for the motifs, the observed
        fasta sequences and the random sequences, followed by the name of the
        test statistic. Monte Carlo p-values are computed per motif with
        mcPvalFromMotifAndFastas and printed, together with a link to a
        histogram of the p-values and the elapsed time.

        Raises ValueError if the test statistic is not one of the known names.
        '''
        from time import time
        startTime = time()
        from quick.application.ExternalTrackManager import ExternalTrackManager
        from quick.util.StaticFile import GalaxyRunSpecificFile

        motifFn = ExternalTrackManager.extractFnFromGalaxyTN(choices[0].split(':'))
        observedFasta = ExternalTrackManager.extractFnFromGalaxyTN(choices[1].split(':'))

        randomGalaxyTN = choices[2].split(':')
        randomName = ExternalTrackManager.extractNameFromHistoryTN(randomGalaxyTN)
        randomGalaxyFn = ExternalTrackManager.extractFnFromGalaxyTN(randomGalaxyTN)
        randomStatic = GalaxyRunSpecificFile(['random'], randomGalaxyFn) #finds path to static file created for a previous history element (randomFn), and directs to a folder containing several files..
        randomFastaPath = randomStatic.getDiskPath()

        testStatistic = choices[3]
        if testStatistic == 'Average of max score per sequence':
            scoreFunc = scoreMotifOnFastaAsAvgOfBestScores
        elif testStatistic == 'Sum of scores across all positions of all sequences':
            scoreFunc = scoreMotifOnFastaAsSumOfAllScores
        elif testStatistic == 'Score of Frith et al. (2004)':
            scoreFunc = lr4
        elif testStatistic == 'Product of max per sequence':
            scoreFunc = scoreMotifOnFastaAsProductOfBestScores
        else:
            # A bare 'raise' has no active exception to re-raise here; raise
            # an explicit error that names the offending value instead.
            raise ValueError('Unknown test statistic: %s' % testStatistic)

        pvals = mcPvalFromMotifAndFastas(motifFn, observedFasta, randomFastaPath, scoreFunc)
        print('Pvals for motifs (%s) against observed (%s) vs random (%s - %s) sequences.' % (motifFn, observedFasta, randomName, randomFastaPath))
        for motif, pval in sorted(pvals.items()):
            print(motif + '\t' + ('%.4f' % pval))

        from gold.application.RSetup import robjects
        histStaticFile = GalaxyRunSpecificFile(['pvalHist.png'], galaxyFn)
        # Histogram with 40 equally wide bins over [0, 1]
        histStaticFile.plotRHist(pvals.values(), [x/40.0 for x in range(41)], 'Histogram of p-values', xlim=robjects.FloatVector([0.0, 1.0]))
        print(histStaticFile.getLink('Histogram'))
        print('Time (s): %s' % (time() - startTime))
    def plotDistances(cls, distances, galaxyFn, distCase, htmlCore):
        '''Plot the given distances as a graph and two histograms, and add them to htmlCore.

        Three run-specific pdf files are produced: a point graph of the
        standardized distances, a histogram with rug plot of the sorted
        distances and one of their logarithm (20 bins each). Each plot is added
        to htmlCore under its own header, which includes distCase, as an
        embedded image followed by a link to the pdf. Finally the interactive
        column chart is added through cls.getInteractiveColumnChart.
        '''
        # Point graph of the standardized distances
        graphX, graphY = cls.standardizeLineGraph(distances)
        graphFile = GalaxyRunSpecificFile(['distancegraph.pdf'], galaxyFn)
        MatplotlibPlots.pointGraph(graphX[1:], graphY, graphFile,
                                   'Smallest distance for each point', 'Distance point count')

        htmlCore.divider(True)
        htmlCore.header('Graph of smallest distances for all points ' + distCase + ' tracks in GSuite')
        htmlCore.line(graphFile.getEmbeddedImage())
        htmlCore.link('PDF of distance graph', graphFile.getURL())

        # Histograms of the sorted distances and of their logarithm
        sortedDistances = sorted(distances)
        numBins = 20
        plainHistFile = GalaxyRunSpecificFile(['histogram.pdf'], galaxyFn)
        logHistFile = GalaxyRunSpecificFile(['loghistogram.pdf'], galaxyFn)
        MatplotlibPlots.histogramRugPlot(sortedDistances, numBins, plainHistFile, 'Distances')
        MatplotlibPlots.histogramRugPlot(log(sortedDistances), numBins, logHistFile, 'Log of distances')

        rugExplanation = 'The rugs/vertical lines at the bottom show the distribution of point distances.'
        histogramSections = (
            ('Histogram of smallest distances for all points ', plainHistFile,
             'PDF of distance histogram'),
            ('Histogram of log of smallest distances for all points ', logHistFile,
             'PDF of log distance histogram'),
        )

        # Only one divider is written, ahead of the first histogram section
        htmlCore.divider(True)
        for headerStart, plotFile, linkText in histogramSections:
            htmlCore.header(headerStart + distCase + ' tracks in GSuite')
            htmlCore.line(rugExplanation)
            htmlCore.line(plotFile.getEmbeddedImage())
            htmlCore.link(linkText, plotFile.getURL())

        # Interactive bar chart of the sorted distances
        cls.getInteractiveColumnChart(sortedDistances, distCase, htmlCore)
    def printTextMatrixes(cls, correlationMatrix, linkageMatrix, distanceMatrix, galaxyFn, filename, htmlCore):
        '''Write the three matrices to run-specific text files and link to them from htmlCore.

        The string representation of the correlation, distance and linkage
        matrices is written, in that order, to
        '<prefix><filename>.txt' files, each followed by a link in htmlCore.
        Every file is closed as soon as it has been written.
        '''
        # NOTE(review): str() of a large numpy array is abbreviated with '...';
        # if these matrices are numpy arrays the text files may be incomplete -
        # confirm against the callers.
        matrixOutputs = (
            ('corr_matrix_result_', correlationMatrix,
             '<br><br>View the raw text similarity/correlation matrix for this analysis'),
            ('dist_matrix_result_', distanceMatrix,
             '<br><br>View the raw text triangular distance matrix for this analysis'),
            ('linkage_matrix_result_', linkageMatrix,
             '<br><br>View the raw text linkage matrix for this analysis'),
        )

        for filePrefix, matrix, linkText in matrixOutputs:
            matrixFile = GalaxyRunSpecificFile([filePrefix + filename + '.txt'], galaxyFn)
            with open(matrixFile.getDiskPath(True), 'w') as textFile:
                textFile.write(str(matrix))
            htmlCore.link(linkText, matrixFile.getURL())
    def plotDistances(cls, distances, galaxyFn, bins, r2, htmlCore):
        '''Plot the distances between LD pairs as a line plot and a histogram.

        The standardized distances are drawn as a point graph, and the sorted
        distances as a histogram using the given bins; both go to run-specific
        pdf files. Each plot is added to htmlCore under a header stating the
        r-square threshold r2, as an embedded image followed by a link to the
        pdf.
        '''
        xValues, yValues = cls.standardizeLineGraph(distances)
        lineGraphFile = GalaxyRunSpecificFile(['distancegraph.pdf'], galaxyFn)
        MatplotlibPlots.pointGraph(
            x=xValues[1:],
            y=yValues,
            fileLocation=lineGraphFile,
            xlabel='Distances between SNP pairs in LD',
            ylabel='LD-pair count'
        )

        sortedDistances = sorted(distances)
        histogramFile = GalaxyRunSpecificFile(['histogram.pdf'], galaxyFn)
        MatplotlibPlots.histogramPlot(sortedDistances, bins, histogramFile, 'Distances between SNP pairs in LD')

        sections = (
            ('Line plot with distribution of distances between LD pairs, r<sup>2</sup>  >= ',
             lineGraphFile, 'PDF of distances between tracks here'),
            ('Histogram with distribution of distances between LD pairs, r<sup>2</sup> >= ',
             histogramFile, 'PDF histogram of distances here'),
        )
        for headerStart, plotFile, linkText in sections:
            htmlCore.divider(True)
            htmlCore.header(headerStart + str(r2))
            htmlCore.line(plotFile.getEmbeddedImage())
            htmlCore.link(linkText, plotFile.getURL())
    def getSNPFrequencyStats(cls, bins, gSuite, galaxyFn, htmlCore):
        '''Report SNP counts per GSuite track as statistics, plots and an interactive chart.

        A UniquePointTrackStat analysis is run for every track in gSuite. For
        each track whose global result contains 'Result', the number of
        observations is recorded and the observations are added to the set of
        unique rsIDs. Tracks without a 'Result' are left out of both the counts
        and the labels, so that the two lists always stay aligned.

        The counts are plotted as a point graph and as a histogram with rug
        plot (10 bins) in run-specific pdf files, and htmlCore receives the
        statistics, the totals, both plots and an interactive column chart.
        '''
        rsIDs = set()
        snpCount = []
        analysisSpec = AnalysisSpec(UniquePointTrackStat)
        trackLabels = []

        for gSuiteTrack in gSuite.allTracks():
            track = Track(gSuiteTrack.trackName)
            globalResult = doAnalysis(analysisSpec, bins, [track]).getGlobalResult()
            if 'Result' not in globalResult:
                continue

            observations = globalResult['Result']
            # The label is recorded together with the count: a label without a
            # matching count would shift the labels of all following tracks.
            trackLabels.append(gSuiteTrack.title)
            snpCount.append(len(observations))
            rsIDs.update(observations)

        snpcountFile = GalaxyRunSpecificFile(['snpfrequencies.pdf'], galaxyFn)
        MatplotlibPlots.pointGraphY(snpCount, snpcountFile, ylabel='SNP counts',
                                    xticks=trackLabels)

        snpdistributionFile = GalaxyRunSpecificFile(['snpfreqhistogram.pdf'], galaxyFn)
        MatplotlibPlots.histogramRugPlot(snpCount, 10, snpdistributionFile, 'SNP counts')

        totalSNPCount = sum(snpCount)
        cls.printStats(snpCount, 'track', htmlCore)
        htmlCore.line('Total number of SNPs: ' + str(totalSNPCount))
        htmlCore.line('Unique SNPs: ' + str(len(rsIDs)))
        htmlCore.line('Overlapping rsIDs: ' + str(totalSNPCount - len(rsIDs)))

        htmlCore.divider(True)
        htmlCore.header('Graph of SNP frequencies in GSuite tracks')
        htmlCore.line(snpcountFile.getEmbeddedImage())
        htmlCore.link('PDF of SNP frequency graph', snpcountFile.getURL())

        htmlCore.divider(True)
        htmlCore.header('Histogram of SNP frequencies in GSuite tracks')
        htmlCore.line(snpdistributionFile.getEmbeddedImage())
        htmlCore.link('PDF of SNP frequency histogram', snpdistributionFile.getURL())

        cls.getInteractiveColumnChartWithLabels(snpCount, trackLabels, htmlCore)
    def findTFsOccurringInRegions(cls, genome, tfSource, regionsBedFn, upFlankSize, downFlankSize, galaxyFn):
        '''Print an HTML report of the TFs occurring within the (optionally flanked) regions.

        The regions in the bed-file regionsBedFn are expanded by upFlankSize and
        downFlankSize (unless both are 0), and the TF track for tfSource is
        counted per category within them via cls._runCategoryPointCount. The
        TFs are listed by descending number of hits, each annotated with its
        class(es) when these can be looked up in the shelves under
        HB_SOURCE_CODE_BASE_DIR/data.

        The report is printed to standard out and links to three run-specific
        files: an HTML list whose entries link to an analysis with the TF
        pre-selected, a plain text list, and a bed-file of the TF binding sites
        within the regions.

        An assertion fails if there is no TF info for the genome.
        '''
        import third_party.safeshelve as safeshelve

        uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))

        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        assert tfTrackNameMappings != {}, 'No TF info for genome: %s' % genome

        tfTrackName = tfTrackNameMappings[tfSource]

        isFlanked = not (upFlankSize == downFlankSize == 0)
        if isFlanked:
            flankedRegionsFn = uniqueWebPath + os.sep + 'flankedRegs.bed'
            GalaxyInterface.expandBedSegments(regionsBedFn, flankedRegionsFn, genome, upFlankSize, downFlankSize)
        else:
            flankedRegionsFn = regionsBedFn

        regSpec, binSpec = 'bed', flankedRegionsFn
        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        tfNames = res.getResDictKeys()
        globalResult = res.getGlobalResult()

        shelves = []
        try:
            for shelfName in ('pwm2TFids.shelf', 'TfId2Class.shelf', 'pwmName2id.shelf'):
                shelves.append(safeshelve.open(os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', shelfName]), 'r'))
            pwm2tfids, tf2class, pwmName2id = shelves

            # One tuple per TF: (number of hits, TF name, TF text including hits)
            tfs = []
            for tf in tfNames:
                numHits = globalResult[tf]
                tfText = '%s (%i hits )' % (tf, numHits)

                # The class annotation needs the chain TF name -> PWM id -> TF ids -> class
                if tf in pwmName2id and pwmName2id[tf] in pwm2tfids:
                    tfIds = pwm2tfids[pwmName2id[tf]]
                else:
                    tfIds = []
                tfClasses = set([str(tf2class.get(tfId)) for tfId in tfIds if tfId in tf2class])
                if tfClasses:
                    tfText += ' (class: %s)' % '/'.join(tfClasses)

                tfs.append((numHits, tf, tfText))
            tfs = list(reversed(sorted(tfs)))
        finally:
            # Close the shelves once the lookups are done, also on errors.
            for shelf in shelves:
                shelf.close()

        tfsPlural = 's' if len(tfs) != 1 else ''
        print('<p>There are %i TF%s targeting your regions of interest, using "%s" as source of TF occurrences.</p>' % (len(tfs), tfsPlural, tfSource))

        expansionStr = ' flanked' if isFlanked else ''

        # NOTE(review): the analysis links are hard-coded to the '/hbdev' instance.
        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>' % (quote(':'.join(tfTrackName + [tf[1]])), tf[2]) for tf in tfs]))
        print(' '.join(['<p>', idHtmlFileNamer.getLink('Inspect html file'), ' of all TF IDs occurring 1 or more times within your%s regions of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (expansionStr)]))

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
        idFileNamer.writeTextToFile(os.linesep.join([tf[2] for tf in tfs]) + os.linesep)
        print(' '.join(['<p>', idFileNamer.getLink('Inspect text file'), ' listing all TF IDs occurring 1 or more times within your%s regions of interest.</p>' % (expansionStr)]))

        extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'], galaxyFn)
        GalaxyInterface.extractTrackManyBins(genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath(), True)
        print(' '.join(['<p>', extractedTfbsFileNamer.getLoadToHistoryLink('Inspect bed-file'), 'of all TF binding sites occurring within your%s regions of interest.</p>' % (expansionStr)]))

        for dummy,tf,dummy2 in tfs:            
            extractedTfbsFileNamer = GalaxyRunSpecificFile([tf+'_tfbsInGeneRegions.bed'],galaxyFn)
            GalaxyInterface.extractTrackManyBins(genome, tfTrackName+[tf], regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath())
            print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink('Binding sites of the TF %s' %tf, 'bed'), 'occurring within your%s regions of interest (bed-file).</p>' % (expansionStr)
    def execute(choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.
        '''
        import subprocess
        import os
        from quick.util.StaticFile import GalaxyRunSpecificFile
        from config.Config import HB_SOURCE_CODE_BASE_DIR
        from quick.application.ExternalTrackManager import ExternalTrackManager
        
        tempInStaticFile = GalaxyRunSpecificFile(['tempIn.txt'], galaxyFn)
        outStaticFile = GalaxyRunSpecificFile(['tempOut.fasta'], galaxyFn)
        #print os.getcwd()
        inFn = ExternalTrackManager.extractFnFromGalaxyTN( choices[0].split(':') )
        #print inFn
        tempOutFn = outStaticFile.getDiskPath(True)
        #print tempOutFn
        os.chdir(HB_SOURCE_CODE_BASE_DIR + '/third_party/nonpython')
        #print outStaticFile.getLink('output')
        markovOrder = int(choices[1])

        seqs = []     
        for line in open(inFn):
            if line.startswith('>'):
                seqs.append( [line[1:].strip(),[]] )
            else:
                seqs[-1][1].append(line.strip())
        for seq in seqs:
            seq[1] = ''.join(seq[1])
            
        pureSequence = ''.join( [seq[1] for seq in seqs])
        totalSeqLen = len(pureSequence)
        #pureSequence = ''.join([line.replace('\n','') for line in open(inFn) if not line.startswith('>')])
        tempInStaticFile.writeTextToFile(pureSequence)
        numSamples = int(choices[2])
        
        if numSamples>1:
            zipOutStatic = GalaxyRunSpecificFile(['randomFastas.zip'], galaxyFn)                
            zipOut = zipfile.ZipFile(zipOutStatic.getDiskPath(True),'w')
            
        for iteration in range(numSamples):
            if numSamples>1:
                fastaOutStatic = GalaxyRunSpecificFile(['random','s%s.fa'%iteration], galaxyFn)
                fastaOutFn = fastaOutStatic.getDiskPath(True)
            else:
                fastaOutFn = galaxyFn
            #fastaOutStatic = GalaxyRunSpecificFile(['random%s'%iteration], galaxyFn)
            #subprocess.call('javac',shell=True)
            #subprocess.call('javac',shell=False)
            #subprocess.call('javac MarkovModel.java',shell=True)
            subprocess.call('java MarkovModel %s %s %s >%s' % (tempInStaticFile.getDiskPath(), markovOrder, totalSeqLen, tempOutFn), shell=True )
            #subprocess.call('javac third_party/nonpython/MarkovModel.java')
            #subprocess.call('java third_party/nonpython/MarkovModel.java')
            pureMarkovSequence = open(tempOutFn).readline().strip()
            pmsIndex = 0
            fastaOutF = open(fastaOutFn,'w')
            for seq in seqs:
                fastaOutF.write('>'+seq[0]+os.linesep)
                nextPmsIndex = pmsIndex+len(seq[1])
                #seq.append(pureMarkovSequence[pmsIndex:nextPmsIndex])
                fastaOutF.write( pureMarkovSequence[pmsIndex:nextPmsIndex] + os.linesep)
                pmsIndex = nextPmsIndex
            fastaOutF.close()
            assert pmsIndex == totalSeqLen == len(pureMarkovSequence), (pmsIndex, totalSeqLen , len(pureMarkovSequence))
            if numSamples>1:
                #print 'Adding %s to archive' % fastaOutFn.split('/')[-1]
                zipOut.write(fastaOutFn, fastaOutFn.split('/')[-1])

        if numSamples>1:
            zipOut.close()
            print zipOutStatic.getLink('Zipped random sequences')
    def execute(cls, choices, galaxyFn=None, username=""):

        from quick.application.GalaxyInterface import GalaxyInterface

        fileformat = choices[6]
        outputFile = open(galaxyFn, "w")

        if fileformat == "html":
            print GalaxyInterface.getHtmlBeginForRuns(galaxyFn)
            print GalaxyInterface.getHtmlForToggles(withRunDescription=False)
            t = calendar.timegm(time.gmtime())
            htmlfile = GalaxyRunSpecificFile(["fet", str(t)], galaxyFn)

        genome = choices[0]
        track1 = choices[1].split(":")
        track2 = choices[2].split(":")
        tn1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, track1)
        tn2 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, track2)

        windowSize = int(choices[3])
        windowStep = int(choices[4])
        percentile = float(choices[5])

        # results = {}

        # TODO: why this?
        # tr = Track(tn1)
        # tr.addFormatReq(TrackFormatReq(dense=False, allowOverlaps=True))

        outputFile.write("#seqid\tstart\tscore\tstddev\n")

        if fileformat == "html":
            text = "#seqid\tstart\tscore\tstddev\n"
        print "chrs:", str(GenomeInfo.getChrList(genome))
        reg = "*"
        bins = "*"
        analysisDef = "Dummy: dummy name ([wStep=%g] [wSize=%g] [percentile=%g])-> FisherExactScoreStat" % (
            windowStep,
            windowSize,
            percentile,
        )
        userBinSource = GalaxyInterface._getUserBinSource(reg, bins, genome)
        result = GalaxyInterface.runManual([tn1, tn2], analysisDef, reg, bins, genome, galaxyFn=galaxyFn)
        for key in result.getAllRegionKeys():
            chrom = str(key).split(":")[0]
            r = result[key]
            if "Result" not in r.keys():
                print "skipping chr:", chrom, r
                continue
            r = r["Result"]
            scores = r[0]
            stddev = r[1]
            for i in range(len(scores)):
                if scores[i] != 0:
                    pos = i * windowStep
                    # if choices[5] == "html":
                    # print "%s\t%s\t%s\t%s\n" % (str(chrom), pos, str(scores[i]), str(stddev[i]))
                    if fileformat == "tabular":
                        outputFile.write("%s\t%s\t%s\t%s\n" % (str(chrom), pos, str(scores[i]), str(stddev[i])))
                    else:
                        text += "%s\t%s\t%s\t%s\n" % (str(chrom), pos, str(scores[i]), str(stddev[i]))

        if fileformat == "html":
            htmlfile.writeTextToFile(text)
            print htmlfile.getLink("Result file")
            print GalaxyInterface.getHtmlEndForRuns()

        outputFile.close()
# Example #24
# 0
    def compareCutoffSchemes(maxNumSamples, h, fdrThreshold, totalNumTests, stepSize, numReplications,a,b, galaxyFn=None):
        print '<PRE>'
        print 'Comparing cutoff schemes with parameters: maxNumSamples=%i, h=%i, fdrThreshold=%.2f, totalNumTests=%i, numReplications=%i' % (maxNumSamples, h, fdrThreshold, totalNumTests, numReplications)
        print 'stepSize: ',stepSize
        print 'H1 p-values drawn from beta with a=%.3f and b=%.3f' % (a,b)
        print 'Minimum achieveable p-value is %.5f, which gives minimum Bonferroni-corrected p-value of %.5f (compares to a fdr threshold of %.2f)' % (1.0/maxNumSamples, (1.0/maxNumSamples)*totalNumTests, fdrThreshold)
        
        #estimate time use:
        prevTime= time.time()
        Simulator(maxNumSamples, None, None,a,b,fdrThreshold).numSamplesAsFunctionOfNumH1( 1, 1, 1)
        baseMeasure = time.time() - prevTime
        if type(stepSize)==int:
            numSteps = len(range(0,totalNumTests+1,stepSize))
        elif type(stepSize)==list:
            numSteps = len(stepSize)
        withOnlyMaxNumEstimate = baseMeasure * totalNumTests * numSteps * numReplications
        #print 'Estimated running time: between %i and %i seconds.' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate*3)
        print 'Estimated running time: around %i seconds. (%.1f hours)' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate/3600.0)
        
        sortedKeys, onlyMaxCutoff, onlyMaxNumRejected, onlyMaxType1Errors, onlyMaxType2Errors = Simulator(maxNumSamples, None, None,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1( totalNumTests, stepSize, numReplications)
        sortedKeys, seqMcCutoff, seqMcNumRejected, seqMcType1Errors, seqMcType2Errors  = Simulator(maxNumSamples, h, None,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications)
        sortedKeys, mcFdrCutoff, mcFdrNumRejected, mcFdrType1Errors, mcFdrType2Errors  = Simulator(None, h, fdrThreshold,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications)
        maxY = max( max(s) for s in [onlyMaxCutoff, seqMcCutoff, mcFdrCutoff])
        #minY = min( min(s) for s in [onlyMaxCutoff, seqMcCutoff, McFdrCutoff])
        minY=0

        print 'Time spent: ',time.time() - prevTime, ' secs'
        print '</PRE>'
        
        #plotStaticFile.getDiskPath(True)
        if galaxyFn is not None:
            #print 'Generating aggregate McFdr simulation figures'
            plotStaticFile = GalaxyRunSpecificFile(['mainPlot.png'],galaxyFn)
            if type(stepSize) is int:
                allNumH1s = range(0,totalNumTests+1,stepSize)
            elif type(stepSize) is list:
                allNumH1s = stepSize
            for numH1 in allNumH1s:
                catalogStaticFile = GalaxyRunSpecificFile([str(numH1),'cat.html'], galaxyFn)
                print catalogStaticFile.getLink( 'Tests with #True H1s=%i' % numH1 ), '<br>'

            #plotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxCutoff), ylim=r.unlist([minY,maxY]), type='l', xlab='Number of true H1s', ylab='Total MC samples' , col='black')
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcCutoff), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrCutoff), col='green' )
            #r.legend('topleft',['BasicMc','SeqMc','McFdr'],col=['black','red','green'],lty=1)
            plotStaticFile.plotRLines(sortedKeys, [onlyMaxCutoff,seqMcCutoff,mcFdrCutoff], xlab='Number of true H1s', ylab='Total MC samples', legend=['BasicMc','SeqMc','McFdr'])
            #r('dev.off()')
            #plotStaticFile.closeRFigure()

            print plotStaticFile.getLink('View main plot') + ' of sumSamples as function of #H1s.', '<br>'

            numRejectedPlotStaticFile = GalaxyRunSpecificFile(['secondaryPlot.png'],galaxyFn)
            numRejectedPlotStaticFile.plotRLines(sortedKeys, [onlyMaxNumRejected,seqMcNumRejected,mcFdrNumRejected],xlab='Number of true H1s', ylab='Num rejected tests',legend=['BasicMc','SeqMc','McFdr'])
            #numRejectedPlotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxNumRejected), ylim=r.unlist([0,totalNumTests]), type='l', xlab='Number of true H1s', ylab='Num rejected tests',col='black' )
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcNumRejected), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrNumRejected), col='green' )
            #r.lines(r.unlist(sortedKeys), r.unlist(sortedKeys), col='black', lty='dotted' ) #As this corresponds to perfect estimation..
            #r.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green','black'],lty=[1,1,1,2])
            #r('dev.off()')
            #numRejectedPlotStaticFile.closeRFigure()
            print numRejectedPlotStaticFile.getLink('View secondary plot') + ' of #true H1s vs #tests rejected.', '<br>'

            #Classification errors
            classificationErrorPlotStaticFile = GalaxyRunSpecificFile(['errors.png'],galaxyFn)
            classificationErrorPlotStaticFile.openRFigure()
            yMax = max( max(x) for x in [mcFdrType2Errors,mcFdrType1Errors,seqMcType2Errors,seqMcType1Errors,onlyMaxType2Errors,onlyMaxType1Errors ])
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxType1Errors), ylim=r.unlist([0,yMax]), type='l', xlab='Number of true H1s', ylab='Type 1/2 errors',col='black' )
            r.lines(r.unlist(sortedKeys), r.unlist(onlyMaxType2Errors), col='black', lty='dotted' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcType1Errors), col='red' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcType2Errors), col='red', lty='dotted' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType1Errors), col='green' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType2Errors), col='green', lty='dotted' )
            rpy1.legend('topleft',['BasicMcType1','SeqMcType1','McFdrType1','BasicMcType2','SeqMcType2','McFdrType2'],col=['black','red','green','black','red','green'],lty=[1,1,1,2,2,2])
            #r('dev.off()')
            classificationErrorPlotStaticFile.closeRFigure()
            print classificationErrorPlotStaticFile.getLink('View Type 1/2 error plot') + ' as function of number of true H1.', '<br>'

            #Classification errors
            onlyMaxAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(onlyMaxType1Errors, onlyMaxType2Errors)]
            seqMcAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(seqMcType1Errors, seqMcType2Errors)]
            mcFdrAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(mcFdrType1Errors, mcFdrType2Errors)]
            
            accuracyPlotStaticFile = GalaxyRunSpecificFile(['accuracy.png'],galaxyFn)
            accuracyPlotStaticFile.openRFigure()
            yMax = 0.2 #just set ad hoc here..
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxAccuracy), ylim=r.unlist([0,yMax]), type='l', xlab='Number of true H1s', ylab='Accuracy',col='black' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcAccuracy), col='red' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrAccuracy), col='green' )
            rpy1.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green'],lty=[1,1,1])
            #r('dev.off()')
            accuracyPlotStaticFile.closeRFigure()
            print accuracyPlotStaticFile.getLink('View accuracy plot') + ' as function of number of true H1.', '<br>'
                        
            #False positive rates
            onlyMaxFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(onlyMaxType1Errors, onlyMaxNumRejected)]
            seqMcFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(seqMcType1Errors, seqMcNumRejected)]
            mcFdrFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(mcFdrType1Errors, mcFdrNumRejected)]
            
            fprPlotStaticFile = GalaxyRunSpecificFile(['fpr.png'],galaxyFn)
            fprPlotStaticFile.plotRLines(sortedKeys, [onlyMaxFpr, seqMcFpr, mcFdrFpr], legend=['BasicMc','SeqMc','McFdr'])
            print fprPlotStaticFile.getLink('View FPR plot') + ' as function of number of true H1.', '<br>'
 def runBinaryClassificationSuiteEvaluation(self, algorithmNames, predictionTrackNames, 
                 answerTrackNames, regionTrackNames, overlapAnalysisDef, ROCanalysisDef):
     '''Evaluate every algorithm on every test set and build result pages.

     algorithmNames, predictionTrackNames -- parallel lists ordered algorithm by
         algorithm: entry j*nTestSets+i belongs to algorithm j and test set i
     answerTrackNames, regionTrackNames -- one entry per test set
     overlapAnalysisDef, ROCanalysisDef -- analysis definitions handed to
         self._runSingleStatistic for the overlap and the ROC statistics

     For each test set a testset<i>.html file is written, and globalResults.html
     gets the statistics accumulated over all test sets, both as plain sums and
     with every test set given the same weight.

     Returns a list of links: the global results first, then one per test set.
     '''
     overlapKeys = ['Neither','Only1','Only2','Both']
     # Scale used to express each test set's counts as integer fractions of
     # its own length, so that every test set gets the same weight.
     number = 1000000000000
     
     # Number of test sets and number of algorithms to evaluate
     nTestSets = len(answerTrackNames)
     nAlgorithms = len(predictionTrackNames) // nTestSets
     
     statPlot = StatisticPlot()
     globalResultFile = GalaxyRunSpecificFile(['globalResults.html'], self._galaxyFn)
     
     # Initialize the global result lists, which collect local results across all test sets
     resultFiles = []
     globalOverlapResults = []
     globalEqOverlapResults = []
     globalRocResults = []
     tmpAlgorithmNames = []
     for j in range(nAlgorithms):
         # Keep one name per algorithm (the name of its first test set entry)
         tmpAlgorithmNames.append(algorithmNames[j*nTestSets])
         globalOverlapResults.append(OrderedDict(zip(overlapKeys, (0,0,0,0))))
         globalEqOverlapResults.append(OrderedDict(zip(overlapKeys, (0,0,0,0))))
         globalRocResults.append({'Result': []})
     
     algorithmNames = tmpAlgorithmNames
     
     # For all test sets...
     for i in range(nTestSets):
         # Create a result file for this test set
         resultFile = GalaxyRunSpecificFile(['testset%d.html' % i], self._galaxyFn)
         localOverlapResults = []
         localRocResults = []
         answerTrackName = answerTrackNames[i]
         regionTrackName = regionTrackNames[i]
         
         # Evaluate the predictions for every algorithm for this test set
         for j in range(nAlgorithms):
             predictionTrackName = predictionTrackNames[(j*nTestSets)+i]
             
             # Run statistics to compute overlap and ROC values
             localOverlapResult = self._runSingleStatistic(regionTrackName, overlapAnalysisDef,
                                 predictionTrackName, answerTrackName)
             
             if self._isRocCurveCompatible(predictionTrackName):
                 localRocResult = self._runSingleStatistic(regionTrackName, ROCanalysisDef,
                                                 predictionTrackName, answerTrackName)
             else:
                 localRocResult = None
             
             # Collect the local results and add them to the global results
             localOverlapResults.append(localOverlapResult)
             localRocResults.append(localRocResult)
             
             testSetLength = sum([localOverlapResult[key] for key in overlapKeys])
             
             for key in overlapKeys:
                 globalOverlapResults[j][key] = globalOverlapResults[j][key] + localOverlapResult[key]
                 # A test set of length zero contributes nothing; skipping it
                 # avoids a division by zero.
                 if testSetLength != 0:
                     globalEqOverlapResults[j][key] = globalEqOverlapResults[j][key] + long(localOverlapResult[key]*number)/testSetLength
             
             if localRocResult is not None:
                 globalRocResults[j]['Result'] = globalRocResults[j]['Result'] + localRocResult['Result']
         
         # Create statistics for this test set
         localStatisticsLink = statPlot.createBinaryClassificationStatistics(i, 
                 algorithmNames, localOverlapResults, self._galaxyFn, 'Benchmark statistics')
         
         totalPositives, totalNegatives = self._getTotalNegativesAndPositivesFromOverlapResults(localOverlapResults)
     
         localRocCurveLink = statPlot.createROCCurve(i, algorithmNames, 
                         totalPositives, totalNegatives, localRocResults, self._galaxyFn)
         
         # Write statistical information for this test set to file
         resultFile.writeTextToFile('%s</br>%s' % (localStatisticsLink, localRocCurveLink), 'w')
         resultFiles.append(resultFile)
     
     # Create statistics for all test sets; the indices nTestSets and
     # nTestSets+1 follow directly after those used for the single test sets
     globalStatisticsLink = statPlot.createBinaryClassificationStatistics(nTestSets, 
                 algorithmNames, globalOverlapResults, self._galaxyFn, 
                 'Benchmark statistics (sum, longer test set has higher weight)')
     
     globalEqStatisticsLink = statPlot.createBinaryClassificationStatistics(nTestSets+1, 
                 algorithmNames, globalEqOverlapResults, self._galaxyFn, 
                 'Benchmark statistics (same weight for each test set)')
     
     totalPositives, totalNegatives = self._getTotalNegativesAndPositivesFromOverlapResults(globalOverlapResults)
     
     globalRocCurveLink = statPlot.createROCCurve(nTestSets, algorithmNames, 
                         totalPositives, totalNegatives, globalRocResults, self._galaxyFn)
     
     # Write statistical information for all test sets to file
     globalResultFile.writeTextToFile('%s</br>%s</br>%s' % (globalStatisticsLink, globalEqStatisticsLink, globalRocCurveLink), 'w')
     
     # Global results first, then one link per test set (numbered from 1)
     results = [globalResultFile.getLink('Global results\n\n')]
     for i, resultFile in enumerate(resultFiles):
         results.append(resultFile.getLink('Test set %d' % (i+1)))
     
     return results
# Example #26
# 0
    def singleSimulation(self, numH0, numH1, replicateIndex, verbose=False):
        '''Run one simulated collection of tests until all tests are determined.

        numH0, numH1 -- passed on to MultipleTestCollection
        replicateIndex -- replication number, used in output file paths and link texts
        verbose -- not used

        If self._galaxyFn is set, the raw p/q-values, a p/q-value figure and a
        numSamples figure are written as run-specific files, and links to them
        are appended to the cat.html catalog of this numH1.

        Returns (total number of samples, total number rejected, classification summaries).
        '''
        collection = MultipleTestCollection(numH0, numH1, self._maxNumSamples, self._h, self._fdrThreshold,self._a,self._b)
        collection.addSamples(self.NUM_SAMPLES_INITIALLY)
        while True:
            if collection.allTestsAreDetermined():
                break
            collection.addSamples(self.NUM_SAMPLES_PER_CHUNK)

        # Sampling is over, so switch to the threshold used after the
        # computations are finished: it affects the final rejection/acceptance,
        # but not when sampling stopped.
        collection.setFdrThresholdAtAllCounters(self._postFdrThreshold)

        if self._galaxyFn is not None:
            scheme = 'McFdr'
            if self._h is None:
                scheme = 'Basic'
            elif self._fdrThreshold is None:
                scheme = 'Sequential'
            pathPrefix = [scheme, str(numH1), str(replicateIndex)]

            def renderFigure(fileName, drawFigure, linkText):
                # Draw into an R figure tied to this run and return a link to it.
                figureFile = GalaxyRunSpecificFile(pathPrefix + [fileName], self._galaxyFn)
                figureFile.openRFigure()
                drawFigure()
                figureFile.closeRFigure()
                return figureFile.getLink(linkText) + '<br>'

            rawFile = GalaxyRunSpecificFile(pathPrefix + ['PandQvals.txt'], self._galaxyFn)
            collection.writeAllPandQVals(rawFile.getFile())
            rawLink = rawFile.getLink('Raw p and q-vals')
            rawLink += ' under %s scheme with %i true H1, (replication %i)' % (scheme, numH1, replicateIndex)

            pqFigLink = renderFigure('PandQvals.png', collection.makeAllPandQValsFigure, ' (p/q-figure) ')
            numSamplesFigLink = renderFigure('NumSamples.png', collection.makeNumSamplesFigure, ' (numSamples-figure) ')

            # Appended, so the catalog of this numH1 collects all schemes and replications.
            catalogFile = GalaxyRunSpecificFile([str(numH1),'cat.html'], self._galaxyFn)
            catalogFile.writeTextToFile(rawLink + pqFigLink + numSamplesFigLink, mode='a')

        return collection.getTotalNumSamples(), collection.getTotalNumRejected(), collection.getClassificationSummaries()
 def getGeneIdStaticFileWithContent(self):
     '''Write the ids of all intersected reference bins to allGeneIds.txt.

     The id of a bin is the part of its val before the first '|'. One id is
     written per line. Returns the run-specific file object.
     '''
     refBins = self.getIntersectedReferenceBins()
     geneIdFile = GalaxyRunSpecificFile(['allGeneIds.txt'], self._galaxyFn)
     geneIds = []
     for refBin in refBins:
         binValue = str(refBin.val)
         geneIds.append(binValue.split('|')[0])
     fileContent = os.linesep.join(geneIds) + os.linesep
     geneIdFile.writeTextToFile(fileContent)
     return geneIdFile
 def getIntersectedRegionsStaticFileWithContent(self):
     '''Write all intersected reference bins to intersected_regions.bed.

     Returns the run-specific file object of the written bed file.
     '''
     regions = self.getIntersectedReferenceBins()
     bedFile = GalaxyRunSpecificFile(['intersected_regions.bed'], self._galaxyFn)
     bedPath = bedFile.getDiskPath()
     self.writeRegionListToBedFile(regions, bedPath)
     return bedFile
    def execute(cls, choices, galaxyFn=None, username=''):
        from quick.application.GalaxyInterface import GalaxyInterface

        fileformat = choices[9];
        outputFile = open(galaxyFn, "w")
        
        if fileformat == "html":
            print GalaxyInterface.getHtmlBeginForRuns(galaxyFn)
            print GalaxyInterface.getHtmlForToggles(withRunDescription=False)
            t = calendar.timegm(time.gmtime())
            htmlfile = GalaxyRunSpecificFile(["css", str(t)], galaxyFn);


        genome = choices[0]
        track1 = choices[1].split(":")
        track2 = choices[2].split(":")
        tn1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, track1)
        tn2 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, track2)

        compare = choices[3] != "Count individual SNP-differences in window"
        if choices[4] == "Classical MDS":
            mds = 0;
        elif choices[4] == "SMACOF":
            mds = 1;
        else:
            mds = 2;
        windowSize = int(choices[5])
        windowStep = int(choices[6])
        
        mcTreshold = int(choices[7])
        mcRuns = int(choices[8])

        outputFile.write("#seqid\tstart\tscore\tp\n")
        if fileformat == "html":
            text = "#seqid\tstart\tscore\tp\n";

	print "chrs:"+str(GenomeInfo.getChrList(genome))
        reg = "*"
        bins = "*"
        analysisDef = "Dummy: dummy name ([wStep=%g] [wSize=%s] [func=%s] [mds=%s] [mcT=%s] [mcR=%s])-> CategoryClusterSeparationStat" % (windowStep, windowSize, compare, mds, mcTreshold, mcRuns)
        userBinSource = GalaxyInterface._getUserBinSource(reg, bins, genome)
        result = GalaxyInterface.runManual([tn1, tn2], analysisDef, reg, bins, genome, galaxyFn=galaxyFn)
        for key in result.getAllRegionKeys():
            chrom = str(key).split(":")[0];
            r = result[key];
            if 'Result' not in r.keys():
                print "skipping chr:", chrom, r;
                continue;
            r = r['Result'];
            scores = r[0];
            stddev = r[1];
            for i in range(len(scores)):
                if scores[i] != 0:
                    pos = i*windowStep;
                    if fileformat == "tabular":
                        outputFile.write("%s\t%s\t%s\t%s\n" % (str(chrom), pos, str(scores[i]), str(stddev[i])))
                    else:
                        text += "%s\t%s\t%s\t%s\n" % (str(chrom), pos, str(scores[i]), str(stddev[i]));
        if fileformat == "html":
            htmlfile.writeTextToFile(text);
            print htmlfile.getLink("Result file");
            print GalaxyInterface.getHtmlEndForRuns()
        
        outputFile.close();
    def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))

        assert genome in ['mm9','hg18'] #other genomes not supported. TF id links do not specify genome for pre-selection of analysis
        
        #if tfSource == 'UCSC tfbs conserved':
        #    tfTrackName = ['Gene regulation','TFBS','UCSC prediction track']
        #else:
        #    raise
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]
                
        #Get gene track
        #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )
        
        if not (upFlankSize == downFlankSize == 0):            
            unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed'
            flankedGeneRegsTempFn  = uniqueWebPath + os.sep + 'flankedGeneRegs.bed'
            geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
            #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
            GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, unflankedGeneRegsTempFn )
            GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn, flankedGeneRegsTempFn, genome, upFlankSize, downFlankSize)
            #flankedGeneRegsExternalTN = ['external'] +galaxyId +  [flankedGeneRegsTempFn]
            regSpec, binSpec = 'file', flankedGeneRegsTempFn
        else:
            regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)

        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        #trackName1 = tfTrackName
        #
        #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\
        #          '[tf1:=SegmentToStartPointFormatConverter:]'+\
        #          '-> FreqByCatStat'
        ##assert len(ensembleGeneIdList)==1
        ##geneId = ensembleGeneIdList[0]
        #
        #print '<div class="debug">'        
        #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome)
        #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run()
        #
        #print res        
        ##GalaxyInterface._viewResults([res], galaxyFn)
        #print '</div>'
        tfs = res.getResDictKeys()
        
        genesPlural = 's' if len(ensembleGeneIdList)>1 else ''
        tfsPlural = 's' if len(tfs)!=1 else ''
        print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % (len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList), tfSource)
        
        expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else ''                

        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'],galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>'%(URL_PREFIX, genome, quote(':'.join(tfTrackName+[tf])), tf) for tf in tfs]))
        #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs]))
        print '<p>', idHtmlFileNamer.getLink('Inspect html file'), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (expansionStr, genesPlural)

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'],galaxyFn)
        idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
        print '<p>', idFileNamer.getLink('Inspect text file'), ' listing all TF IDs occurring 1 or more times within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural)
    
        extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],galaxyFn)
        GalaxyInterface.extractTrackManyBins(genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath())
        print '<p>', extractedTfbsFileNamer.getLink('Inspect bed-file'), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural)
        
        #idFile = idFileNamer.getFile()
        #idFile.write(', '.join([str(bin.val) for bin in targetBins if res[bin][resDictKey]>0]) + os.sep)
        #idFile.close()
        
        #print idFileNamer.getLink('Text file'), ' of TF IDs'
        
        #GalaxyInterface.run(tfTrackName, tcGeneRegsExternalTN, analysisDef, regSpec, binSpec, genome, galaxyFn)
        #GalaxyInterface.run(':'.join(tfTrackName), ':'.join(tcGeneRegsExternalTN), analysisDef, regSpec, binSpec, genome, galaxyFn)