Exemple #1
0
    def singleSimulation(self, numH0, numH1, replicateIndex, verbose=False):
        '''Run one simulated replicate and return its sample/rejection summary.

        A MultipleTestCollection is built from numH0, numH1 and this object's
        settings, given NUM_SAMPLES_INITIALLY samples, and then extended by
        NUM_SAMPLES_PER_CHUNK samples at a time until allTestsAreDetermined()
        holds. When self._galaxyFn is set, the raw p/q-values, a p/q-value
        figure and a number-of-samples figure are written as run-specific
        files, and links to them are appended to the per-numH1 'cat.html'.

        verbose is accepted but not used by the active code.

        Returns the tuple (total number of samples, total number of rejected
        tests, classification summaries) as reported by the collection.
        '''
        collection = MultipleTestCollection(
            numH0, numH1, self._maxNumSamples, self._h,
            self._fdrThreshold, self._a, self._b)

        collection.addSamples(self.NUM_SAMPLES_INITIALLY)
        while True:
            if collection.allTestsAreDetermined():
                break
            collection.addSamples(self.NUM_SAMPLES_PER_CHUNK)

        # Sampling is finished, so switch to the threshold that applies to the
        # final accept/reject decision only (it no longer influences stopping).
        collection.setFdrThresholdAtAllCounters(self._postFdrThreshold)

        galaxyFn = self._galaxyFn
        if galaxyFn is not None:
            # The scheme label follows from which settings are absent.
            if self._h is None:
                schemeName = 'Basic'
            else:
                schemeName = 'Sequential' if self._fdrThreshold is None else 'McFdr'

            def runFile(leafName):
                # Run-specific file keyed by scheme, numH1 and replicate.
                return GalaxyRunSpecificFile(
                    [schemeName, str(numH1), str(replicateIndex), leafName], galaxyFn)

            rawFile = runFile('PandQvals.txt')
            collection.writeAllPandQVals(rawFile.getFile())
            rawLink = rawFile.getLink('Raw p and q-vals') + \
                ' under %s scheme with %i true H1, (replication %i)' % (schemeName, numH1, replicateIndex)

            pqFigure = runFile('PandQvals.png')
            pqFigure.openRFigure()
            collection.makeAllPandQValsFigure()
            pqFigure.closeRFigure()
            pqFigureLink = pqFigure.getLink(' (p/q-figure) ') + '<br>'

            samplesFigure = runFile('NumSamples.png')
            samplesFigure.openRFigure()
            collection.makeNumSamplesFigure()
            samplesFigure.closeRFigure()
            samplesFigureLink = samplesFigure.getLink(' (numSamples-figure) ') + '<br>'

            # One catalogue page per numH1; every replicate appends its links.
            catalog = GalaxyRunSpecificFile([str(numH1), 'cat.html'], galaxyFn)
            catalog.writeTextToFile(rawLink + pqFigureLink + samplesFigureLink, mode='a')

        return (collection.getTotalNumSamples(),
                collection.getTotalNumRejected(),
                collection.getClassificationSummaries())
Exemple #2
0
    def compareCutoffSchemes(maxNumSamples, h, fdrThreshold, totalNumTests, stepSize, numReplications,a,b, galaxyFn=None):
        print '<PRE>'
        print 'Comparing cutoff schemes with parameters: maxNumSamples=%i, h=%i, fdrThreshold=%.2f, totalNumTests=%i, numReplications=%i' % (maxNumSamples, h, fdrThreshold, totalNumTests, numReplications)
        print 'stepSize: ',stepSize
        print 'H1 p-values drawn from beta with a=%.3f and b=%.3f' % (a,b)
        print 'Minimum achieveable p-value is %.5f, which gives minimum Bonferroni-corrected p-value of %.5f (compares to a fdr threshold of %.2f)' % (1.0/maxNumSamples, (1.0/maxNumSamples)*totalNumTests, fdrThreshold)
        
        #estimate time use:
        prevTime= time.time()
        Simulator(maxNumSamples, None, None,a,b,fdrThreshold).numSamplesAsFunctionOfNumH1( 1, 1, 1)
        baseMeasure = time.time() - prevTime
        if type(stepSize)==int:
            numSteps = len(range(0,totalNumTests+1,stepSize))
        elif type(stepSize)==list:
            numSteps = len(stepSize)
        withOnlyMaxNumEstimate = baseMeasure * totalNumTests * numSteps * numReplications
        #print 'Estimated running time: between %i and %i seconds.' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate*3)
        print 'Estimated running time: around %i seconds. (%.1f hours)' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate/3600.0)
        
        sortedKeys, onlyMaxCutoff, onlyMaxNumRejected, onlyMaxType1Errors, onlyMaxType2Errors = Simulator(maxNumSamples, None, None,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1( totalNumTests, stepSize, numReplications)
        sortedKeys, seqMcCutoff, seqMcNumRejected, seqMcType1Errors, seqMcType2Errors  = Simulator(maxNumSamples, h, None,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications)
        sortedKeys, mcFdrCutoff, mcFdrNumRejected, mcFdrType1Errors, mcFdrType2Errors  = Simulator(None, h, fdrThreshold,a,b,fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications)
        maxY = max( max(s) for s in [onlyMaxCutoff, seqMcCutoff, mcFdrCutoff])
        #minY = min( min(s) for s in [onlyMaxCutoff, seqMcCutoff, McFdrCutoff])
        minY=0

        print 'Time spent: ',time.time() - prevTime, ' secs'
        print '</PRE>'
        
        #plotStaticFile.getDiskPath(True)
        if galaxyFn is not None:
            #print 'Generating aggregate McFdr simulation figures'
            plotStaticFile = GalaxyRunSpecificFile(['mainPlot.png'],galaxyFn)
            if type(stepSize) is int:
                allNumH1s = range(0,totalNumTests+1,stepSize)
            elif type(stepSize) is list:
                allNumH1s = stepSize
            for numH1 in allNumH1s:
                catalogStaticFile = GalaxyRunSpecificFile([str(numH1),'cat.html'], galaxyFn)
                print catalogStaticFile.getLink( 'Tests with #True H1s=%i' % numH1 ), '<br>'

            #plotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxCutoff), ylim=r.unlist([minY,maxY]), type='l', xlab='Number of true H1s', ylab='Total MC samples' , col='black')
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcCutoff), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrCutoff), col='green' )
            #r.legend('topleft',['BasicMc','SeqMc','McFdr'],col=['black','red','green'],lty=1)
            plotStaticFile.plotRLines(sortedKeys, [onlyMaxCutoff,seqMcCutoff,mcFdrCutoff], xlab='Number of true H1s', ylab='Total MC samples', legend=['BasicMc','SeqMc','McFdr'])
            #r('dev.off()')
            #plotStaticFile.closeRFigure()

            print plotStaticFile.getLink('View main plot') + ' of sumSamples as function of #H1s.', '<br>'

            numRejectedPlotStaticFile = GalaxyRunSpecificFile(['secondaryPlot.png'],galaxyFn)
            numRejectedPlotStaticFile.plotRLines(sortedKeys, [onlyMaxNumRejected,seqMcNumRejected,mcFdrNumRejected],xlab='Number of true H1s', ylab='Num rejected tests',legend=['BasicMc','SeqMc','McFdr'])
            #numRejectedPlotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxNumRejected), ylim=r.unlist([0,totalNumTests]), type='l', xlab='Number of true H1s', ylab='Num rejected tests',col='black' )
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcNumRejected), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrNumRejected), col='green' )
            #r.lines(r.unlist(sortedKeys), r.unlist(sortedKeys), col='black', lty='dotted' ) #As this corresponds to perfect estimation..
            #r.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green','black'],lty=[1,1,1,2])
            #r('dev.off()')
            #numRejectedPlotStaticFile.closeRFigure()
            print numRejectedPlotStaticFile.getLink('View secondary plot') + ' of #true H1s vs #tests rejected.', '<br>'

            #Classification errors
            classificationErrorPlotStaticFile = GalaxyRunSpecificFile(['errors.png'],galaxyFn)
            classificationErrorPlotStaticFile.openRFigure()
            yMax = max( max(x) for x in [mcFdrType2Errors,mcFdrType1Errors,seqMcType2Errors,seqMcType1Errors,onlyMaxType2Errors,onlyMaxType1Errors ])
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxType1Errors), ylim=r.unlist([0,yMax]), type='l', xlab='Number of true H1s', ylab='Type 1/2 errors',col='black' )
            r.lines(r.unlist(sortedKeys), r.unlist(onlyMaxType2Errors), col='black', lty='dotted' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcType1Errors), col='red' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcType2Errors), col='red', lty='dotted' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType1Errors), col='green' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType2Errors), col='green', lty='dotted' )
            rpy1.legend('topleft',['BasicMcType1','SeqMcType1','McFdrType1','BasicMcType2','SeqMcType2','McFdrType2'],col=['black','red','green','black','red','green'],lty=[1,1,1,2,2,2])
            #r('dev.off()')
            classificationErrorPlotStaticFile.closeRFigure()
            print classificationErrorPlotStaticFile.getLink('View Type 1/2 error plot') + ' as function of number of true H1.', '<br>'

            #Classification errors
            onlyMaxAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(onlyMaxType1Errors, onlyMaxType2Errors)]
            seqMcAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(seqMcType1Errors, seqMcType2Errors)]
            mcFdrAccuracy = [ sum(errors)*1.0/totalNumTests for errors in zip(mcFdrType1Errors, mcFdrType2Errors)]
            
            accuracyPlotStaticFile = GalaxyRunSpecificFile(['accuracy.png'],galaxyFn)
            accuracyPlotStaticFile.openRFigure()
            yMax = 0.2 #just set ad hoc here..
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxAccuracy), ylim=r.unlist([0,yMax]), type='l', xlab='Number of true H1s', ylab='Accuracy',col='black' )
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcAccuracy), col='red' )
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrAccuracy), col='green' )
            rpy1.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green'],lty=[1,1,1])
            #r('dev.off()')
            accuracyPlotStaticFile.closeRFigure()
            print accuracyPlotStaticFile.getLink('View accuracy plot') + ' as function of number of true H1.', '<br>'
                        
            #False positive rates
            onlyMaxFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(onlyMaxType1Errors, onlyMaxNumRejected)]
            seqMcFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(seqMcType1Errors, seqMcNumRejected)]
            mcFdrFpr= [ float(fp)/pos if pos!=0 else 0 for fp,pos in zip(mcFdrType1Errors, mcFdrNumRejected)]
            
            fprPlotStaticFile = GalaxyRunSpecificFile(['fpr.png'],galaxyFn)
            fprPlotStaticFile.plotRLines(sortedKeys, [onlyMaxFpr, seqMcFpr, mcFdrFpr], legend=['BasicMc','SeqMc','McFdr'])
            print fprPlotStaticFile.getLink('View FPR plot') + ' as function of number of true H1.', '<br>'
    def execute(choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.gtr
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.

        NOTE: the tool is currently overridden. The method prints the result of
        analyzeSampleNumAccuracy three times for each of 100, 1000 and 10000
        samples and then returns unconditionally; choices, galaxyFn and
        username are not used on that path. Everything after the return is
        unreachable and is kept as the parked original implementation, which
        reads choices as [pVal, minNumSamples, maxNumSamples, chunkSize,
        numTests, pValEstimation] and plots estimated p-value distributions.
        '''
        print 'temporarily overriding tool, running McFdr2 simulation..'
        from test.sandbox.aux.McFdr2 import analyzeSampleNumAccuracy
        for numSamples in [100,1000,10000]:
            print ''
            # Trailing commas keep the three results on the same output line.
            print 'numSamples %s: ' % numSamples,
            for i in range(3):
                print analyzeSampleNumAccuracy(numSamples),
        # Unconditional return: all code below is currently unreachable.
        return
    
    
        from gold.application.RSetup import r
        from numpy import array,minimum
        # All choices except the last are numeric; they are parsed as floats.
        pVal,minNumSamples,maxNumSamples,chunkSize,numTests = [float(x) for x in choices[:-1]]
        print 'pVal:%.2f, minNumSamples:%i, maxNumSamples:%i, chunkSize:%i, numTests:%i' % (pVal,minNumSamples,maxNumSamples,chunkSize,numTests)
        
        # The sampling loop below steps from minNumSamples in chunkSize
        # increments and must land exactly on maxNumSamples.
        assert (maxNumSamples-minNumSamples)%chunkSize == 0
        assert numTests == 1 #More not yet supported. Should in McFdr be something like the min-max, i.e. the minimum across iterations of the maximum p-value across tests..
        
        # Last choice selects the p-value estimator applied to k extreme
        # outcomes among n samples: 'Davison' -> (k+1)/(n+1), 'ML' -> k/n.
        pValEstimation = choices[-1]
        assert pValEstimation in ['Davison','ML']
        if pValEstimation=='Davison':
            pFunc = lambda k,n:1.0*(k+1)/(n+1)
        else:
            pFunc = lambda k,n:1.0*(k)/n
            
        # Number of simulated replicates.
        numRepl = 10**4
        # P-value estimates from binomial draws at the fixed minimum and
        # maximum sample sizes.
        stdAtMin = [pFunc(k,minNumSamples) for k in r.rbinom(numRepl,minNumSamples,pVal)]
        stdAtMax = [pFunc(k,maxNumSamples) for k in r.rbinom(numRepl,maxNumSamples,pVal)]
        
        # Chunked sampling: per replicate, keep the smallest p-value estimate
        # seen at any sample size from minNumSamples up to maxNumSamples,
        # accumulating extreme counts chunk by chunk.
        mcFdrBestPVals = array([1.0]*numRepl)
        mcFdrSamples = minNumSamples #array([minNumSamples]*numRepl)
        mcFdrExtremes = array(r.rbinom(numRepl,minNumSamples,pVal))
        while mcFdrSamples<maxNumSamples:
            tempMcFdrPVals = pFunc(mcFdrExtremes,mcFdrSamples)
            mcFdrBestPVals = minimum(mcFdrBestPVals,tempMcFdrPVals)
            
            mcFdrSamples += chunkSize
            mcFdrExtremes += array(r.rbinom(numRepl,chunkSize,pVal))
        # Include the estimate at the final sample size as well.
        tempMcFdrPVals = pFunc(mcFdrExtremes,mcFdrSamples)
        mcFdrBestPVals = minimum(mcFdrBestPVals,tempMcFdrPVals)
        assert mcFdrSamples == maxNumSamples
        
        print 'Mean values<br>'
        print 'AtMin:%.7f, AtMax:%.7f, McFdr:%.7f' % tuple([array(x).mean() for x in [stdAtMin, stdAtMax, mcFdrBestPVals]])
        
        # Histogram breaks: 101 evenly spaced points covering [0, 2*pVal],
        # plus a final break at 1.0.
        breaks = [pVal*2*x/100.0 for x in range(0,101)] +[1.0]
        
        histRes = r.hist(stdAtMin,breaks=breaks,plot=False)
        xVals = histRes['mids']
        yValsStdAtMin = histRes['density']

        # All three histograms share the same breaks, so their bin midpoints
        # must agree; the densities are then plotted against a common x axis.
        histRes = r.hist(stdAtMax,breaks=breaks,plot=False)
        assert xVals == histRes['mids']
        yValsStdAtMax = histRes['density']

        histRes = r.hist(mcFdrBestPVals,breaks=breaks,plot=False)
        assert xVals == histRes['mids']
        yValsMcFdr = histRes['density']
        
        # Plot the three density curves and mark pVal with a dotted
        # vertical line.
        staticFile = GalaxyRunSpecificFile(['pDistr.png'],galaxyFn)
        staticFile.openRFigure()
        staticFile.plotRLines(xVals, [yValsStdAtMin, yValsStdAtMax, yValsMcFdr],alsoOpenAndClose=False,xlab='p-value',ylab='density',xlim=[0,2*pVal])
        r.abline(v=pVal,lty='dotted',col='yellow')
        staticFile.closeRFigure()
        print staticFile.getLink('View estimated pval distribution')