Example #1
    def run(self, analysis, selectedTasks, **kwargs):
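        '''Generate the requested analysis plot(s) for the selected tasks.

        analysis selects the plot type (average score per grade, score
        distribution per grade, histogram, or a smoothed line plot) and
        selectedTasks lists the task names to include. Extra plot options
        (bins, displayPoints, spar, verticalLines) are passed via kwargs and
        are used by the smoothed line plots. The URLs and disk paths of the
        generated plots are appended to self._plotUrls and self._plotPaths.
        '''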

        taskScoreData = self._examResults.getTaskList()
        studentsData = self._examResults.getStudentList()

        if len(taskScoreData) == 0 or len(studentsData) == 0:
            print 'Empty data'
            return None

        examMaxScore = 0.0
        for task in taskScoreData:
            examMaxScore += task.getMaxScore()

        rawDataDict = self.getTaskScoresPerGradeRawData(
            taskScoreData, studentsData)
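        # rawDataDict: grade -> task name -> list of task scores, used by the
        # per-grade analyses below.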
        if analysis == self.ANALYSIS_AVG_SCORE_PER_GRADE_LINE_PLOT:
            plotDataDict = self.getPlotDataFromRawData(rawDataDict,
                                                       selectedTasks,
                                                       examMaxScore)
            maxPercentage = 10
            for taskScores in plotDataDict.values():
                if max(taskScores) > maxPercentage:
                    maxPercentage = max(taskScores)
            plotOutput = GalaxyRunSpecificFile(
                ['IndividualTaskAnalysis', 'lineplot.png'], self._galaxyFN)
            plotOutput.openRFigure(h=600, w=720)
            xTitle = 'Task'
            yTitle = 'Avg score per grade in percentage from total exam score'
            RPlotUtil.drawLineplot(plotDataDict, analysis, selectedTasks,
                                   maxPercentage, xTitle, yTitle)
            RPlotUtil.rDevOff()
            self._plotUrls.append(plotOutput.getURL())
            self._plotPaths.append(plotOutput.getDiskPath())

        elif analysis == self.ANALYSIS_SCORE_DISTRIBUTION_PER_GRADE:
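            # One violin plot per selected task, showing the distribution of
            # task scores for each grade.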
            for taskName in selectedTasks:
                plotOutput = GalaxyRunSpecificFile(
                    ['IndividualTaskAnalysis', taskName + '_vioplot.png'],
                    self._galaxyFN)
                plotOutput.openRFigure(h=600, w=720)
                task = self._examResults.getTask(taskName)
                maxScore = task.getMaxScore()
                plotDataMatrix = []
                for grade in self.GRADES:
                    plotDataMatrix.append(rawDataDict[grade][taskName])

                mainTitle = 'Task: ' + taskName + ' - Vioplot of task scores per grade'
                xTitle = 'Grade'
                yTitle = 'Percentage score (from max task score)'
                vioplotColor = 'magenta'
                xAxisAt = [x + 1 for x in range(len(self.GRADES))]
                xLimMin = 0
                xLimMax = len(self.GRADES) + 1
                xLas = 1
                from math import ceil
                yAxisAt = range(0, int(ceil(maxScore)))
                yLimMin = 0
                yLimMax = maxScore
                yLas = 1

                # plot() is used to set up the chart for the vioplot so that the x and y axis labels can be controlled
                RPlotUtil.drawVioplot(plotDataMatrix, self.GRADES, mainTitle,
                                      xTitle, yTitle, vioplotColor, xAxisAt,
                                      xLimMin, xLimMax, xLas, yAxisAt, yLimMin,
                                      yLimMax, yLas)

                RPlotUtil.rDevOff()
                self._plotUrls.append(plotOutput.getURL())
                self._plotPaths.append(plotOutput.getDiskPath())
        elif analysis == self.ANALYSIS_HISTOGRAM_PLOT:
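            # Overlaid histograms of the percentage scores of all selected tasks.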
            data = []
            for taskName in selectedTasks:
                task = self._examResults.getTask(taskName)
                data.append(task.getPercentScoresList())

            plotOutput = GalaxyRunSpecificFile(
                ['IndividualTaskAnalysis', 'histogram.png'], self._galaxyFN)
            plotOutput.openRFigure(h=600, w=720)

            mainTitle = 'Task scores distribution'
            xTitle = 'Task scores 0%-100%'
            yTitle = 'Score count (per bin)'
            colors = RPlotUtil.getRainbowColors(len(selectedTasks))
            RPlotUtil.drawMultiHistogram(data,
                                         mainTitle,
                                         xTitle,
                                         yTitle,
                                         names=selectedTasks,
                                         colors=colors,
                                         hasLegend=True)

            RPlotUtil.rDevOff()
            self._plotUrls.append(plotOutput.getURL())
            self._plotPaths.append(plotOutput.getDiskPath())

        else:
            # Remaining analyses: bin-average or moving-average smoothed line plot.
            # Sanity check:
            examMaxScore = self._examResults.getExamMaxScore()
            if examMaxScore == 0:
                raise InvalidDataException(
                    'Exam max score must be larger than 0. Most probably no tasks were defined in the results input file.'
                )

            bins = int(kwargs['bins'])
            displayPoints = bool(kwargs['displayPoints'])
            spar = float(kwargs['spar'])
            verticalLines = kwargs['verticalLines']

            colors = RPlotUtil.getRainbowColors(len(selectedTasks))
            plotOutput = GalaxyRunSpecificFile(
                ['IndividualTaskAnalysis', 'smoothed_line_plot.png'],
                self._galaxyFN)
            plotOutput.openRFigure(h=600, w=720)
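            # x values: each student's total exam score as a percentage.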
            xData = []
            students = self._examResults.getStudentList()
            for student in students:
                xData.append(
                    self._examResults.getExamScorePercentageForStudent(
                        student.getStudentNr()))

            mainTitle = 'Smoothed line plot - task score vs exam score'
            if analysis == self.ANALYSIS_BIN_AVG_SMOOTHED_PLOT:
                mainTitle += ' (Bin average)'
            else:
                mainTitle += ' (Moving average)'
            xTitle = 'Exam score (%)'
            yTitle = 'Task score (%)'
            xLim = [0, 100]
            yLim = [0, 100]
            RPlotUtil.drawEmptyPlot(xTitle, yTitle, mainTitle, xLim, yLim)
            RPlotUtil.drawLegend('topleft', selectedTasks, colors)

            for taskName, col in zip(selectedTasks, colors):
                task = self._examResults.getTask(taskName)
                yData = []
                for student in students:
                    yData.append(
                        task.getPercentageScore(student.getStudentNr()))

                if analysis == self.ANALYSIS_BIN_AVG_SMOOTHED_PLOT:
                    RPlotUtil.drawBinnedSmoothedLinePlot(
                        xData,
                        yData,
                        col=col,
                        bins=bins,
                        displayPoints=displayPoints,
                        spar=spar)
                else:  # moving-average smoothed line plot
                    RPlotUtil.drawMovingAvgSmoothedLinePlot(
                        xData,
                        yData,
                        col,
                        displayPoints=displayPoints,
                        spar=spar)

            if verticalLines:
                for verticalLine in verticalLines:
                    RPlotUtil.drawVerticalLine(verticalLine)

            RPlotUtil.rDevOff()
            self._plotUrls.append(plotOutput.getURL())
            self._plotPaths.append(plotOutput.getDiskPath())
Example #2
    def compareCutoffSchemes(maxNumSamples,
                             h,
                             fdrThreshold,
                             totalNumTests,
                             stepSize,
                             numReplications,
                             a,
                             b,
                             galaxyFn=None):
        print '<PRE>'
        print 'Comparing cutoff schemes with parameters: maxNumSamples=%i, h=%i, fdrThreshold=%.2f, totalNumTests=%i, numReplications=%i' % (
            maxNumSamples, h, fdrThreshold, totalNumTests, numReplications)
        print 'stepSize: ', stepSize
        print 'H1 p-values drawn from beta with a=%.3f and b=%.3f' % (a, b)
        print 'Minimum achievable p-value is %.5f, which gives minimum Bonferroni-corrected p-value of %.5f (compares to an FDR threshold of %.2f)' % (
            1.0 / maxNumSamples,
            (1.0 / maxNumSamples) * totalNumTests, fdrThreshold)

        #Estimate running time by timing one minimal simulation run and scaling it up:
        prevTime = time.time()
        Simulator(maxNumSamples, None, None, a, b,
                  fdrThreshold).numSamplesAsFunctionOfNumH1(1, 1, 1)
        baseMeasure = time.time() - prevTime
        if type(stepSize) == int:
            numSteps = len(range(0, totalNumTests + 1, stepSize))
        elif type(stepSize) == list:
            numSteps = len(stepSize)
        withOnlyMaxNumEstimate = baseMeasure * totalNumTests * numSteps * numReplications
        #print 'Estimated running time: between %i and %i seconds.' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate*3)
        print 'Estimated running time: around %i seconds. (%.1f hours)' % (
            withOnlyMaxNumEstimate, withOnlyMaxNumEstimate / 3600.0)
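        # Compare the three cutoff schemes: basic MC (fixed maxNumSamples),
        # sequential MC (early stopping controlled by h), and McFdr (stopping
        # also guided by the FDR threshold).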

        sortedKeys, onlyMaxCutoff, onlyMaxNumRejected, onlyMaxType1Errors, onlyMaxType2Errors = Simulator(
            maxNumSamples, None, None, a, b, fdrThreshold,
            galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize,
                                                  numReplications)
        sortedKeys, seqMcCutoff, seqMcNumRejected, seqMcType1Errors, seqMcType2Errors = Simulator(
            maxNumSamples, h, None, a, b, fdrThreshold,
            galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize,
                                                  numReplications)
        sortedKeys, mcFdrCutoff, mcFdrNumRejected, mcFdrType1Errors, mcFdrType2Errors = Simulator(
            None, h, fdrThreshold, a, b, fdrThreshold,
            galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize,
                                                  numReplications)
        maxY = max(max(s) for s in [onlyMaxCutoff, seqMcCutoff, mcFdrCutoff])
        #minY = min( min(s) for s in [onlyMaxCutoff, seqMcCutoff, McFdrCutoff])
        minY = 0

        print 'Time spent: ', time.time() - prevTime, ' secs'
        print '</PRE>'

        #plotStaticFile.getDiskPath(True)
        if galaxyFn is not None:
            #print 'Generating aggregate McFdr simulation figures'
            plotStaticFile = GalaxyRunSpecificFile(['mainPlot.png'], galaxyFn)
            if type(stepSize) is int:
                allNumH1s = range(0, totalNumTests + 1, stepSize)
            elif type(stepSize) is list:
                allNumH1s = stepSize
            for numH1 in allNumH1s:
                catalogStaticFile = GalaxyRunSpecificFile(
                    [str(numH1), 'cat.html'], galaxyFn)
                print catalogStaticFile.getLink('Tests with #True H1s=%i' %
                                                numH1), '<br>'

            #plotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxCutoff), ylim=r.unlist([minY,maxY]), type='l', xlab='Number of true H1s', ylab='Total MC samples' , col='black')
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcCutoff), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrCutoff), col='green' )
            #r.legend('topleft',['BasicMc','SeqMc','McFdr'],col=['black','red','green'],lty=1)
            plotStaticFile.plotRLines(
                sortedKeys, [onlyMaxCutoff, seqMcCutoff, mcFdrCutoff],
                xlab='Number of true H1s',
                ylab='Total MC samples',
                legend=['BasicMc', 'SeqMc', 'McFdr'])
            #r('dev.off()')
            #plotStaticFile.closeRFigure()

            print plotStaticFile.getLink(
                'View main plot'
            ) + ' of sumSamples as a function of #H1s.', '<br>'

            numRejectedPlotStaticFile = GalaxyRunSpecificFile(
                ['secondaryPlot.png'], galaxyFn)
            numRejectedPlotStaticFile.plotRLines(
                sortedKeys,
                [onlyMaxNumRejected, seqMcNumRejected, mcFdrNumRejected],
                xlab='Number of true H1s',
                ylab='Num rejected tests',
                legend=['BasicMc', 'SeqMc', 'McFdr'])
            #numRejectedPlotStaticFile.openRFigure()
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxNumRejected), ylim=r.unlist([0,totalNumTests]), type='l', xlab='Number of true H1s', ylab='Num rejected tests',col='black' )
            #r.lines(r.unlist(sortedKeys), r.unlist(seqMcNumRejected), col='red' )
            #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrNumRejected), col='green' )
            #r.lines(r.unlist(sortedKeys), r.unlist(sortedKeys), col='black', lty='dotted' ) #As this corresponds to perfect estimation..
            #r.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green','black'],lty=[1,1,1,2])
            #r('dev.off()')
            #numRejectedPlotStaticFile.closeRFigure()
            print numRejectedPlotStaticFile.getLink(
                'View secondary plot'
            ) + ' of #true H1s vs #tests rejected.', '<br>'

            #Classification errors
            classificationErrorPlotStaticFile = GalaxyRunSpecificFile(
                ['errors.png'], galaxyFn)
            classificationErrorPlotStaticFile.openRFigure()
            yMax = max(
                max(x) for x in [
                    mcFdrType2Errors, mcFdrType1Errors, seqMcType2Errors,
                    seqMcType1Errors, onlyMaxType2Errors, onlyMaxType1Errors
                ])
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys),
                   r.unlist(onlyMaxType1Errors),
                   ylim=r.unlist([0, yMax]),
                   type='l',
                   xlab='Number of true H1s',
                   ylab='Type 1/2 errors',
                   col='black')
            r.lines(r.unlist(sortedKeys),
                    r.unlist(onlyMaxType2Errors),
                    col='black',
                    lty='dotted')
            r.lines(r.unlist(sortedKeys),
                    r.unlist(seqMcType1Errors),
                    col='red')
            r.lines(r.unlist(sortedKeys),
                    r.unlist(seqMcType2Errors),
                    col='red',
                    lty='dotted')
            r.lines(r.unlist(sortedKeys),
                    r.unlist(mcFdrType1Errors),
                    col='green')
            r.lines(r.unlist(sortedKeys),
                    r.unlist(mcFdrType2Errors),
                    col='green',
                    lty='dotted')
            rpy1.legend('topleft', [
                'BasicMcType1', 'SeqMcType1', 'McFdrType1', 'BasicMcType2',
                'SeqMcType2', 'McFdrType2'
            ],
                        col=['black', 'red', 'green', 'black', 'red', 'green'],
                        lty=[1, 1, 1, 2, 2, 2])
            #r('dev.off()')
            classificationErrorPlotStaticFile.closeRFigure()
            print classificationErrorPlotStaticFile.getLink(
                'View Type 1/2 error plot'
            ) + ' as a function of the number of true H1s.', '<br>'

            #Overall classification error rate (plotted as 'Accuracy')
            onlyMaxAccuracy = [
                sum(errors) * 1.0 / totalNumTests
                for errors in zip(onlyMaxType1Errors, onlyMaxType2Errors)
            ]
            seqMcAccuracy = [
                sum(errors) * 1.0 / totalNumTests
                for errors in zip(seqMcType1Errors, seqMcType2Errors)
            ]
            mcFdrAccuracy = [
                sum(errors) * 1.0 / totalNumTests
                for errors in zip(mcFdrType1Errors, mcFdrType2Errors)
            ]

            accuracyPlotStaticFile = GalaxyRunSpecificFile(['accuracy.png'],
                                                           galaxyFn)
            accuracyPlotStaticFile.openRFigure()
            yMax = 0.2  #just set ad hoc here..
            #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72)
            r.plot(r.unlist(sortedKeys),
                   r.unlist(onlyMaxAccuracy),
                   ylim=r.unlist([0, yMax]),
                   type='l',
                   xlab='Number of true H1s',
                   ylab='Accuracy',
                   col='black')
            r.lines(r.unlist(sortedKeys), r.unlist(seqMcAccuracy), col='red')
            r.lines(r.unlist(sortedKeys), r.unlist(mcFdrAccuracy), col='green')
            rpy1.legend('topleft', ['BasicMc', 'SeqMc', 'McFdr'],
                        col=['black', 'red', 'green'],
                        lty=[1, 1, 1])
            #r('dev.off()')
            accuracyPlotStaticFile.closeRFigure()
            print accuracyPlotStaticFile.getLink(
                'View accuracy plot'
            ) + ' as a function of the number of true H1s.', '<br>'

            #False positive rates: type 1 errors divided by the number of rejected tests
            onlyMaxFpr = [
                float(fp) / pos if pos != 0 else 0
                for fp, pos in zip(onlyMaxType1Errors, onlyMaxNumRejected)
            ]
            seqMcFpr = [
                float(fp) / pos if pos != 0 else 0
                for fp, pos in zip(seqMcType1Errors, seqMcNumRejected)
            ]
            mcFdrFpr = [
                float(fp) / pos if pos != 0 else 0
                for fp, pos in zip(mcFdrType1Errors, mcFdrNumRejected)
            ]

            fprPlotStaticFile = GalaxyRunSpecificFile(['fpr.png'], galaxyFn)
            fprPlotStaticFile.plotRLines(sortedKeys,
                                         [onlyMaxFpr, seqMcFpr, mcFdrFpr],
                                         legend=['BasicMc', 'SeqMc', 'McFdr'])
            print fprPlotStaticFile.getLink(
                'View FPR plot') + ' as a function of the number of true H1s.', '<br>'
Example #3
    def execute(cls, choices, galaxyFn=None, username=''):
        '''Is called when the execute button is pushed by the web user.

        Should print output as HTML to standard out, which will be directed to
        a results page in the Galaxy history. If getOutputFormat is anything
        other than HTML, the output should be written to the file with path
        galaxyFn. If needed, StaticFile can be used to get a path where
        additional files can be put (e.g. generated image files).
        choices is a list of the selections made by the web user in each
        options box.
        '''

        print 'temporarily overriding tool, running McFdr2 simulation..'
        from test.sandbox.extra.McFdr2 import analyzeSampleNumAccuracy
        for numSamples in [100, 1000, 10000]:
            print ''
            print 'numSamples %s: ' % numSamples,
            for i in range(3):
                print analyzeSampleNumAccuracy(numSamples),
        return
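        # NOTE: the code below is unreachable while the temporary McFdr2
        # override above is active.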

        from proto.RSetup import r
        from numpy import array, minimum
        pVal, minNumSamples, maxNumSamples, chunkSize, numTests = [
            float(x) for x in choices[:-1]
        ]
        print 'pVal:%.2f, minNumSamples:%i, maxNumSamples:%i, chunkSize:%i, numTests:%i' % (
            pVal, minNumSamples, maxNumSamples, chunkSize, numTests)

        assert (maxNumSamples - minNumSamples) % chunkSize == 0
        assert numTests == 1  #More tests are not yet supported. In McFdr this should be something like the min-max, i.e. the minimum across iterations of the maximum p-value across tests.

        pValEstimation = choices[-1]
        assert pValEstimation in ['Davison', 'ML']
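        # 'Davison' uses the (k+1)/(n+1) p-value estimator (never exactly zero),
        # while 'ML' uses the plain k/n maximum-likelihood estimate.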
        if pValEstimation == 'Davison':
            pFunc = lambda k, n: 1.0 * (k + 1) / (n + 1)
        else:
            pFunc = lambda k, n: 1.0 * (k) / n

        numRepl = 10**4
        stdAtMin = [
            pFunc(k, minNumSamples)
            for k in r.rbinom(numRepl, minNumSamples, pVal)
        ]
        stdAtMax = [
            pFunc(k, maxNumSamples)
            for k in r.rbinom(numRepl, maxNumSamples, pVal)
        ]

        mcFdrBestPVals = array([1.0] * numRepl)
        mcFdrSamples = minNumSamples  #array([minNumSamples]*numRepl)
        mcFdrExtremes = array(r.rbinom(numRepl, minNumSamples, pVal))
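        # Add chunkSize samples per iteration, keeping the smallest p-value
        # estimate seen so far.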
        while mcFdrSamples < maxNumSamples:
            tempMcFdrPVals = pFunc(mcFdrExtremes, mcFdrSamples)
            mcFdrBestPVals = minimum(mcFdrBestPVals, tempMcFdrPVals)

            mcFdrSamples += chunkSize
            mcFdrExtremes += array(r.rbinom(numRepl, chunkSize, pVal))
        tempMcFdrPVals = pFunc(mcFdrExtremes, mcFdrSamples)
        mcFdrBestPVals = minimum(mcFdrBestPVals, tempMcFdrPVals)
        assert mcFdrSamples == maxNumSamples

        print 'Mean values<br>'
        print 'AtMin:%.7f, AtMax:%.7f, McFdr:%.7f' % tuple(
            [array(x).mean() for x in [stdAtMin, stdAtMax, mcFdrBestPVals]])

        breaks = [pVal * 2 * x / 100.0 for x in range(0, 101)] + [1.0]

        histRes = r.hist(stdAtMin, breaks=breaks, plot=False)
        xVals = histRes['mids']
        yValsStdAtMin = histRes['density']

        histRes = r.hist(stdAtMax, breaks=breaks, plot=False)
        assert xVals == histRes['mids']
        yValsStdAtMax = histRes['density']

        histRes = r.hist(mcFdrBestPVals, breaks=breaks, plot=False)
        assert xVals == histRes['mids']
        yValsMcFdr = histRes['density']

        staticFile = GalaxyRunSpecificFile(['pDistr.png'], galaxyFn)
        staticFile.openRFigure()
        staticFile.plotRLines(xVals,
                              [yValsStdAtMin, yValsStdAtMax, yValsMcFdr],
                              alsoOpenAndClose=False,
                              xlab='p-value',
                              ylab='density',
                              xlim=[0, 2 * pVal])
        r.abline(v=pVal, lty='dotted', col='yellow')
        staticFile.closeRFigure()
        print staticFile.getLink('View estimated pval distribution')
Example #4
    def singleSimulation(self, numH0, numH1, replicateIndex, verbose=False):
        tests = MultipleTestCollection(numH0, numH1, self._maxNumSamples,
                                       self._h, self._fdrThreshold, self._a,
                                       self._b)
        tests.addSamples(self.NUM_SAMPLES_INITIALLY)
        while not tests.allTestsAreDetermined():
            tests.addSamples(self.NUM_SAMPLES_PER_CHUNK)
            #if verbose:
            #print tests.getTotalNumSamples()
        #As sampling is now over, set fdrThreshold to the threshold used after computations are finished (i.e. it affects the final rejection/acceptance decisions, but not the stopping of sampling)
        tests.setFdrThresholdAtAllCounters(self._postFdrThreshold)

        #print 'FINALLY, #samples: ',
        if self._galaxyFn is not None:
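            # Infer the scheme name from the constructor arguments: no h means
            # basic MC, no FDR threshold means sequential MC, otherwise McFdr.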
            if self._h is None:
                scheme = 'Basic'
            elif self._fdrThreshold is None:
                scheme = 'Sequential'
            else:
                scheme = 'McFdr'
            staticFile = GalaxyRunSpecificFile(
                [scheme,
                 str(numH1),
                 str(replicateIndex), 'PandQvals.txt'], self._galaxyFn)
            tests.writeAllPandQVals(staticFile.getFile())
            linkToRaw = staticFile.getLink(
                'Raw p and q-vals'
            ) + ' under %s scheme with %i true H1, (replication %i)' % (
                scheme, numH1, replicateIndex)

            figStaticFile = GalaxyRunSpecificFile(
                [scheme,
                 str(numH1),
                 str(replicateIndex), 'PandQvals.png'], self._galaxyFn)
            figStaticFile.openRFigure()
            tests.makeAllPandQValsFigure()
            figStaticFile.closeRFigure()
            linkToFig = figStaticFile.getLink(' (p/q-figure) ') + '<br>'

            figNumSamplesStaticFile = GalaxyRunSpecificFile(
                [scheme,
                 str(numH1),
                 str(replicateIndex), 'NumSamples.png'], self._galaxyFn)
            figNumSamplesStaticFile.openRFigure()
            tests.makeNumSamplesFigure()
            figNumSamplesStaticFile.closeRFigure()
            linkToNumSamplesFig = figNumSamplesStaticFile.getLink(
                ' (numSamples-figure) ') + '<br>'

            catalogStaticFile = GalaxyRunSpecificFile([str(numH1), 'cat.html'],
                                                      self._galaxyFn)
            catalogStaticFile.writeTextToFile(linkToRaw + linkToFig +
                                              linkToNumSamplesFig,
                                              mode='a')

        #if verbose:
        #print sorted(tests.getFdrVals())
        #print 'NumS ign Below 0.2: ', sum([1 if t<0.2 else 0 for t in tests.getFdrVals()])
        #return tests.getTotalNumSamples(), tests.getTotalNumRejected()
        return (tests.getTotalNumSamples(), tests.getTotalNumRejected(),
                tests.getClassificationSummaries())