Ejemplo n.º 1
0
 def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None, tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
     """Configure a randomization test built around ``rawStatistic``.

     Parameters
     ----------
     region, track, track2:
         Forwarded unchanged to ``Statistic.__init__``. ``track2`` may be
         None (single-track MC tests); ``self._track2`` is then set to None
         before the base initialiser runs.
     rawStatistic:
         Resolved through ``self.getRawStatisticClass`` and stored as
         ``self._rawStatistic``.
     randTrackClass, assumptions:
         Exactly one of the two must be given. ``assumptions`` is a string
         with exactly one underscore, ``'<class1>_<class2>'``, naming the
         randomization class of each track; ``randTrackClass`` sets the
         class for the second track only. String names are looked up in
         this module's globals, with ``'None'`` and ``''`` meaning None.
     tails:
         If None, derived from the legacy ``tail`` keyword
         ('more'/'less'/'different') when present, else 'right-tail'.
     numResamplings:
         Converted with ``int``; forced to 1 when ``minimal == True`` is
         passed in ``kwArgs`` (``maxSamples`` is then also set to 1).
     randomSeed:
         Installed as the manual seed unless it is None or 'Random', or a
         manual seed is already set.

     Side effects: sets ``CompBinManager.ALLOW_COMP_BIN_SPLITTING`` to False
     and calls ``McFdr._initMcFdr()``.

     Raises ``AssertionError`` when the argument combination is invalid and
     ``KeyError`` for an unknown legacy ``tail`` value.
     """
     if tails is None:
         if 'tail' in kwArgs:
             # Legacy keyword: translate its vocabulary to that of 'tails'.
             tailTranslator = {'more':'right-tail', 'less':'left-tail', 'different':'two-tail'}
             tails = tailTranslator[kwArgs['tail']]
             if DebugConfig.VERBOSE:
                 logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable', level=logging.DEBUG)
         else:
             tails = 'right-tail' # or 'two-tail'?
             logMessage('No tails argument provided to RandomizationManagerStatUnsplittable', level=logging.DEBUG)

     if track2 is None:
         self._track2 = None #to allow track2 to be passed on as None to rawStatistics without error. For use by single-track MC-tests..

     from gold.util.RandomUtil import getManualSeed, setManualSeed
     # Only seed once: an already installed manual seed is left untouched.
     if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
         setManualSeed(int(randomSeed))

     if 'mcSetupScheme' in kwArgs:
         kwArgs = copy(kwArgs) #to not edit original dict..
         if kwArgs['mcSetupScheme'] != 'custom':
             assert 'maxSamples' not in kwArgs #check that specific values are not redundantly set

     Statistic.__init__(self, region, track, track2, rawStatistic=rawStatistic, randTrackClass=randTrackClass, assumptions=assumptions, tails=tails, numResamplings=numResamplings, randomSeed=randomSeed, **kwArgs)

     assert (randTrackClass is None) ^ (assumptions is None) # xor
     if assumptions is not None:
         assert assumptions.count('_') == 1, assumptions
         randTrackClass1, randTrackClass2 = assumptions.split('_')
     else:
         randTrackClass1 = None
         randTrackClass2 = randTrackClass

     # Class names given as strings are resolved against this module's globals.
     self._randTrackClass1, self._randTrackClass2 = \
         [ ( globals()[clsDef] if clsDef not in ['None',''] else None ) \
             if isinstance(clsDef, basestring) else clsDef for clsDef in [randTrackClass1, randTrackClass2]]

     assert not (randTrackClass1 is None and randTrackClass2 is None)
     # Whitelist of supported randomization classes.
     for cls in [self._randTrackClass1, self._randTrackClass2]:
         assert cls in [None, PermutedSegsAndSampledIntersegsTrack, \
                        PermutedSegsAndIntersegsTrack, RandomGenomeLocationTrack, SegsSampledByIntensityTrack, ShuffledMarksTrack, SegsSampledByDistanceToReferenceTrack, PointsSampledFromBinaryIntensityTrack]

     self._rawStatistic = self.getRawStatisticClass(rawStatistic)

     self._tails = tails
     if kwArgs.get('minimal') == True:
         self._numResamplings = 1
         # NOTE(review): self._kwArgs is presumably set by Statistic.__init__ - confirm.
         self._kwArgs['maxSamples'] = 1
     else:
         self._numResamplings = int(numResamplings)
     CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
     self._randResults = []
     self._observation = None
     #to load r libraries for McFdr:
     McFdr._initMcFdr()
 def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None, tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
     """Configure a randomization test built around ``rawStatistic``.

     Parameters
     ----------
     region, track, track2:
         Forwarded unchanged to ``Statistic.__init__``. ``track2`` may be
         None (single-track MC tests); ``self._track2`` is then set to None
         before the base initialiser runs.
     rawStatistic:
         Resolved through ``self.getRawStatisticClass`` and stored as
         ``self._rawStatistic``.
     randTrackClass, assumptions:
         Exactly one of the two must be given. ``assumptions`` is a string
         with exactly one underscore, ``'<class1>_<class2>'``, naming the
         randomization class of each track; ``randTrackClass`` sets the
         class for the second track only. String names (byte or unicode)
         are looked up in this module's globals, with ``'None'`` and ``''``
         meaning None.
     tails:
         If None, derived from the legacy ``tail`` keyword
         ('more'/'less'/'different') when present, else 'right-tail'.
     numResamplings:
         Converted with ``int``; forced to 1 when ``minimal == True`` is
         passed in ``kwArgs`` (``maxSamples`` is then also set to 1).
     randomSeed:
         Installed as the manual seed unless it is None or 'Random', or a
         manual seed is already set.

     Side effects: sets ``CompBinManager.ALLOW_COMP_BIN_SPLITTING`` to False
     and calls ``McFdr._initMcFdr()``.

     Raises ``AssertionError`` when the argument combination is invalid and
     ``KeyError`` for an unknown legacy ``tail`` value.
     """
     if tails is None:
         if 'tail' in kwArgs:
             # Legacy keyword: translate its vocabulary to that of 'tails'.
             tailTranslator = {'more':'right-tail', 'less':'left-tail', 'different':'two-tail'}
             tails = tailTranslator[kwArgs['tail']]
             if DebugConfig.VERBOSE:
                 logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable', level=logging.DEBUG)
         else:
             tails = 'right-tail' # or 'two-tail'?
             logMessage('No tails argument provided to RandomizationManagerStatUnsplittable', level=logging.DEBUG)

     if track2 is None:
         self._track2 = None #to allow track2 to be passed on as None to rawStatistics without error. For use by single-track MC-tests..

     from gold.util.RandomUtil import getManualSeed, setManualSeed
     # Only seed once: an already installed manual seed is left untouched.
     if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
         setManualSeed(int(randomSeed))

     Statistic.__init__(self, region, track, track2, rawStatistic=rawStatistic, randTrackClass=randTrackClass, assumptions=assumptions, tails=tails, numResamplings=numResamplings, randomSeed=randomSeed, **kwArgs)

     assert (randTrackClass is None) ^ (assumptions is None) # xor
     if assumptions is not None:
         assert assumptions.count('_') == 1, assumptions
         randTrackClass1, randTrackClass2 = assumptions.split('_')
     else:
         randTrackClass1 = None
         randTrackClass2 = randTrackClass

     # Class names given as strings are resolved against this module's
     # globals. basestring (not just str) so unicode names are resolved too
     # instead of failing the whitelist assert below.
     self._randTrackClass1, self._randTrackClass2 = \
         [ ( globals()[clsDef] if clsDef not in ['None',''] else None ) \
             if isinstance(clsDef, basestring) else clsDef for clsDef in [randTrackClass1, randTrackClass2] ]

     assert not (randTrackClass1 is None and randTrackClass2 is None)
     # Whitelist of supported randomization classes.
     for cls in [self._randTrackClass1, self._randTrackClass2]:
         assert cls in [None, PermutedSegsAndSampledIntersegsTrack, \
                        PermutedSegsAndIntersegsTrack, RandomGenomeLocationTrack, SegsSampledByIntensityTrack, ShuffledMarksTrack]

     self._rawStatistic = self.getRawStatisticClass(rawStatistic)

     self._tails = tails
     if kwArgs.get('minimal') == True:
         self._numResamplings = 1
         # NOTE(review): self._kwArgs is presumably set by Statistic.__init__ - confirm.
         self._kwArgs['maxSamples'] = 1
     else:
         self._numResamplings = int(numResamplings)
     CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
     self._randResults = []
     self._observation = None
     #to load r libraries for McFdr:
     McFdr._initMcFdr()
 def execute(choices, galaxyFn=None, username=''):
     '''Is called when execute-button is pushed by web-user.
     Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
     If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
     If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
     choices is a list of selections made by web-user in each options box.
     '''
     from quick.statistic.McFdr import McFdr
     McFdr._initMcFdr()
     qvalsSet = [McFdr.adjustPvalues([float(x) for x in pvals.replace('c(','').replace(')','').split(',') ]) for pvals in choices[0].split('\n') if pvals!='']
     for i,qvals in enumerate(qvalsSet):
         print 'qvals%i = c(' % (i+1) + ','.join(str(x) for x in qvals) + ')' 
Ejemplo n.º 4
0
 def execute(cls, choices, galaxyFn=None, username=''):
     '''Run when the web user presses the execute button.

     choices[0] holds one vector of p-values per line, each a comma-separated
     list optionally wrapped in R's c(...). Every line that is not blank is
     adjusted with McFdr.adjustPvalues and the result is printed to standard
     out as "qvals<N> = c(...)", numbered from 1. galaxyFn and username are
     not used.

     Raises ValueError if a line contains a token that is not a number.
     '''
     from quick.statistic.McFdr import McFdr

     McFdr._initMcFdr()

     qvalsSet = []
     for pvals in choices[0].split('\n'):
         # Skip whitespace-only lines as well as empty ones: text submitted
         # with CRLF line endings leaves a lone '\r' that float() rejects.
         if pvals.strip() == '':
             continue
         values = pvals.replace('c(', '').replace(')', '').split(',')
         qvalsSet.append(McFdr.adjustPvalues([float(x) for x in values]))

     for i, qvals in enumerate(qvalsSet):
         # Single parenthesised argument: prints identically under Python 2.
         print('qvals%i = c(' % (i+1) + ','.join(str(x) for x in qvals) + ')')
Ejemplo n.º 5
0
    def validateAndPossiblyResetLocalResults(cls, localSamplingObjects):
        """Assign adjusted p-values and reset results that need more sampling.

        Each object gets ``_adjPValue`` set from McFdr.adjustPvalues applied
        to all objects' ``_getPval()`` values. If at least one object is not
        ``isMcDetermined()``, the ``_result`` attribute is deleted from every
        object that is not ``isIndividuallyMcDetermined()``.

        Returns the number of objects for which ``isMcDetermined()`` is false.
        """
        # Load the R libraries McFdr depends on.
        McFdr._initMcFdr()

        rawPvals = [obj._getPval() for obj in localSamplingObjects]
        adjusted = McFdr.adjustPvalues(rawPvals, verbose=False)

        # Pass 1: store the q-values and count what is still undetermined.
        undetermined = 0
        for idx, obj in enumerate(localSamplingObjects):
            obj._adjPValue = adjusted[idx]
            if not obj.isMcDetermined():
                undetermined += 1

        if undetermined == 0:
            return undetermined

        # Pass 2: dropping _result marks an object for further sampling.
        for obj in localSamplingObjects:
            if not obj.isIndividuallyMcDetermined():
                del obj._result
        return undetermined
Ejemplo n.º 6
0
    def inferAdjustedPvalues(self):
        """Add FDR-adjusted p-values to every region's result dict.

        Does nothing when there is no p-value key or when ``FDR_KEY`` is
        already among the result keys. Otherwise the per-region p-values are
        adjusted with McFdr.adjustPvalues and stored under ``FDR_KEY`` (NaN
        for regions lacking a p-value). The global result is rebuilt so that
        the p-value key comes first, ``FDR_KEY`` (value None) second and the
        remaining keys follow in their previous order. The cached result
        keys are reset at the end.
        """
        pValKey = self.getPvalKey()
        if pValKey is None:
            return
        if self.FDR_KEY in self.getResDictKeys():
            return

        regions = self.getAllRegionKeys()
        rawPvals = [self[region].get(pValKey) for region in regions]

        from quick.statistic.McFdr import McFdr
        McFdr._initMcFdr()  # loads the R libraries McFdr relies on
        adjusted = McFdr.adjustPvalues(rawPvals, verbose=False)

        assert len(adjusted) == len(regions), \
            'fdr: %s, regs: %s' % (len(adjusted), len(regions))
        for idx, region in enumerate(regions):
            if rawPvals[idx] is None:
                fdrValue = numpy.nan
            else:
                fdrValue = adjusted[idx]
            self[region][self.FDR_KEY] = fdrValue

        if self._globalResult is None:
            resKeys = self.getResDictKeys()
            self._globalResult = OrderedDict(zip(resKeys, [None] * len(resKeys)))

        # Rebuild the global result with the p-value first and FDR second.
        previous = self._globalResult
        reordered = OrderedDict()
        self._globalResult = reordered
        reordered[pValKey] = previous[pValKey]
        reordered[self.FDR_KEY] = None
        for key in previous.keys():
            if key != pValKey:
                reordered[key] = previous[key]

        self._resDictKeys = None  # force the result keys to be recomputed
Ejemplo n.º 7
0
    def inferAdjustedPvalues(self):
        """Add FDR-adjusted p-values to every region's result dict.

        Does nothing when there is no p-value key or when ``FDR_KEY`` is
        already among the result keys. Otherwise the per-region p-values are
        adjusted with McFdr.adjustPvalues and stored under ``FDR_KEY`` (NaN
        for regions lacking a p-value). The global result is rebuilt so that
        the p-value key comes first, ``FDR_KEY`` (value None) second and the
        remaining keys follow in their previous order. The cached result
        keys are reset at the end.

        Raises ``AssertionError`` if the number of adjusted values differs
        from the number of regions.
        """
        pValKey = self.getPvalKey()
        if pValKey is None or self.FDR_KEY in self.getResDictKeys():
            return

        regKeys = self.getAllRegionKeys()
        regPVals = [self[reg].get(pValKey) for reg in regKeys]
        from quick.statistic.McFdr import McFdr
        McFdr._initMcFdr() #to load r libraries..
        regFdrVals = McFdr.adjustPvalues(regPVals, verbose=False)

        assert len(regFdrVals) == len(regKeys), 'fdr: ' + str(len(regFdrVals)) + ', regs: ' + str(len(regKeys))
        for i, reg in enumerate(regKeys):
            # Regions without a p-value get NaN rather than an adjusted value.
            self[reg][self.FDR_KEY] = (regFdrVals[i] if regPVals[i] is not None else numpy.nan)

        if self._globalResult is None:
            keys = self.getResDictKeys()
            self._globalResult = OrderedDict(zip((keys), [None]*len(keys)))

        # Rebuild the global result so the p-value comes first and the
        # (empty) FDR entry second, followed by the other keys in order.
        tempGlobalResult = self._globalResult
        self._globalResult = OrderedDict()

        self._globalResult.update([(pValKey, tempGlobalResult[pValKey])])
        self._globalResult.update([(self.FDR_KEY, None)])
        self._globalResult.update([(key, tempGlobalResult[key]) for key in tempGlobalResult.keys() if key != pValKey])

        self._resDictKeys = None #resetting..
Ejemplo n.º 8
0
 def getFdrVals(self):
     """Return the FDR-adjusted p-value estimates of all sample counters.

     The p-value estimate of every counter in ``self._sampleCounters`` is
     passed to ``McFdr.adjustPvalues`` together with ``self.ESTIMATE_PI0``
     and ``False`` as the third positional argument. A bare number coming
     back is wrapped in a one-element list; any other return value is
     handed back unchanged.
     """
     pvals = [counter.getPvalEstimate() for counter in self._sampleCounters]
     fdrVals = McFdr.adjustPvalues(pvals, self.ESTIMATE_PI0, False)
     # isinstance rather than an exact type match, so that float/int
     # subclasses (e.g. numpy.float64) are wrapped as well.
     if isinstance(fdrVals, (float, int)):
         fdrVals = [fdrVals]
     return fdrVals
Ejemplo n.º 9
0
 def getFdrVals(self):
     """Return the FDR-adjusted p-value estimates of all sample counters.

     Collects ``getPvalEstimate()`` from each counter in
     ``self._sampleCounters`` and adjusts them with ``McFdr.adjustPvalues``,
     passing ``self.ESTIMATE_PI0`` and ``False`` as further positional
     arguments. A result that is exactly a float or int is returned as a
     one-element list.
     """
     estimates = []
     for counter in self._sampleCounters:
         estimates.append(counter.getPvalEstimate())

     adjusted = McFdr.adjustPvalues(estimates, self.ESTIMATE_PI0, False)
     if type(adjusted) in [float, int]:
         return [adjusted]
     return adjusted
    def validateAndPossiblyResetLocalResults(cls, stats):
        """Decide which local tests are settled and schedule more sampling for the rest.

        Settings are read from ``stats[0]._kwArgs``:

        * ``mThreshold`` (default 20): a test is settled once its M value
          reaches this.
        * ``fdrThreshold`` (default 0.1): a test is settled once its
          adjusted p-value is below this.
        * ``maxSamples`` (default 50000; 'unlimited' disables the limit): a
          test is settled once its number of samples reaches this.
        * ``fdrCriterion``: None, 'individual' or 'simultaneous'. Only with
          'individual' does the FDR criterion stop resampling of a single
          test.

        A test whose result cannot be read is treated as settled. Every test
        that is not settled has its ``_result`` deleted and its
        ``_numResamplings`` increased by ``cls.NUM_SAMPLES_PER_CHUNK``.

        Returns the number of tests not settled by any criterion, FDR
        included; 0 for an empty ``stats``.

        Raises ``AssertionError`` for an unknown ``fdrCriterion``.
        """
        if len(stats) == 0:
            return 0

        settings = stats[0]._kwArgs
        mt = settings.get('mThreshold')
        ft = settings.get('fdrThreshold')
        ms = settings.get('maxSamples')
        fc = settings.get('fdrCriterion')
        M_THRESHOLD = int(mt) if mt is not None else 20
        FDR_THRESHOLD = float(ft) if ft is not None else 0.1
        if ms is None:
            MAX_SAMPLES = 50000
        elif isinstance(ms, int):
            MAX_SAMPLES = ms
        elif hasattr(ms, 'lower') and ms.lower() == 'unlimited':
            MAX_SAMPLES = None
        else:
            MAX_SAMPLES = int(ms)

        assert fc in [None, 'individual','simultaneous'], 'fdrCriterion:'+str(fc)
        individualFdr = (fc == 'individual')
        if fc is None:
            logMessage('Warning: empty fdrCriterion, using simultaneous')

        import numpy

        pvals = []
        allMs = []
        allNumSamples = []
        isInValid = []
        for stat in stats:
            try:
                result = stat.getResult()
                pval = result[RandomizationManagerStatUnsplittable.PVAL_KEY]
                m = result[RandomizationManagerStatUnsplittable.M_KEY]
                numSamples = result[RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY]
                invalid = False
            except Exception:
                # Best effort: a test whose result cannot be read is flagged
                # invalid (and thereby settled) instead of aborting the run.
                pval = m = numSamples = None
                invalid = True
            pvals.append(pval)
            allMs.append(m)
            allNumSamples.append(numSamples)
            isInValid.append(invalid)

        fdrVals = McFdr.adjustPvalues(pvals, verbose=False)

        # None marks an invalid test; it never satisfies a threshold.
        determinedByM = [m is not None and m >= M_THRESHOLD for m in allMs]
        determinedByFdr = [f is not None and not numpy.isnan(f) and f < FDR_THRESHOLD for f in fdrVals]
        determinedByMaxSamples = [MAX_SAMPLES is not None and n is not None and n >= MAX_SAMPLES for n in allNumSamples]
        #determined by anything except FDR, as the latter is not necessarily handled on a per test level..
        statIndividuallyDetermined = [any(flags) for flags in zip(determinedByM, determinedByMaxSamples, isInValid)]
        #determined individually or by FDR
        statDeterminedByAnyMeans = [any(flags) for flags in zip(statIndividuallyDetermined, determinedByFdr)]
        assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(determinedByM) == len(determinedByFdr) == len(statIndividuallyDetermined)

        for i, stat in enumerate(stats):
            determined = statIndividuallyDetermined[i] or (individualFdr and determinedByFdr[i])
            if determined:
                continue
            if hasattr(stat, '_result'):
                del stat._result
            else:
                # Single parenthesised argument: prints identically under Python 2.
                print('no _result to delete at index %i in stats: ' % i)
                print('obj details:  %s' % (stat._region,))
            stat._numResamplings += cls.NUM_SAMPLES_PER_CHUNK #get number from mcFdr..

        #returns number of not determined stats..
        return sum(1 for determined in statDeterminedByAnyMeans if not determined)
    def validateAndPossiblyResetLocalResults(cls, stats):
        """Decide which local tests are settled and schedule more sampling for the rest.

        Settings are read from ``stats[0]._kwArgs``: ``mThreshold``
        (default 20), ``fdrThreshold`` (default 0.1), ``maxSamples``
        (default 50000, 'unlimited' disables the limit), ``fdrCriterion``
        (None, 'individual' or 'simultaneous') and the key named by
        ``cls.NUM_SAMPLES_PER_CHUNK_KEY`` (default 100).

        Side effects: overwrites the class attribute
        ``cls.NUM_SAMPLES_PER_CHUNK``; for every test that is not settled,
        deletes its ``_result`` and increases its ``_numResamplings`` by the
        chunk size.

        Returns the number of tests not settled by any criterion (FDR
        included), or 0 for an empty ``stats``.
        """
        if len(stats) == 0:
            return 0

        # All settings are taken from the first stat object only.
        mt = stats[0]._kwArgs.get('mThreshold')
        ft = stats[0]._kwArgs.get('fdrThreshold')
        ms = stats[0]._kwArgs.get('maxSamples')
        fc = stats[0]._kwArgs.get('fdrCriterion')
        npc = stats[0]._kwArgs.get(cls.NUM_SAMPLES_PER_CHUNK_KEY)

        M_THRESHOLD = int(mt) if mt is not None else 20
        FDR_THRESHOLD = float(ft) if ft is not None else 0.1
        if ms is None:
            MAX_SAMPLES = 50000
        elif type(ms) is int:
            MAX_SAMPLES = ms
        elif ms.lower() == 'unlimited':
            # None means "no upper limit on the number of samples".
            MAX_SAMPLES = None
        else:
            MAX_SAMPLES = int(ms)
        # NOTE(review): this writes to the class, so the value is shared by
        # everything that reads cls.NUM_SAMPLES_PER_CHUNK afterwards.
        cls.NUM_SAMPLES_PER_CHUNK = int(npc) if npc is not None else 100

        assert fc in [None, 'individual',
                      'simultaneous'], 'fdrCriterion:' + str(fc)
        individualFdr = (fc == 'individual')
        if fc is None:
            logMessage('Warning: empty fdrCriterion, using simultaneous')

        import numpy

        # range() is used as a pre-sized list that is filled by index below;
        # this relies on Python 2, where range() returns a list.
        pvals = range(len(stats))
        allMs = range(len(stats))
        allNumSamples = range(len(stats))
        isInValid = range(len(stats))
        for i, x in enumerate(stats):
            try:
                pvals[i] = x.getResult()[
                    RandomizationManagerStatUnsplittable.PVAL_KEY]
                allMs[i] = x.getResult()[
                    RandomizationManagerStatUnsplittable.M_KEY]
                allNumSamples[i] = x.getResult()[
                    RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY]
                isInValid[i] = False
            # NOTE(review): the bare except also catches KeyboardInterrupt and
            # SystemExit; any failure marks the test as invalid.
            except:
                pvals[i] = None
                allMs[i] = None
                allNumSamples[i] = None
                isInValid[i] = True

        from quick.statistic.McFdr import McFdr
        fdrVals = McFdr.adjustPvalues(pvals, verbose=False)

        # NOTE(review): m and n are None for invalid tests; the >= comparisons
        # below rely on Python 2 ordering None below every number.
        determinedByM = [
            M_THRESHOLD is not None and m >= M_THRESHOLD for m in allMs
        ]
        determinedByFdr = [
            FDR_THRESHOLD is not None and not numpy.isnan(f)
            and f < FDR_THRESHOLD for f in fdrVals
        ]
        # A NaN adjusted p-value counts as settled for the test itself.
        isNanValued = [f is not None and numpy.isnan(f) for f in fdrVals]
        determinedByMaxSamples = [
            MAX_SAMPLES is not None and n >= MAX_SAMPLES for n in allNumSamples
        ]
        statIndividuallyDetermined = list(
            any(x) for x in zip(determinedByM, determinedByMaxSamples,
                                isInValid, isNanValued)
        )  #determined by anything except FDR, as the latter is not necessarily handled on a per test level..
        statDeterminedByAnyMeans = list(
            any(x)
            for x in zip(statIndividuallyDetermined,
                         determinedByFdr))  #determined individually or by FDR
        assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(
            determinedByM) == len(determinedByFdr) == len(
                statIndividuallyDetermined)

        for i in range(len(statIndividuallyDetermined)):
            # The FDR criterion only stops resampling of a single test when
            # fdrCriterion is 'individual'.
            determined = statIndividuallyDetermined[i] or (individualFdr and
                                                           determinedByFdr[i])
            if not determined:
                # Removing _result and raising _numResamplings requests
                # another chunk of samples for this test.
                if hasattr(stats[i], '_result'):
                    del stats[i]._result
                else:
                    print 'no _result to delete at index %i in stats: ' % i  #, stats
                    print 'obj details: ', stats[i]._region
                stats[
                    i]._numResamplings += cls.NUM_SAMPLES_PER_CHUNK  #get number from mcFdr..

        # Number of tests not settled by any criterion, FDR included.
        return sum((1 if not determined else 0)
                   for determined in statDeterminedByAnyMeans)