def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None, tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
    '''Set up a randomization (Monte Carlo) statistic.

    region, track, track2 -- forwarded to Statistic.__init__. track2 may be
        None (single-track MC tests); self._track2 is then preset to None.
    rawStatistic -- resolved via self.getRawStatisticClass() and stored as
        self._rawStatistic.
    randTrackClass -- randomization class (or its name) used for the second
        track. Exactly one of randTrackClass and assumptions must be given.
    assumptions -- string of the form 'Class1_Class2' (exactly one
        underscore) naming the randomization class for each track; the
        names 'None' and '' mean "no randomization class".
    tails -- tail specification. When None it is derived from
        kwArgs['tail'] ('more'/'less'/'different'), otherwise it defaults
        to 'right-tail'.
    numResamplings -- converted with int(); forced to 1 when
        kwArgs['minimal'] == True.
    randomSeed -- when not None, not 'Random', and no manual seed is set
        yet, int(randomSeed) is installed through setManualSeed().
    kwArgs -- forwarded to Statistic.__init__.

    Side effects: sets CompBinManager.ALLOW_COMP_BIN_SPLITTING to False and
    calls McFdr._initMcFdr().

    Raises AssertionError when the argument combination is invalid and
    KeyError for an unknown 'tail' value or an unknown class name.
    '''
    if tails is None:
        if 'tail' in kwArgs:
            tailTranslator = {'more': 'right-tail', 'less': 'left-tail', 'different': 'two-tail'}
            tails = tailTranslator[kwArgs['tail']]
            if DebugConfig.VERBOSE:
                logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable', level=logging.DEBUG)
        else:
            tails = 'right-tail'  # or 'two-tail'?
            logMessage('No tails argument provided to RandomizationManagerStatUnsplittable', level=logging.DEBUG)

    if track2 is None:
        # To allow track2 to be passed on as None to rawStatistics without
        # error. For use by single-track MC-tests.
        self._track2 = None

    from gold.util.RandomUtil import getManualSeed, setManualSeed
    if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
        setManualSeed(int(randomSeed))

    if 'mcSetupScheme' in kwArgs:
        kwArgs = copy(kwArgs)  # to not edit original dict..
        if kwArgs['mcSetupScheme'] != 'custom':
            # Check that specific values are not redundantly set.
            assert 'maxSamples' not in kwArgs

    Statistic.__init__(self, region, track, track2,
                       rawStatistic=rawStatistic,
                       randTrackClass=randTrackClass,
                       assumptions=assumptions,
                       tails=tails,
                       numResamplings=numResamplings,
                       randomSeed=randomSeed,
                       **kwArgs)

    assert (randTrackClass is None) ^ (assumptions is None)  # xor
    if assumptions is not None:
        assert assumptions.count('_') == 1, assumptions
        randTrackClass1, randTrackClass2 = assumptions.split('_')
    else:
        randTrackClass1 = None
        randTrackClass2 = randTrackClass

    # Class names given as strings are looked up among this module's
    # globals; 'None' and '' denote "no randomization class".
    resolvedClasses = []
    for clsDef in [randTrackClass1, randTrackClass2]:
        if isinstance(clsDef, basestring):
            clsDef = globals()[clsDef] if clsDef not in ['None', ''] else None
        resolvedClasses.append(clsDef)
    self._randTrackClass1, self._randTrackClass2 = resolvedClasses

    assert not (randTrackClass1 is None and randTrackClass2 is None)
    for cls in [self._randTrackClass1, self._randTrackClass2]:
        assert cls in [None,
                       PermutedSegsAndSampledIntersegsTrack,
                       PermutedSegsAndIntersegsTrack,
                       RandomGenomeLocationTrack,
                       SegsSampledByIntensityTrack,
                       ShuffledMarksTrack,
                       SegsSampledByDistanceToReferenceTrack,
                       PointsSampledFromBinaryIntensityTrack]

    self._rawStatistic = self.getRawStatisticClass(rawStatistic)

    self._tails = tails
    # NOTE(review): '== True' is kept on purpose so that truthy values such
    # as the string 'False' do not enable minimal mode -- confirm callers.
    if kwArgs.get('minimal') == True:
        self._numResamplings = 1
        # self._kwArgs is presumably populated by Statistic.__init__ above.
        self._kwArgs['maxSamples'] = 1
    else:
        self._numResamplings = int(numResamplings)

    CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
    self._randResults = []
    self._observation = None

    # To load r libraries for McFdr:
    McFdr._initMcFdr()
def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None, tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
    '''Set up a randomization (Monte Carlo) statistic.

    region, track, track2 -- forwarded to Statistic.__init__. track2 may be
        None (single-track MC tests); self._track2 is then preset to None.
    rawStatistic -- resolved via self.getRawStatisticClass() and stored as
        self._rawStatistic.
    randTrackClass -- randomization class (or its name) used for the second
        track. Exactly one of randTrackClass and assumptions must be given.
    assumptions -- string of the form 'Class1_Class2' (exactly one
        underscore) naming the randomization class for each track; the
        names 'None' and '' mean "no randomization class".
    tails -- tail specification. When None it is derived from
        kwArgs['tail'] ('more'/'less'/'different'), otherwise it defaults
        to 'right-tail'.
    numResamplings -- converted with int(); forced to 1 when
        kwArgs['minimal'] == True.
    randomSeed -- when not None, not 'Random', and no manual seed is set
        yet, int(randomSeed) is installed through setManualSeed().
    kwArgs -- forwarded to Statistic.__init__.

    Side effects: sets CompBinManager.ALLOW_COMP_BIN_SPLITTING to False and
    calls McFdr._initMcFdr().

    Raises AssertionError when the argument combination is invalid and
    KeyError for an unknown 'tail' value or an unknown class name.
    '''
    if tails is None:
        if 'tail' in kwArgs:
            tailTranslator = {'more': 'right-tail', 'less': 'left-tail', 'different': 'two-tail'}
            tails = tailTranslator[kwArgs['tail']]
            if DebugConfig.VERBOSE:
                logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable', level=logging.DEBUG)
        else:
            tails = 'right-tail'  # or 'two-tail'?
            logMessage('No tails argument provided to RandomizationManagerStatUnsplittable', level=logging.DEBUG)

    if track2 is None:
        # To allow track2 to be passed on as None to rawStatistics without
        # error. For use by single-track MC-tests.
        self._track2 = None

    from gold.util.RandomUtil import getManualSeed, setManualSeed
    if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
        setManualSeed(int(randomSeed))

    Statistic.__init__(self, region, track, track2,
                       rawStatistic=rawStatistic,
                       randTrackClass=randTrackClass,
                       assumptions=assumptions,
                       tails=tails,
                       numResamplings=numResamplings,
                       randomSeed=randomSeed,
                       **kwArgs)

    assert (randTrackClass is None) ^ (assumptions is None)  # xor
    if assumptions is not None:
        assert assumptions.count('_') == 1, assumptions
        randTrackClass1, randTrackClass2 = assumptions.split('_')
    else:
        randTrackClass1 = None
        randTrackClass2 = randTrackClass

    # Class names given as strings are looked up among this module's
    # globals; 'None' and '' denote "no randomization class". basestring is
    # used (rather than an exact 'str' type check) so that unicode names,
    # and str subclasses, are resolved as well.
    resolvedClasses = []
    for clsDef in [randTrackClass1, randTrackClass2]:
        if isinstance(clsDef, basestring):
            clsDef = globals()[clsDef] if clsDef not in ['None', ''] else None
        resolvedClasses.append(clsDef)
    self._randTrackClass1, self._randTrackClass2 = resolvedClasses

    assert not (randTrackClass1 is None and randTrackClass2 is None)
    for cls in [self._randTrackClass1, self._randTrackClass2]:
        assert cls in [None,
                       PermutedSegsAndSampledIntersegsTrack,
                       PermutedSegsAndIntersegsTrack,
                       RandomGenomeLocationTrack,
                       SegsSampledByIntensityTrack,
                       ShuffledMarksTrack]

    self._rawStatistic = self.getRawStatisticClass(rawStatistic)

    self._tails = tails
    # NOTE(review): '== True' is kept on purpose so that truthy values such
    # as the string 'False' do not enable minimal mode -- confirm callers.
    if kwArgs.get('minimal') == True:
        self._numResamplings = 1
        # self._kwArgs is presumably populated by Statistic.__init__ above.
        self._kwArgs['maxSamples'] = 1
    else:
        self._numResamplings = int(numResamplings)

    CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
    self._randResults = []
    self._observation = None

    # To load r libraries for McFdr:
    McFdr._initMcFdr()
def execute(choices, galaxyFn=None, username=''):
    '''Is called when the execute-button is pushed by the web-user.

    Output is printed to standard out, which is directed to a results page
    in the Galaxy history. If getOutputFormat is anything else than HTML,
    the output should be written to the file with path galaxyFn. If needed,
    StaticFile can be used to get a path where additional files can be put
    (e.g. generated image files).

    choices is a list of selections made by the web-user in each options
    box. Here choices[0] is text holding one vector of p-values per line,
    written either as plain comma-separated numbers or as an R vector
    ('c(0.01,0.2)'). Every non-blank line is adjusted with
    McFdr.adjustPvalues() and printed as 'qvalsN = c(...)', where N is the
    1-based index among the non-blank lines.

    Raises ValueError if a value cannot be parsed as a float.
    '''
    from quick.statistic.McFdr import McFdr
    McFdr._initMcFdr()

    qvalsSet = []
    for pvals in choices[0].split('\n'):
        # Strip first, so that whitespace-only lines and the '\r' left over
        # from CRLF line endings count as blank instead of reaching float('').
        pvals = pvals.strip()
        if pvals == '':
            continue
        numbers = [float(x) for x in pvals.replace('c(', '').replace(')', '').split(',')]
        qvalsSet.append(McFdr.adjustPvalues(numbers))

    # All lines are adjusted before anything is printed, so a parse error
    # never leaves partial output behind.
    for i, qvals in enumerate(qvalsSet):
        print('qvals%i = c(' % (i + 1) + ','.join(str(x) for x in qvals) + ')')
def execute(cls, choices, galaxyFn=None, username=''):
    '''Is called when the execute-button is pushed by the web-user.

    Output is printed to standard out, which is directed to a results page
    in the Galaxy history. If getOutputFormat is anything else than HTML,
    the output should be written to the file with path galaxyFn. If needed,
    StaticFile can be used to get a path where additional files can be put
    (e.g. generated image files).

    choices is a list of selections made by the web-user in each options
    box. Here choices[0] is text holding one vector of p-values per line,
    written either as plain comma-separated numbers or as an R vector
    ('c(0.01,0.2)'). Every non-blank line is adjusted with
    McFdr.adjustPvalues() and printed as 'qvalsN = c(...)', where N is the
    1-based index among the non-blank lines.

    Raises ValueError if a value cannot be parsed as a float.
    '''
    from quick.statistic.McFdr import McFdr
    McFdr._initMcFdr()

    qvalsSet = []
    for pvals in choices[0].split('\n'):
        # Strip first, so that whitespace-only lines and the '\r' left over
        # from CRLF line endings count as blank instead of reaching float('').
        pvals = pvals.strip()
        if pvals == '':
            continue
        numbers = [float(x) for x in pvals.replace('c(', '').replace(')', '').split(',')]
        qvalsSet.append(McFdr.adjustPvalues(numbers))

    # All lines are adjusted before anything is printed, so a parse error
    # never leaves partial output behind.
    for i, qvals in enumerate(qvalsSet):
        print('qvals%i = c(' % (i + 1) + ','.join(str(x) for x in qvals) + ')')
def validateAndPossiblyResetLocalResults(cls, localSamplingObjects):
    '''Assign adjusted p-values and reset objects that need more sampling.

    Each object's _getPval() value is adjusted with McFdr.adjustPvalues()
    and stored on the object as _adjPValue. If at least one object reports
    isMcDetermined() as false, every object for which
    isIndividuallyMcDetermined() is false gets its _result attribute
    deleted (marking it for further sampling).

    Returns the number of objects that are not MC-determined.
    '''
    # To load r libraries for McFdr:
    McFdr._initMcFdr()

    rawPvals = [samplingObj._getPval() for samplingObj in localSamplingObjects]
    adjusted = McFdr.adjustPvalues(rawPvals, verbose=False)

    # First decide whether any further sampling will be needed at all.
    undeterminedCount = 0
    for idx, samplingObj in enumerate(localSamplingObjects):
        samplingObj._adjPValue = adjusted[idx]
        if not samplingObj.isMcDetermined():
            undeterminedCount += 1

    if undeterminedCount == 0:
        return undeterminedCount

    # Then decide which objects need further sampling (through deleting
    # their result), based on the individual criterion.
    for samplingObj in localSamplingObjects:
        if not samplingObj.isIndividuallyMcDetermined():
            del samplingObj._result
    return undeterminedCount
def inferAdjustedPvalues(self):
    '''Add FDR-adjusted p-values to every region result.

    Does nothing when there is no p-value key or when self.FDR_KEY is
    already among the result keys. Otherwise the per-region p-values are
    adjusted with McFdr.adjustPvalues() and stored under self.FDR_KEY
    (numpy.nan for regions lacking a p-value). The global result is then
    rebuilt so that its keys come in the order: p-value key, FDR key (value
    None), all remaining keys. Finally the cached result keys are reset.
    '''
    pValKey = self.getPvalKey()
    if pValKey is None:
        return
    if self.FDR_KEY in self.getResDictKeys():
        return

    regKeys = self.getAllRegionKeys()
    regPVals = [self[reg].get(pValKey) for reg in regKeys]

    from quick.statistic.McFdr import McFdr
    McFdr._initMcFdr()  # to load r libraries..
    regFdrVals = McFdr.adjustPvalues(regPVals, verbose=False)

    assert len(regFdrVals) == len(regKeys), 'fdr: ' + str(len(regFdrVals)) + ', regs: ' + str(len(regKeys))

    for idx, reg in enumerate(regKeys):
        # Regions without a p-value get nan rather than the adjusted value.
        if regPVals[idx] is None:
            adjustedVal = numpy.nan
        else:
            adjustedVal = regFdrVals[idx]
        self[reg][self.FDR_KEY] = adjustedVal

    if self._globalResult is None:
        # No global result yet: start from all known keys mapped to None.
        self._globalResult = OrderedDict((key, None) for key in self.getResDictKeys())

    # Rebuild the global result so that the FDR key directly follows the
    # p-value key.
    previousGlobal = self._globalResult
    self._globalResult = reordered = OrderedDict()
    reordered[pValKey] = previousGlobal[pValKey]
    reordered[self.FDR_KEY] = None
    for key in previousGlobal.keys():
        if key != pValKey:
            reordered[key] = previousGlobal[key]

    self._resDictKeys = None  # resetting..
def inferAdjustedPvalues(self):
    '''Add FDR-adjusted p-values to every region result.

    Does nothing when there is no p-value key or when self.FDR_KEY is
    already among the result keys. Otherwise the per-region p-values are
    adjusted with McFdr.adjustPvalues() and stored under self.FDR_KEY
    (numpy.nan for regions lacking a p-value). The global result is then
    rebuilt so that its keys come in the order: p-value key, FDR key (value
    None), all remaining keys. Finally the cached result keys are reset.

    Raises AssertionError if the number of adjusted values differs from the
    number of regions.
    '''
    pValKey = self.getPvalKey()
    if pValKey is None or self.FDR_KEY in self.getResDictKeys():
        return

    regKeys = self.getAllRegionKeys()
    regPVals = [self[reg].get(pValKey) for reg in regKeys]

    # The R handle 'r' is not needed here; all R work goes through McFdr.
    from quick.statistic.McFdr import McFdr
    McFdr._initMcFdr()  # to load r libraries..
    regFdrVals = McFdr.adjustPvalues(regPVals, verbose=False)

    assert len(regFdrVals) == len(regKeys), 'fdr: ' + str(len(regFdrVals)) + ', regs: ' + str(len(regKeys))

    for i, reg in enumerate(regKeys):
        # Regions without a p-value get nan rather than the adjusted value.
        self[reg][self.FDR_KEY] = (regFdrVals[i] if regPVals[i] is not None else numpy.nan)

    if self._globalResult is None:
        # No global result yet: start from all known keys mapped to None.
        self._globalResult = OrderedDict((key, None) for key in self.getResDictKeys())

    # Rebuild the global result so that the FDR key directly follows the
    # p-value key.
    tempGlobalResult = self._globalResult
    self._globalResult = OrderedDict()
    self._globalResult[pValKey] = tempGlobalResult[pValKey]
    self._globalResult[self.FDR_KEY] = None
    for key in tempGlobalResult:
        if key != pValKey:
            self._globalResult[key] = tempGlobalResult[key]

    self._resDictKeys = None  # resetting..
def getFdrVals(self):
    '''Return the FDR-adjusted values for the current p-value estimates.

    The p-value estimate of every counter in self._sampleCounters is passed
    to McFdr.adjustPvalues() together with self.ESTIMATE_PI0. A single
    scalar result is wrapped in a one-element list, so callers can always
    index the return value.

    NOTE(review): pi0 estimation used to be sketched inline here (Convest /
    Histf1 / Pounds&Cheng via R); it is presumably handled inside
    McFdr.adjustPvalues() now -- confirm against McFdr.
    '''
    pvals = [counter.getPvalEstimate() for counter in self._sampleCounters]

    # The third positional argument is presumably 'verbose' (other call
    # sites pass verbose=False by keyword) -- confirm against McFdr.
    fdrVals = McFdr.adjustPvalues(pvals, self.ESTIMATE_PI0, False)

    # isinstance (not an exact type match) so that float/int subclasses,
    # e.g. numpy.float64, are also recognised as a single scalar.
    if isinstance(fdrVals, (float, int)):
        fdrVals = [fdrVals]
    return fdrVals
def getFdrVals(self):
    '''Return the FDR-adjusted values for the current p-value estimates.

    The p-value estimate of every counter in self._sampleCounters is passed
    to McFdr.adjustPvalues() together with self.ESTIMATE_PI0. A single
    scalar result is wrapped in a one-element list, so callers can always
    index the return value.

    NOTE(review): pi0 estimation used to be sketched inline here (Convest /
    Histf1 / Pounds&Cheng via R); it is presumably handled inside
    McFdr.adjustPvalues() now -- confirm against McFdr.
    '''
    pvals = [counter.getPvalEstimate() for counter in self._sampleCounters]

    # The third positional argument is presumably 'verbose' (other call
    # sites pass verbose=False by keyword) -- confirm against McFdr.
    fdrVals = McFdr.adjustPvalues(pvals, self.ESTIMATE_PI0, False)

    # isinstance (not an exact type match) so that float/int subclasses,
    # e.g. numpy.float64, are also recognised as a single scalar.
    if isinstance(fdrVals, (float, int)):
        fdrVals = [fdrVals]
    return fdrVals
def validateAndPossiblyResetLocalResults(cls, stats):
    '''Decide which local MC statistics need further sampling.

    Thresholds are read from the _kwArgs of the first statistic:
      'mThreshold'   -- int, default 20
      'fdrThreshold' -- float, default 0.1
      'maxSamples'   -- int, or the string 'unlimited'; default 50000
      'fdrCriterion' -- None, 'individual' or 'simultaneous'

    A statistic is individually determined when its M value has reached
    mThreshold, its number of samples has reached maxSamples, or its result
    could not be retrieved. It is determined by FDR when its adjusted
    p-value is a non-nan number below fdrThreshold.

    Statistics that are not individually determined (and, with the
    'individual' criterion, not determined by FDR either) get their _result
    deleted and their _numResamplings increased by
    cls.NUM_SAMPLES_PER_CHUNK.

    Returns the number of statistics not determined by any means (0 for
    empty input).
    '''
    if len(stats) == 0:
        return 0

    firstKwArgs = stats[0]._kwArgs
    mt = firstKwArgs.get('mThreshold')
    ft = firstKwArgs.get('fdrThreshold')
    ms = firstKwArgs.get('maxSamples')
    fc = firstKwArgs.get('fdrCriterion')

    M_THRESHOLD = int(mt) if mt is not None else 20
    FDR_THRESHOLD = float(ft) if ft is not None else 0.1
    if ms is None:
        MAX_SAMPLES = 50000
    elif isinstance(ms, int):
        MAX_SAMPLES = ms
    elif ms.lower() == 'unlimited':
        MAX_SAMPLES = None
    else:
        MAX_SAMPLES = int(ms)

    assert fc in [None, 'individual', 'simultaneous'], 'fdrCriterion:' + str(fc)
    individualFdr = (fc == 'individual')
    if fc is None:
        logMessage('Warning: empty fdrCriterion, using simultaneous')

    import numpy

    pvals = []
    allMs = []
    allNumSamples = []
    isInValid = []
    for stat in stats:
        # Best effort: a statistic whose result cannot be retrieved is kept
        # as an all-None entry and flagged invalid instead of aborting.
        try:
            res = stat.getResult()
            values = (res[RandomizationManagerStatUnsplittable.PVAL_KEY],
                      res[RandomizationManagerStatUnsplittable.M_KEY],
                      res[RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY])
            invalid = False
        except Exception:
            values = (None, None, None)
            invalid = True
        pvals.append(values[0])
        allMs.append(values[1])
        allNumSamples.append(values[2])
        isInValid.append(invalid)

    fdrVals = McFdr.adjustPvalues(pvals, verbose=False)

    # None entries (invalid stats) are never "determined" by a threshold;
    # the explicit checks avoid relying on Python 2's None ordering.
    determinedByM = [m is not None and m >= M_THRESHOLD for m in allMs]
    determinedByFdr = [f is not None and not numpy.isnan(f) and f < FDR_THRESHOLD
                       for f in fdrVals]
    determinedByMaxSamples = [MAX_SAMPLES is not None and n is not None and n >= MAX_SAMPLES
                              for n in allNumSamples]

    # Determined by anything except FDR, as the latter is not necessarily
    # handled on a per test level..
    statIndividuallyDetermined = [any(flags) for flags in
                                  zip(determinedByM, determinedByMaxSamples, isInValid)]
    # Determined individually or by FDR.
    statDeterminedByAnyMeans = [any(flags) for flags in
                                zip(statIndividuallyDetermined, determinedByFdr)]

    assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(determinedByM) == len(determinedByFdr) == len(statIndividuallyDetermined)

    for i, individuallyDetermined in enumerate(statIndividuallyDetermined):
        determined = individuallyDetermined or (individualFdr and determinedByFdr[i])
        if not determined:
            if hasattr(stats[i], '_result'):
                del stats[i]._result
            else:
                print('no _result to delete at index %i in stats: ' % i)
                print('obj details:  %s' % (stats[i]._region,))
            stats[i]._numResamplings += cls.NUM_SAMPLES_PER_CHUNK  # get number from mcFdr..

    # Returns number of not determined stats..
    return sum(1 for determined in statDeterminedByAnyMeans if not determined)
def validateAndPossiblyResetLocalResults(cls, stats):
    '''Decide which local MC statistics need further sampling.

    Settings are read from the _kwArgs of the first statistic:
      'mThreshold'   -- int, default 20
      'fdrThreshold' -- float, default 0.1
      'maxSamples'   -- int, or the string 'unlimited'; default 50000
      'fdrCriterion' -- None, 'individual' or 'simultaneous'
      cls.NUM_SAMPLES_PER_CHUNK_KEY -- int, default 100; the value is
          stored on the class as cls.NUM_SAMPLES_PER_CHUNK

    A statistic is individually determined when its M value has reached
    mThreshold, its number of samples has reached maxSamples, its result
    could not be retrieved, or its adjusted p-value is nan. It is
    determined by FDR when its adjusted p-value is a non-nan number below
    fdrThreshold.

    Statistics that are not individually determined (and, with the
    'individual' criterion, not determined by FDR either) get their _result
    deleted and their _numResamplings increased by
    cls.NUM_SAMPLES_PER_CHUNK.

    Returns the number of statistics not determined by any means (0 for
    empty input).
    '''
    if len(stats) == 0:
        return 0

    firstKwArgs = stats[0]._kwArgs
    mt = firstKwArgs.get('mThreshold')
    ft = firstKwArgs.get('fdrThreshold')
    ms = firstKwArgs.get('maxSamples')
    fc = firstKwArgs.get('fdrCriterion')
    npc = firstKwArgs.get(cls.NUM_SAMPLES_PER_CHUNK_KEY)

    M_THRESHOLD = int(mt) if mt is not None else 20
    FDR_THRESHOLD = float(ft) if ft is not None else 0.1
    if ms is None:
        MAX_SAMPLES = 50000
    elif isinstance(ms, int):
        MAX_SAMPLES = ms
    elif ms.lower() == 'unlimited':
        MAX_SAMPLES = None
    else:
        MAX_SAMPLES = int(ms)

    # Note: this updates the class attribute, not a local value.
    cls.NUM_SAMPLES_PER_CHUNK = int(npc) if npc is not None else 100

    assert fc in [None, 'individual', 'simultaneous'], 'fdrCriterion:' + str(fc)
    individualFdr = (fc == 'individual')
    if fc is None:
        logMessage('Warning: empty fdrCriterion, using simultaneous')

    import numpy

    pvals = []
    allMs = []
    allNumSamples = []
    isInValid = []
    for stat in stats:
        # Best effort: a statistic whose result cannot be retrieved is kept
        # as an all-None entry and flagged invalid instead of aborting.
        try:
            res = stat.getResult()
            values = (res[RandomizationManagerStatUnsplittable.PVAL_KEY],
                      res[RandomizationManagerStatUnsplittable.M_KEY],
                      res[RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY])
            invalid = False
        except Exception:
            values = (None, None, None)
            invalid = True
        pvals.append(values[0])
        allMs.append(values[1])
        allNumSamples.append(values[2])
        isInValid.append(invalid)

    from quick.statistic.McFdr import McFdr
    fdrVals = McFdr.adjustPvalues(pvals, verbose=False)

    # None entries (invalid stats) are never "determined" by a threshold;
    # the explicit checks avoid relying on Python 2's None ordering and
    # keep numpy.isnan() from being called on None.
    determinedByM = [m is not None and m >= M_THRESHOLD for m in allMs]
    determinedByFdr = [f is not None and not numpy.isnan(f) and f < FDR_THRESHOLD
                       for f in fdrVals]
    isNanValued = [f is not None and numpy.isnan(f) for f in fdrVals]
    determinedByMaxSamples = [MAX_SAMPLES is not None and n is not None and n >= MAX_SAMPLES
                              for n in allNumSamples]

    # Determined by anything except FDR, as the latter is not necessarily
    # handled on a per test level..
    statIndividuallyDetermined = [any(flags) for flags in
                                  zip(determinedByM, determinedByMaxSamples, isInValid, isNanValued)]
    # Determined individually or by FDR.
    statDeterminedByAnyMeans = [any(flags) for flags in
                                zip(statIndividuallyDetermined, determinedByFdr)]

    assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(determinedByM) == len(determinedByFdr) == len(statIndividuallyDetermined)

    for i, individuallyDetermined in enumerate(statIndividuallyDetermined):
        determined = individuallyDetermined or (individualFdr and determinedByFdr[i])
        if not determined:
            if hasattr(stats[i], '_result'):
                del stats[i]._result
            else:
                print('no _result to delete at index %i in stats: ' % i)
                print('obj details:  %s' % (stats[i]._region,))
            stats[i]._numResamplings += cls.NUM_SAMPLES_PER_CHUNK  # get number from mcFdr..

    return sum(1 for determined in statDeterminedByAnyMeans if not determined)