def run(self, printProgress=PRINT_PROGRESS):
    '''
    Runs the statistic specified in self._analysis (from analysisDef) and
    returns an object of class Result.

    If USE_PROFILING is set, the run is executed under the profiler and the
    profile stats are printed after the result has been produced.
    '''
    # Missing stat class should only happen for batch runs, never from the GUI.
    if self._statClass is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        self._handleMissingStat()
        return None

    if USE_PROFILING:
        profiler = Profiler()
        # NOTE: the exec'ed string below references the local names 'resDict'
        # and 'printProgress' via locals() — do not rename them.
        resDict = {}
        profiler.run('resDict[0] = StatJob.run(self, printProgress=printProgress)', globals(), locals())
        res = resDict[0]
    else:
        res = StatJob.run(self, printProgress=printProgress)

    # Attach the analysis specification to the result for downstream reporting.
    res.setAnalysis(self._analysis)
    res.setAnalysisText(str(self._analysis))

    # Persist any memoized results accumulated during the run.
    ResultsMemoizer.flushStoredResults()

    if USE_PROFILING:
        profiler.printStats()

    return res
def getValidAnalysisDefFromTitle(cls, analysisTitle, genome, trackName1, trackName2):
    """
    Return the analysisDef string for the single analysis whose key title
    matches analysisTitle and which is valid for the given genome/tracks.

    Prefers a uniquely-valid forward analysis; falls back to a uniquely-valid
    reversed one. Returns '' when no unique match exists.
    """
    validForward = []
    validReversed = []
    for key, candidate in cls.getAllAnalysisTuples():
        # Keys have the form '<title>:<rest>'; only consider matching titles.
        if ':' not in key or key[:key.find(':')] != analysisTitle:
            continue
        checked, isReversed = cls._tryAnalysisForValidity(candidate, genome, trackName1, trackName2)
        if checked is None:
            continue
        bucket = validReversed if isReversed else validForward
        bucket.append(checked)

    ResultsMemoizer.flushStoredResults()

    # Forward matches take precedence over reversed ones.
    for bucket in (validForward, validReversed):
        if len(bucket) == 1:
            return bucket[0].getDef()
    return ''
def _compute(self):
    """
    Compute the raw statistic for every (subtype-of-track1, '0'/'1'-of-track2)
    pair and return the results as a matrix dict with row/column labels.
    Columns are hard-coded to the '0'/'1' subtypes, labeled 'Case'/'Control'.
    """
    childKwArgs = copy(self._kwArgs)
    childKwArgs.pop('rawStatistic', None)  # child statistics must not receive this key

    rowSubtypes = ProcTrackOptions.getSubtypes(self.getGenome(), self._track.trackName, True)
    assert len(rowSubtypes) > 0

    cellResults = []
    for rowSubtype in rowSubtypes:
        for colSubtype in ['0', '1']:
            rowTrackName = self._track.trackName + [rowSubtype]
            colTrackName = self._track2.trackName + [colSubtype]
            # Verify both subtype tracks exist on disk before constructing either.
            for candidateName in (rowTrackName, colTrackName):
                if not os.path.exists(createDirPath(candidateName, self.getGenome())):
                    raise IncompatibleTracksError
            rowTrack = Track(rowTrackName)
            rowTrack.formatConverters = self._track.formatConverters
            colTrack = Track(colTrackName)
            colTrack.formatConverters = self._track2.formatConverters
            cellResults.append(self._rawStatistic(self._region, rowTrack, colTrack, **childKwArgs).getResult())

    ResultsMemoizer.flushStoredResults()

    # Two columns per row (subtypes '0' and '1').
    resultMatrix = array(cellResults).reshape((-1, 2))
    return OrderedDict([('Matrix', resultMatrix.tolist()),
                        ('Rows', rowSubtypes),
                        ('Cols', ['Case', 'Control'])])
def run(self, printProgress=PRINT_PROGRESS):
    '''
    Runs the statistic specified in self._analysis (from analysisDef) and
    returns an object of class Result.

    If DebugConfig.USE_PROFILING is set, the run happens under the profiler;
    stats are printed afterwards, and with DebugConfig.USE_CALLGRAPH plus a
    galaxy file name, a call-graph link is written as well.
    '''
    # Missing stat class should only happen for batch runs, never from the GUI.
    if self._statClass is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        self._handleMissingStat()
        return None

    if DebugConfig.USE_PROFILING:
        from gold.util.Profiler import Profiler
        profiler = Profiler()
        # NOTE: the exec'ed string below references the local names 'resDict'
        # and 'printProgress' via locals() — do not rename them.
        resDict = {}
        profiler.run('resDict[0] = StatJob.run(self, printProgress=printProgress)', globals(), locals())
        res = resDict[0]
    else:
        res = StatJob.run(self, printProgress=printProgress)

    # Attach the analysis specification to the result for downstream reporting.
    res.setAnalysis(self._analysis)
    res.setAnalysisText(str(self._analysis))

    # Persist any memoized results accumulated during the run.
    ResultsMemoizer.flushStoredResults()

    if DebugConfig.USE_PROFILING:
        profiler.printStats()
        if DebugConfig.USE_CALLGRAPH and self._galaxyFn:
            profiler.printLinkToCallGraph(['profile_AnalysisDefJob'], self._galaxyFn)

    return res
def _loadMemoized(self):
    """
    Try to load a memoized result for this statistic from disk.

    Resets the resultLoadedFromDisk flag first; a concurrent writer may cause
    an IOError, which is logged once and otherwise ignored (best-effort load).
    """
    self.resultLoadedFromDisk = False
    try:
        ResultsMemoizer.loadResult(self)
    except IOError as ioErr:
        logMessageOnce('No memoization due to IOError (probably because some other process are writing same data): ' + str(ioErr))
def __iter__(self):
    """
    Yield a pickleable argument wrapper for each user bin whose statistic
    result is not already memoized on disk (checked via an optional
    resultLoadedFromDisk attribute on the statistic instance).
    """
    parentJob = self.job
    for region in parentJob._userBinSource:
        candidate = parentJob._statClass(region, parentJob._track, parentJob._track2, **parentJob._kwArgs)
        ResultsMemoizer.loadResult(candidate)
        # Skip bins already answered from disk; attribute may be absent.
        if getattr(candidate, "resultLoadedFromDisk", False):
            continue
        yield StatisticArgumentPickleWrapper(parentJob._statClass, region, parentJob._track,
                                             parentJob._track2, **parentJob._kwArgs)
def __iter__(self):
    """
    Yield a pickleable argument wrapper for each user bin whose statistic
    result was not loaded from disk (queried via the resultLoadedFromDisk()
    method on the statistic instance).
    """
    parentJob = self.job
    for region in parentJob._userBinSource:
        candidate = parentJob._statClass(region, parentJob._track, parentJob._track2, **parentJob._kwArgs)
        ResultsMemoizer.loadResult(candidate)
        if not candidate.resultLoadedFromDisk():
            yield StatisticArgumentPickleWrapper(parentJob._statClass, region, parentJob._track,
                                                 parentJob._track2, **parentJob._kwArgs)
def _storeResult(self):
    """
    Persist this statistic's result via the memoizer. An IOError (typically a
    concurrent reader/writer on the same data) is logged once and swallowed —
    memoization is best-effort.
    """
    try:
        ResultsMemoizer.storeResult(self)
    except IOError as ioErr:
        logExceptionOnce(ioErr, message='No memoization due to IOError (probably because some other process are reading/writing same data) ')
def _compute(self):
    """
    Compute the raw statistic for every (subtype1, subtype2) combination of
    the two tracks and return {'Result': OrderedDict} with the result matrix
    plus its row and column subtype labels.

    With 'minimal' in kwArgs, only the first subtype of each track is used
    (cheap dry run).
    """
    childKwArgs = copy(self._kwArgs)
    childKwArgs.pop('rawStatistic', None)  # child statistics must not receive this key

    rowSubtypes = ProcTrackOptions.getSubtypes(self.getGenome(), self._track.trackName, True)
    colSubtypes = ProcTrackOptions.getSubtypes(self.getGenome(), self._track2.trackName, True)
    assert len(rowSubtypes) > 0, str(self._track.trackName)
    assert len(colSubtypes) > 0, str(self._track2.trackName)

    if 'minimal' in self._kwArgs:
        rowSubtypes = rowSubtypes[:1]
        colSubtypes = colSubtypes[:1]

    cellResults = []
    for rowSubtype in rowSubtypes:
        for colSubtype in colSubtypes:
            rowTrackName = self._track.trackName + [rowSubtype]
            colTrackName = self._track2.trackName + [colSubtype]
            # Verify both subtype tracks exist on disk before constructing either.
            for candidateName in (rowTrackName, colTrackName):
                if not os.path.exists(createDirPath(candidateName, self.getGenome())):
                    raise IncompatibleTracksError
            rowTrack = Track(rowTrackName)
            rowTrack.formatConverters = self._track.formatConverters
            colTrack = Track(colTrackName)
            colTrack.formatConverters = self._track2.formatConverters
            cellResults.append(self._rawStatistic(self._region, rowTrack, colTrack, **childKwArgs).getResult())

    ResultsMemoizer.flushStoredResults()

    resultMatrix = array(cellResults).reshape((len(rowSubtypes), len(colSubtypes)))
    return {'Result': OrderedDict([('Matrix', resultMatrix.tolist()),
                                   ('Rows', rowSubtypes),
                                   ('Cols', colSubtypes)])}
def getValidAnalysesInCategory(category, genome, trackName1, trackName2):
    """
    Return the analyses in the given category that are valid for the supplied
    genome and track pair (reversed-validity matches are included as-is).
    """
    accepted = []
    for candidate in AnalysisManager.getAnalysisDict()[category].values():
        checked, _isReversed = AnalysisManager._tryAnalysisForValidity(
            candidate, genome, trackName1, trackName2)
        if checked is not None:
            accepted.append(checked)
    ResultsMemoizer.flushStoredResults()
    return accepted
def _getSingleResult(self, region):
    """
    Compute the statistic for a single region and return (resultDict, stat).

    CentromerError/NoneResultError yield a None result (re-raised when
    DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS is set). Non-dict results are
    wrapped under GENERAL_RESDICTKEY; None becomes an empty dict.
    """
    statistic = self._statClass(region, self._trackStructure, *self._args, **self._kwArgs)
    try:
        result = statistic.getResult()
    except (CentromerError, NoneResultError):
        result = None
        if DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS:  # @UndefinedVariable
            raise

    # Normalize: always hand back a dict-like result.
    if getClassName(result) not in ['dict', 'OrderedDict']:
        if result is None:
            result = {}
        else:
            result = {self.GENERAL_RESDICTKEY: result}

    ResultsMemoizer.flushStoredResults()
    return result, statistic
class StatJobV2(StatJob):
    """
    StatJob variant that receives a TrackStructure instead of two explicit
    Track arguments. The _track/_track2 properties adapt the structure back
    to the two-track interface expected by the StatJob base class.
    """

    def __init__(self, userBinSource, trackStructure, statClass, *args, **kwArgs):
        # Class-level side effect: records the bin source globally on StatJob.
        StatJob.USER_BIN_SOURCE = userBinSource
        #if 'userBins' in kwArgs:
        #    logMessage('key "userBins" already found in kwArgs in StatJob.__init__')
        #else:
        #    kwArgs['userBins'] = userBinSource
        self._userBinSource = userBinSource
        self._trackStructure = trackStructure
        self._statClass = statClass  # typically a class or functools.partial — TODO confirm
        self._args = args
        self._kwArgs = kwArgs
        self._numUserBins = None  # lazily computed bin count (presumably by an inherited getNumUserBins — verify)

    @property
    def _track(self):
        # First query track of the structure; a query list is mandatory.
        if TrackStructure.QUERY_KEY not in self._trackStructure\
                or not self._trackStructure.getQueryTrackList():
            raise ShouldNotOccurError(
                'Track structure must contain a query list of at least one track')
        return self._trackStructure.getQueryTrackList()[0]

    @property
    def _track2(self):
        # First reference track, or None when no reference list is present.
        if TrackStructure.REF_KEY in self._trackStructure\
                and self._trackStructure.getReferenceTrackList():
            return self._trackStructure.getReferenceTrackList()[0]
        return None

    # def _emptyResults(self):

    def _getSingleResult(self, region):
        """
        Compute the statistic for one region; return (resultDict, stat).
        Non-dict results are wrapped under GENERAL_RESDICTKEY.
        """
        stat = self._statClass(region, self._trackStructure, *self._args, **self._kwArgs)
        try:
            res = stat.getResult()
        except (CentromerError, NoneResultError), e:
            res = None
            # Optionally propagate instead of silently producing an empty result.
            if DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS:
                raise
        #if not isinstance(res, dict):
        if not getClassName(res) in ['dict', 'OrderedDict']:
            res = {} if res is None else {self.GENERAL_RESDICTKEY: res}
            #res = {self.GENERAL_RESDICTKEY : res}
        ResultsMemoizer.flushStoredResults()
        return res, stat
class StatisticTaskWrapper(TaskWrapper):
    """
    Task wrapper that materializes a statistic from a task, computes it, and
    reports the memoized results. Keeps a reference to the last statistic so
    it is not garbage collected before its memo entries are harvested.
    """

    def __init__(self):
        TaskWrapper.__init__(self)
        self.GENERAL_RESDICTKEY = "Result"
        self.referenceKeeper = None  # holds the most recent statistic instance

    def handleTask(self, task):
        """Compute the task's statistic and return the memo-derived result dict."""
        statistic = task.toStatistic()
        try:
            result = statistic.getResult()
        except (CentromerError, NoneResultError):
            result = None

        # Normalize non-dict results; None becomes an empty dict.
        if not isinstance(result, dict):
            if result is None:
                result = {}
            else:
                result = {self.GENERAL_RESDICTKEY: result}

        ResultsMemoizer.flushStoredResults()
        # Keep the statistic alive past cleanup (prevents premature GC).
        self.referenceKeeper = statistic
        statistic.afterComputeCleanup()
        return self._createResultsDictFromMemoDict()
def _loadMemoizedResult(self):
    """Clear the loaded-from-disk flag, then ask the memoizer to load this result."""
    self.resetResultLoadedFromDiskFlag()
    ResultsMemoizer.loadResult(self)
def _storeMemoizedResult(self):
    """Delegate persisting this statistic's result to the memoizer."""
    ResultsMemoizer.storeResult(self)
def _loadMinimalMemoizedResult(self):
    """Clear the loaded-from-disk flag and load the minimal memoized result,
    returning whatever the memoizer reports."""
    self.resetResultLoadedFromDiskFlag()
    return ResultsMemoizer.loadMinimalResult(self)
def _storeMinimalMemoizedResult(self):
    """Delegate persisting the minimal memoized result to the memoizer."""
    ResultsMemoizer.storeMinimalResult(self)
def _storeMinimalMemoizedError(self):
    """Record the currently-handled exception as a minimal memoized error."""
    currentExcInfo = sys.exc_info()
    ResultsMemoizer.storeMinimalError(self, currentExcInfo)
def _loadMemoized(self):
    """
    Best-effort load of a memoized result from disk; concurrent writers can
    trigger an IOError, which is logged once and otherwise ignored.
    """
    self.resultLoadedFromDisk = False
    try:
        ResultsMemoizer.loadResult(self)
    except IOError as loadErr:
        message = 'No memoization due to IOError (probably because some other process are writing same data): ' + str(loadErr)
        logMessageOnce(message)
def run(self, printProgress=PRINT_PROGRESS):
    """Run the job via StatJob.run, then flush memoized results to disk."""
    jobResult = StatJob.run(self, printProgress=printProgress)
    ResultsMemoizer.flushStoredResults()
    return jobResult
# NOTE(review): this chunk starts mid-method — the enclosing
# 'def _getSingleResult(self, region):' line lies outside the visible excerpt.
# Compute the statistic for one region; non-dict results get wrapped below.
stat = self._statClass(region, self._trackStructure, *self._args, **self._kwArgs)
try:
    res = stat.getResult()
except (CentromerError, NoneResultError), e:
    res = None
    if DebugConfig.VERBOSE:
        logException(e, level=logging.DEBUG)
    # Optionally propagate instead of silently producing an empty result.
    if DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS:
        raise
#if not isinstance(res, dict):
if not getClassName(res) in ['dict', 'OrderedDict']:
    res = {} if res is None else {self.GENERAL_RESDICTKEY : res}
    #res = {self.GENERAL_RESDICTKEY : res}
ResultsMemoizer.flushStoredResults()
return res, stat

def getNumUserBins(self):
    """Return the number of user bins, counting the bin source lazily once."""
    if self._numUserBins is None:
        self._numUserBins = sum(1 for el in self._userBinSource)
    return self._numUserBins

def _checkNumUserBinsIsValid(self):
    """Raise InvalidFormatError unless 1 <= numUserBins <= MAX_NUM_USER_BINS
    (the upper bound is skipped when user-bin memoization is avoided)."""
    numUserBins = self.getNumUserBins()
    if numUserBins < 1:
        raise InvalidFormatError('Zero analysis bins specified.')
        #return False
    elif numUserBins > MAX_NUM_USER_BINS and not self._avoidUbStatMemoization():
        raise InvalidFormatError('Maximum number of user bins exceeded - Maximum: '+str(MAX_NUM_USER_BINS)+ ', Requested: '+str(numUserBins))
        #return False
def _storeResult(self):
    """
    Best-effort store of this statistic's result; an IOError from a
    concurrent reader/writer is logged once and swallowed.
    """
    try:
        ResultsMemoizer.storeResult(self)
    except IOError as storeErr:
        warningText = 'No memoization due to IOError (probably because some other process are reading/writing same data) '
        logExceptionOnce(storeErr, message=warningText)
def _getSingleResult(self, region):
    """
    Compute the statistic for one region against the two tracks and return
    (resultDict, stat). Non-dict results are wrapped under GENERAL_RESDICTKEY;
    None becomes an empty dict.
    """
    statistic = self._statClass(region, self._track, self._track2, *self._args, **self._kwArgs)
    try:
        result = statistic.getResult()
    except (CentromerError, NoneResultError):
        result = None
        if DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS:
            raise

    if getClassName(result) not in ['dict', 'OrderedDict']:
        if result is None:
            result = {}
        else:
            result = {self.GENERAL_RESDICTKEY: result}

    ResultsMemoizer.flushStoredResults()
    return result, statistic

def getNumUserBins(self):
    """Return the number of user bins, counting the bin source lazily once."""
    if self._numUserBins is None:
        binCount = 0
        for _ in self._userBinSource:
            binCount += 1
        self._numUserBins = binCount
    return self._numUserBins

def _checkNumUserBinsIsValid(self):
    """Raise InvalidFormatError unless 1 <= numUserBins <= MAX_NUM_USER_BINS
    (the upper bound is skipped when user-bin memoization is avoided)."""
    numUserBins = self.getNumUserBins()
    if numUserBins < 1:
        raise InvalidFormatError('Zero analysis bins specified.')
    if numUserBins > MAX_NUM_USER_BINS and not self._avoidUbStatMemoization():
        raise InvalidFormatError('Maximum number of user bins exceeded - Maximum: ' +
                                 str(MAX_NUM_USER_BINS) + ', Requested: ' + str(numUserBins))
def run(self, printProgress=PRINT_PROGRESS):
    """Run the job via StatJob.run, then flush memoized results to disk.

    NOTE(review): printProgress is passed positionally here; if StatJob.run's
    first parameter is not printProgress (some variants take 'reset' first),
    this would bind the wrong argument — verify against the base class.
    """
    jobResult = StatJob.run(self, printProgress)
    ResultsMemoizer.flushStoredResults()
    return jobResult
class StatJob(object):
    """
    Drives the evaluation of a statistic over a set of user bins: selects a
    progress reporter, optionally dispatches to the parallel job handler, runs
    iterative local and global analysis rounds (for MC-style statistics), and
    flushes memoized results.
    """
    GENERAL_RESDICTKEY = 'Result'
    USER_BIN_SOURCE = None  # class-level record of the last bin source supplied

    #@takes(StatJob, UserBinSource, Track, Track, Statistic)
    #statClass will typically be a functools.partial object
    def __init__(self, userBinSource, track, track2, statClass, *args, **kwArgs):
        #Not relevant, as minimal runs are anyway done
        #if StatJob.USER_BIN_SOURCE != None:
        #logMessage('USER_BIN_SOURCE already set in StatJob')
        StatJob.USER_BIN_SOURCE = userBinSource
        #if 'userBins' in kwArgs:
        #    logMessage('key "userBins" already found in kwArgs in StatJob.__init__')
        #else:
        #    kwArgs['userBins'] = userBinSource
        self._userBinSource = userBinSource
        self._track = track
        self._track2 = track2  # may be None for single-track analyses (see run/_initProgress)
        self._statClass = statClass
        self._args = args
        self._kwArgs = kwArgs
        self._numUserBins = None  # lazily computed by getNumUserBins

    def _initProgress(self, printProgress):
        """Pick a progress reporter class matching the MC sampling scheme (if
        any) and instantiate self._progress for this run."""
        if hasattr(self._statClass, 'keywords'):
            #since kwArgs to Statistic usually has been wrapped in by functools.partial.
            statKwArgs = self._statClass.keywords
        else:
            statKwArgs = self._kwArgs
        from quick.statistic.McFdrSamplingStat import McFdrSamplingStat
        from quick.statistic.SequentialMcSamplingStat import SequentialMcSamplingStat
        #if self._kwArgs.get('minimal') == True or statKwArgs.get('silentProgress') == 'yes': #minimal is in kwArgs to StatJob
        if self._kwArgs.get('minimal') == True:  #minimal is in kwArgs to StatJob
            progressClass = SilentProgress
        #elif self._kwArgs.get('numResamplings') < self._kwArgs.get('maxSamples'):
        #elif self._statClass.keywords.get('numResamplings') < self._statClass.keywords.get('maxSamples'): #since kwArgs to Statistic has been wrapped in by functools.partial.
        elif statKwArgs.get('mcSamplerClass') in ['McFdrSamplingStat', McFdrSamplingStat]:
            progressClass = McFdrProgress
        elif statKwArgs.get('mcSamplerClass') in ['SequentialMcSamplingStat', SequentialMcSamplingStat]:
            progressClass = SequentialMcProgress
        elif RandomizationManagerStat.getMcSamplingScheme(statKwArgs) == 'Sequential MC':
            progressClass = SequentialMcProgress
        elif RandomizationManagerStat.getMcSamplingScheme(statKwArgs) == 'MCFDR':
            progressClass = McFdrProgress
        else:
            #print 'KWARGS: ',self._kwArgs, self._args
            progressClass = StandardProgress
        #self._progress = progressClass(self.getNumUserBins(), printProgress, description=\
        #    '<p><b>Analyzing ' + str(self._track.trackName) + \
        #    (' vs ' + str(self._track2.trackName) if self._track2 is not None else '') + ' using statistic: ' + \
        #    self._statClass.__name__ + '</b><br><br> Performing local analysis: <br>')
        # numSamplesPerChunk comes from the analysis object when available,
        # otherwise from the job's kwArgs.
        if hasattr(self, '_analysis'):
            nspi = self._analysis.getChoice('numSamplesPerChunk')
        else:
            nspi = self._kwArgs.get('numSamplesPerChunk')
        self._progress = progressClass(self.getNumUserBins(), printProgress, description=\
            '<b>Analyzing ' + str(self._track.trackName) + \
            (' vs ' + str(self._track2.trackName) if self._track2 is not None else '') + ' using statistic: ' + \
            self._statClass.__name__ + '</b>\n\nPerforming local analysis:',\
            numSamplesPerIteration=nspi)
        self._progress.addCount(0)

    def run(self, reset=True, printProgress=PRINT_PROGRESS, flushMemoized=True):
        """
        Execute the full job: validate bin count, optionally hand off to the
        parallel job handler, then loop local analysis until results are
        determined, followed by the global analysis loop. Returns the results
        container (with any error recorded on it).
        """
        if reset:
            MagicStatFactory.resetMemoDict()
        results = self._emptyResults()
        try:
            self._checkNumUserBinsIsValid()
        except Exception, e:
            # Invalid bin count: record the error and bail out early.
            results.addError(e)
            return results
        self._initProgress(printProgress)
        if USE_PARALLEL and not ('minimal' in self._kwArgs and self._kwArgs['minimal']):
            from quick.application.parallel.JobHandler import JobHandler
            from quick.application.parallel.JobWrapper import JobWrapperFactory
            jobWrapper = JobWrapperFactory.makeJobWrapper(self)
            #If we are using another statistic wrapper (i.e. monte carlo), the parallelization happens
            #"further down" due to how the HB works and we do not do anything here
            if jobWrapper.__class__.__name__ == "StatisticJobWrapper":
                if 'uniqueId' in self._kwArgs:
                    uniqueId = self._kwArgs["uniqueId"]
                else:
                    uniqueId = datetime.datetime.now().strftime('%Y%m%d-%H%M%S%f')
                jobHandler = JobHandler(uniqueId, True, restrictions=[])
                jobHandler.run(jobWrapper)
        startLocal = time.time()
        try:
            # Local analysis loop: repeat until all local results are determined
            # (MC-style statistics may need several rounds), or once for minimal runs.
            while True:
                stats = self._doLocalAnalysis(results, stats=[])
                #stats[0] is used to call class method
                if self._kwArgs.get('minimal') == True:
                    break
                numNotDetermined = stats[0].validateAndPossiblyResetLocalResults(stats) if len(stats) > 0 else 0
                self._progress.addFullLocalAnalysisIteration(numNotDetermined)
                if len(stats) == 0 or numNotDetermined == 0:
                    break
                #print 'Continuing McFdr'
            localAnalysisTime = time.time() - startLocal
            if USE_PARALLEL and not ('minimal' in self._kwArgs and self._kwArgs['minimal']):
                print "local analysis took %f seconds" % localAnalysisTime
            #startGlobal = time.time()
            #import pdb
            #pdb.set_trace()
            self._progress.globalAnalysisStarted()
            self._progress.printMessage('\nPerforming global analysis...')
            # Global analysis loop: iterate until the global result is determined.
            while True:
                stat = self._doGlobalAnalysis(results, stats)
                if stat is None:
                    break
                nonDetermined, mValue, mThreshold, pValue, pThreshold = stat.validateAndPossiblyResetGlobalResult(stat)
                self._progress.addGlobalAnalysisIteration(mValue, mThreshold, pValue, pThreshold)
                if nonDetermined == 0:
                    break
        finally:
            # Always flush memoized results (unless disabled) and close progress.
            if flushMemoized:
                ResultsMemoizer.flushStoredResults()
            self._progress.globalAnalysisEnded()
            #print "<br>global analysis took %f seconds" % (time.time() - startGlobal)
            self._endProgress()
        return results