def load_batch_local_params_from_memory(self, batchID, doCopy=0):
    ''' Load local parameter dict stored in memory for provided batchID

    TODO: Fastforward so recent truncation changes are accounted for.

    Returns
    -------
    batchLP : dict of local parameters specific to batchID
    '''
    batchLP = self.LPmemory[batchID]
    if isinstance(batchLP, str):
        ElapsedTimeLogger.startEvent('io', 'loadlocal')
        batchLPpath = os.path.abspath(batchLP)
        assert os.path.exists(batchLPpath)
        F = np.load(batchLPpath, allow_pickle=True)
        indptr = np.arange(
            0, (F['D'] + 1) * F['nnzPerDoc'], F['nnzPerDoc'])
        batchLP = dict()
        batchLP['DocTopicCount'] = scipy.sparse.csr_matrix(
            (F['data'], F['indices'], indptr),
            shape=(F['D'], F['K'])).toarray()
        ElapsedTimeLogger.stopEvent('io', 'loadlocal')
    if doCopy:
        # Duplicating to avoid changing the raw data stored in LPmemory
        # Usually for debugging only
        batchLP = copy.deepcopy(batchLP)
    return batchLP
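# Illustrative sketch (made-up numbers, not part of the algorithm): the .npz
# written by save_batch_local_params_to_memory stores exactly nnzPerDoc values
# per document, so the CSR row pointer is just a uniform arange. For D=2 docs,
# K=4 topics, nnzPerDoc=2:
#
#   data    = np.asarray([3., 1., 2., 2.])
#   indices = np.asarray([0, 2, 1, 3])
#   indptr  = np.arange(0, (2 + 1) * 2, 2)    # array([0, 2, 4])
#   scipy.sparse.csr_matrix((data, indices, indptr), shape=(2, 4)).toarray()
#   # array([[3., 0., 1., 0.],
#   #        [0., 2., 0., 2.]])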
def eval_custom_func(self, isFinal=0, isInitial=0, lapFrac=0, **kwargs):
    ''' Evaluates a custom hook function '''
    cFuncPath = self.outputParams['customFuncPath']
    if cFuncPath is None or cFuncPath == 'None':
        return None
    cbName = str(cFuncPath)
    ElapsedTimeLogger.startEvent('callback', cbName)
    cFuncArgs_string = self.outputParams['customFuncArgs']
    nLapTotal = self.algParams['nLap']
    if isinstance(cFuncPath, str):
        cFuncPath = cFuncPath.replace(".py", "")
        pathParts = cFuncPath.split(os.path.sep)
        if len(pathParts) > 1:
            # Absolute path provided
            cFuncDir = os.path.expandvars(os.path.sep.join(pathParts[:-1]))
            sys.path.append(cFuncDir)
            cFuncModName = pathParts[-1]
            cFuncModule = __import__(cFuncModName, fromlist=[])
        else:
            # Treat as relative path to file in bnpy.callbacks
            cFuncModule = __import__(
                'bnpy.callbacks', fromlist=[cFuncPath])
            cFuncModule = getattr(cFuncModule, cFuncPath)
    else:
        cFuncModule = cFuncPath  # directly passed in as object
    kwargs['nLap'] = self.algParams['nLap']
    kwargs['lapFrac'] = lapFrac
    kwargs['isFinal'] = isFinal
    kwargs['isInitial'] = isInitial
    if isInitial:
        kwargs['lapFrac'] = 0
        kwargs['iterid'] = 0
    hasCBFuncs = hasattr(cFuncModule, 'onBatchComplete') or \
        hasattr(cFuncModule, 'onLapComplete') or \
        hasattr(cFuncModule, 'onAlgorithmComplete')
    if not hasCBFuncs:
        raise ValueError("Specified customFuncPath has no callbacks!")
    if hasattr(cFuncModule, 'onBatchComplete') and not isFinal:
        cFuncModule.onBatchComplete(args=cFuncArgs_string, **kwargs)
    if hasattr(cFuncModule, 'onLapComplete') \
            and isEvenlyDivisibleFloat(lapFrac, 1.0) and not isFinal:
        cFuncModule.onLapComplete(args=cFuncArgs_string, **kwargs)
    if hasattr(cFuncModule, 'onAlgorithmComplete') \
            and isFinal:
        cFuncModule.onAlgorithmComplete(args=cFuncArgs_string, **kwargs)
    ElapsedTimeLogger.stopEvent('callback', cbName)
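# A minimal sketch of a callback module usable with customFuncPath. The file
# name my_callbacks.py and the hook bodies are hypothetical; only the hook
# names and the args/kwargs passed by eval_custom_func come from the code
# above:
#
#   # my_callbacks.py
#   def onLapComplete(args='', lapFrac=0, **kwargs):
#       print('finished lap %.2f' % lapFrac)
#
#   def onAlgorithmComplete(args='', **kwargs):
#       print('training complete')
#
# eval_custom_func imports the module, checks that at least one of
# onBatchComplete / onLapComplete / onAlgorithmComplete is defined, and calls
# the matching hooks with args=customFuncArgs plus the keyword arguments set
# above (nLap, lapFrac, isFinal, isInitial, ...).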
def calc_local_params(self, Data, LP=None, doLogElapsedTime=False, **kwargs):
    ''' Calculate local parameters specific to each data item.

    This is the E-step of the EM algorithm.
    '''
    if LP is None:
        LP = dict()
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('local', 'obsupdate')
    # Calculate "soft evidence" each component has for each item
    # Fills in LP['E_log_soft_ev'], N x K array
    LP = self.obsModel.calc_local_params(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'obsupdate')
        ElapsedTimeLogger.startEvent('local', 'allocupdate')
    # Combine with allocModel probs of each cluster
    # Fills in LP['resp'], N x K array whose rows sum to one
    LP = self.allocModel.calc_local_params(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'allocupdate')
    return LP
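# Illustrative sketch only: for a plain finite mixture with fixed log-weights
# log_w (an assumption for illustration; each bnpy allocModel implements its
# own update), the two stages above amount to adding the allocation log-prior
# to E_log_soft_ev and normalizing each row:
#
#   logResp = LP['E_log_soft_ev'] + log_w[np.newaxis, :]
#   logResp -= logResp.max(axis=1, keepdims=True)        # numerical stability
#   LP['resp'] = np.exp(logResp)
#   LP['resp'] /= LP['resp'].sum(axis=1, keepdims=True)  # rows sum to one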
def calc_evidence(self, Data=None, SS=None, LP=None, scaleFactor=None,
                  todict=False, doLogElapsedTime=False, **kwargs):
    ''' Compute evidence lower bound (ELBO) objective function.
    '''
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('global', 'ev')
    if Data is not None and LP is None and SS is None:
        LP = self.calc_local_params(Data, **kwargs)
        SS = self.get_global_suff_stats(Data, LP)
    evA = self.allocModel.calc_evidence(
        Data, SS, LP, todict=todict, **kwargs)
    evObs = self.obsModel.calc_evidence(
        Data, SS, LP, todict=todict, **kwargs)
    if scaleFactor is None:
        if hasattr(SS, 'scaleFactor'):
            scaleFactor = SS.scaleFactor
        else:
            scaleFactor = self.obsModel.getDatasetScale(SS)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('global', 'ev')
    if todict:
        evA.update(evObs)
        for key in evA:
            evA[key] /= scaleFactor
        # Identify unique keys, ignoring subdivided terms.
        # E.g. Lalloc_top_term1 and Lalloc_top_term2 are not counted,
        # since we expect they are already aggregated in term Lalloc.
        ukeys = list(set([key.split('_')[0] for key in evA.keys()]))
        evA['Ltotal'] = sum([evA[key] for key in ukeys])
        return evA
    else:
        return (evA + evObs) / scaleFactor
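# Illustrative sketch of the key aggregation in the todict branch above,
# with made-up ELBO terms:
#
#   evA = {'Lalloc': -4.0, 'Lalloc_top_term1': -1.0, 'Ldata': -6.0}
#   ukeys = list(set([key.split('_')[0] for key in evA.keys()]))
#   # ukeys == ['Lalloc', 'Ldata'] (in some order)
#   # Ltotal = evA['Lalloc'] + evA['Ldata'] = -10.0
#
# The subdivided term 'Lalloc_top_term1' is skipped because it is assumed
# to already be folded into 'Lalloc'.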
def saveParams(self, lap, hmodel, SS=None, **kwargs):
    ''' Save current model to disk
    '''
    if lap in self.SavedIters or self.task_output_path is None:
        return
    ElapsedTimeLogger.startEvent("io", "saveparams")
    self.SavedIters.add(lap)
    prefix = ModelWriter.makePrefixForLap(lap)
    with open(self.mkfile('snapshot_lap.txt'), 'a') as f:
        f.write(six.text_type('%.4f\n' % (lap)))
    with open(self.mkfile('snapshot_elapsed_time_sec.txt'), 'a') as f:
        f.write(six.text_type('%.3f\n' % (self.get_elapsed_time())))
    if self.outputParams['doSaveFullModel']:
        ModelWriter.save_model(
            hmodel, self.task_output_path, prefix,
            doSavePriorInfo=np.allclose(lap, 0.0),
            doLinkBest=True,
            doSaveObsModel=self.outputParams['doSaveObsModel'])
    if self.outputParams['doSaveTopicModel']:
        ModelWriter.saveTopicModel(
            hmodel, SS, self.task_output_path, prefix, **kwargs)
    ElapsedTimeLogger.stopEvent("io", "saveparams")
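# Illustrative note (made-up values): each call appends one line to two
# plain-text logs created via self.mkfile, so row i of the two files together
# gives the lap and elapsed wall-clock time of the i-th saved snapshot:
#
#   snapshot_lap.txt        snapshot_elapsed_time_sec.txt
#   1.0000                  12.345
#   2.0000                  24.680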
def save_batch_local_params_to_memory(self, batchID, batchLP):
    ''' Store certain fields of the provided local parameters dict
        into "memory" for later retrieval.

    Fields to save determined by the memoLPkeys attribute of this alg.
    '''
    batchLP = dict(**batchLP)  # make a copy
    allkeys = list(batchLP.keys())
    for key in allkeys:
        if key != 'DocTopicCount':
            del batchLP[key]
    if len(list(batchLP.keys())) > 0:
        if self.algParams['doMemoizeLocalParams'] == 1:
            self.LPmemory[batchID] = batchLP
        elif self.algParams['doMemoizeLocalParams'] == 2:
            ElapsedTimeLogger.startEvent('io', 'savelocal')
            spDTC = sparsifyResp(
                batchLP['DocTopicCount'],
                self.algParams['nnzPerDocForStorage'])
            wc_D = batchLP['DocTopicCount'].sum(axis=1)
            wc_U = np.repeat(wc_D, self.algParams['nnzPerDocForStorage'])
            spDTC.data *= wc_U
            savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
            if os.path.exists('/ltmp/'):
                savepath = '/ltmp/%s/' % (savepath)
            else:
                savepath = '/tmp/%s/' % (savepath)
            from distutils.dir_util import mkpath
            mkpath(savepath)
            savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
            # Now actually save it!
            np.savez(savepath,
                     data=spDTC.data,
                     indices=spDTC.indices,
                     D=spDTC.shape[0],
                     K=spDTC.shape[1],
                     nnzPerDoc=spDTC.indptr[1])
            self.LPmemory[batchID] = savepath
            del batchLP
            del spDTC
            ElapsedTimeLogger.stopEvent('io', 'savelocal')
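# Illustrative sketch (made-up numbers) of why spDTC.data is rescaled before
# saving: the rescaling by the per-document totals wc_D only makes sense if
# sparsifyResp returns the top nnzPerDoc entries of each row renormalized to
# sum to one, so multiplying them back by that row's original total recovers
# (approximate) counts. For one doc with DocTopicCount row [3., 1., 0., 0.]
# and nnzPerDocForStorage=2:
#
#   kept fractions : [0.75, 0.25]    (top-2 entries of the row, renormalized)
#   wc_D           : 4.0             (row sum)
#   stored data    : [3.0, 1.0]      (fractions * wc_D)
#
# load_batch_local_params_from_memory can then rebuild DocTopicCount from the
# saved .npz without any further rescaling.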
def update_global_params(self, SS, rho=None,
                         doLogElapsedTime=False, **kwargs):
    ''' Update (in-place) global parameters given provided suff stats.

    This is the M-step of EM.
    '''
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('global', 'alloc')
    self.allocModel.update_global_params(SS, rho, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('global', 'alloc')
        ElapsedTimeLogger.startEvent('global', 'obs')
    self.obsModel.update_global_params(SS, rho, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('global', 'obs')
def get_global_suff_stats(self, Data, LP, doLogElapsedTime=False, **kwargs):
    ''' Calculate sufficient statistics for each component.

    These stats summarize the data and local parameters
    assigned to each component.

    This is necessary prep for the Global Step update.
    '''
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('local', 'allocsummary')
    SS = self.allocModel.get_global_suff_stats(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'allocsummary')
        ElapsedTimeLogger.startEvent('local', 'obssummary')
    SS = self.obsModel.get_global_suff_stats(Data, SS, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'obssummary')
    return SS
def _run_task_internal(jobname, taskid, nTask,
                       ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)

    Executes learning for a particular job and particular taskid.

    Returns
    -------
    hmodel : bnpy HModel, fit to the data
    LP : Local parameter (LP) dict for the specific dataset
    RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of input dicts, so that any modifications here
    # do not propagate back to the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(
                dataName, ReqArgs, KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs, dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)

    if algName in OnlineDataAlgSet:
        OnlineDataArgs = KwArgs['OnlineDataPrefs']
        OnlineDataArgs['dataorderseed'] = dataorderseed
        DataArgs = getKwArgsForLoadData(Data, UnkArgs)
        OnlineDataArgs.update(DataArgs)  # add custom args
        Data = Data.to_iterator(**OnlineDataArgs)

    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name

    if doSaveToDisk:
        task_output_path = make_task_output_path(
            ReqArgs, KwArgs, taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(
        task_output_path, taskid, doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)
        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(
        Data, hmodel, ReqArgs, KwArgs,
        algseed=algseed, task_output_path=task_output_path)
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if str(type(hmodel.allocModel)).count('TopicModel'):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(
            task_output_path, doSaveToDisk, doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(
        task_output_path, KwArgs['MoveNames'], doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d'
        % (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
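# Illustrative usage note: per the docstring of _run_task_internal, the
# RunInfo dict returned by learnAlg.fit is expected to carry at least 'loss'
# and 'loss_history', and the three argument dicts are attached just before
# returning. A caller could therefore do, e.g.:
#
#   hmodel, RunInfo = _run_task_internal(...)
#   final_loss = RunInfo['loss']
#   kwargs_used = RunInfo['KwArgs']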
def fit(self, hmodel, DataIterator, SS=None):
    ''' Run stochastic variational Bayes to fit hmodel parameters to Data.

    Returns
    --------
    Info : dict of run information.

    Post Condition
    --------
    hmodel updated in place with improved global parameters.
    '''
    self.set_start_time_now()
    LP = None
    rho = 1.0  # Learning rate
    nBatch = float(DataIterator.nBatch)

    # Set up progress-tracking variables
    iterid = -1
    lapFrac = np.maximum(0, self.algParams['startLap'] - 1.0 / nBatch)
    if lapFrac > 0:
        # When restarting an existing run,
        # need to start with last update for final batch from previous lap
        DataIterator.lapID = int(np.ceil(lapFrac)) - 1
        DataIterator.curLapPos = nBatch - 2
        iterid = int(nBatch * lapFrac) - 1

    # Save initial state
    self.saveParams(lapFrac, hmodel)

    # Custom func hook
    self.eval_custom_func(
        isInitial=1, **makeDictOfAllWorkspaceVars(**vars()))
    ElapsedTimeLogger.writeToLogOnLapCompleted(lapFrac)

    if self.algParams['doMemoELBO']:
        SStotal = None
        SSPerBatch = dict()
    else:
        loss_running_sum = 0
        loss_per_batch = np.zeros(int(nBatch))

    while DataIterator.has_next_batch():

        # Grab new data
        Dchunk = DataIterator.get_next_batch()
        batchID = DataIterator.batchID
        Dchunk.batchID = batchID

        # Update progress-tracking variables
        iterid += 1
        lapFrac += 1.0 / nBatch
        self.lapFrac = lapFrac
        nLapsCompleted = lapFrac - self.algParams['startLap']
        self.set_random_seed_at_lap(lapFrac)

        # E step
        self.algParamsLP['batchID'] = batchID
        self.algParamsLP['lapFrac'] = lapFrac  # logging
        if batchID in self.LPmemory:
            batchLP = self.load_batch_local_params_from_memory(batchID)
        else:
            batchLP = None
        LP = hmodel.calc_local_params(
            Dchunk, batchLP, doLogElapsedTime=True, **self.algParamsLP)

        rho = (1 + iterid + self.rhodelay) ** (-1.0 * self.rhoexp)

        if self.algParams['doMemoELBO']:
            # SS step. Scale at size of current batch.
            SS = hmodel.get_global_suff_stats(
                Dchunk, LP,
                doLogElapsedTime=True,
                doPrecompEntropy=True)
            if self.algParams['doMemoizeLocalParams']:
                self.save_batch_local_params_to_memory(batchID, LP)
            # Incremental updates for whole-dataset stats
            # Must happen before amplification.
            if batchID in SSPerBatch:
                SStotal -= SSPerBatch[batchID]
            if SStotal is None:
                SStotal = SS.copy()
            else:
                SStotal += SS
            SSPerBatch[batchID] = SS.copy()
            # Scale up to size of whole dataset.
            if hasattr(Dchunk, 'nDoc'):
                ampF = Dchunk.nDocTotal / float(Dchunk.nDoc)
                SS.applyAmpFactor(ampF)
            else:
                ampF = Dchunk.nObsTotal / float(Dchunk.nObs)
                SS.applyAmpFactor(ampF)
            # M step with learning rate
            hmodel.update_global_params(SS, rho, doLogElapsedTime=True)
            # ELBO step
            assert not SStotal.hasAmpFactor()
            loss = -1 * hmodel.calc_evidence(
                SS=SStotal,
                doLogElapsedTime=True,
                afterGlobalStep=not self.algParams['useSlackTermsInELBO'])
        else:
            # SS step. Scale at size of current batch.
            SS = hmodel.get_global_suff_stats(
                Dchunk, LP, doLogElapsedTime=True)
            # Scale up to size of whole dataset.
            if hasattr(Dchunk, 'nDoc'):
                ampF = Dchunk.nDocTotal / float(Dchunk.nDoc)
                SS.applyAmpFactor(ampF)
            else:
                ampF = Dchunk.nObsTotal / float(Dchunk.nObs)
                SS.applyAmpFactor(ampF)
            # M step with learning rate
            hmodel.update_global_params(SS, rho, doLogElapsedTime=True)
            # ELBO step
            assert SS.hasAmpFactor()
            cur_batch_loss = -1 * hmodel.calc_evidence(
                Dchunk, SS, LP, doLogElapsedTime=True)
            if loss_per_batch[batchID] != 0:
                loss_running_sum -= loss_per_batch[batchID]
            loss_running_sum += cur_batch_loss
            loss_per_batch[batchID] = cur_batch_loss
            loss = loss_running_sum / nBatch

        # Display progress
        self.updateNumDataProcessed(Dchunk.get_size())
        if self.isLogCheckpoint(lapFrac, iterid):
            self.printStateToLog(hmodel, loss, lapFrac, iterid, rho=rho)

        # Save diagnostics and params
        if self.isSaveDiagnosticsCheckpoint(lapFrac, iterid):
            self.saveDiagnostics(lapFrac, SS, loss)
        if self.isSaveParamsCheckpoint(lapFrac, iterid):
            # Don't save SS here, since it's for one batch only.
            self.saveParams(lapFrac, hmodel, tryToSparsifyOutput=1)

        self.eval_custom_func(**makeDictOfAllWorkspaceVars(**vars()))

        if self.isLastBatch(lapFrac):
            ElapsedTimeLogger.writeToLogOnLapCompleted(lapFrac)
    # ... end loop over data

    # Finished! Save, print and exit
    self.printStateToLog(hmodel, loss, lapFrac, iterid, isFinal=1)
    self.saveParams(lapFrac, hmodel, SS)
    self.eval_custom_func(
        isFinal=1, **makeDictOfAllWorkspaceVars(**vars()))

    return self.buildRunInfo(Data=DataIterator, loss=loss, SS=SS)
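# Illustrative sketch of the step-size schedule used in fit() above. The
# attributes rhodelay and rhoexp are set elsewhere on this algorithm object;
# the numeric values below are made up for illustration only:
#
#   rho_t = (1 + t + rhodelay) ** (-rhoexp)     # t = iterid, one per batch
#
#   t = np.arange(5)
#   (1 + t + 1.0) ** (-0.55)
#   # approx. array([0.683, 0.547, 0.467, 0.413, 0.373])
#
# Each minibatch M-step passes rho_t to update_global_params as the learning
# rate, so early batches move the global parameters substantially while later
# batches make smaller and smaller corrections. In the non-memoized branch,
# the reported loss is the sum of the most recent per-batch losses divided by
# nBatch.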