def calc_local_params(self, Data, LP=None, doLogElapsedTime=False, **kwargs):
    ''' Run the E-step: compute local parameters for each data item.

    The observation model first scores every item under every component,
    then the allocation model converts those scores into responsibilities.

    Returns
    -------
    LP : dict of local parameters,
        with fields 'E_log_soft_ev' and 'resp' filled in (both N x K).
    '''
    LP = dict() if LP is None else LP
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('local', 'obsupdate')
    # Observation model fills LP['E_log_soft_ev'], an N x K array of
    # per-item "soft evidence" for each component.
    LP = self.obsModel.calc_local_params(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'obsupdate')
        ElapsedTimeLogger.startEvent('local', 'allocupdate')
    # Allocation model combines with cluster probabilities and fills
    # LP['resp'], an N x K array whose rows sum to one.
    LP = self.allocModel.calc_local_params(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'allocupdate')
    return LP
def load_batch_local_params_from_memory(self, batchID, doCopy=0):
    ''' Load local parameter dict stored in memory for provided batchID

    TODO: Fastforward so recent truncation changes are accounted for.

    Returns
    -------
    batchLP : dict of local parameters specific to batchID
    '''
    stored = self.LPmemory[batchID]
    if isinstance(stored, str):
        # Entry is a filesystem path: reload the sparse DocTopicCount
        # array saved earlier and densify it.
        ElapsedTimeLogger.startEvent('io', 'loadlocal')
        npzPath = os.path.abspath(stored)
        assert os.path.exists(npzPath)
        F = np.load(npzPath, allow_pickle=True)
        nnz = F['nnzPerDoc']
        # Each document row holds exactly nnz stored entries, so the CSR
        # row pointer is a uniform arithmetic progression of length D+1.
        indptr = np.arange(0, (F['D'] + 1) * nnz, nnz)
        spMat = scipy.sparse.csr_matrix(
            (F['data'], F['indices'], indptr),
            shape=(F['D'], F['K']))
        stored = dict(DocTopicCount=spMat.toarray())
        ElapsedTimeLogger.stopEvent('io', 'loadlocal')
    if doCopy:
        # Duplicating to avoid changing the raw data stored in LPmemory
        # Usually for debugging only
        return copy.deepcopy(stored)
    return stored
def eval_custom_func(self, isFinal=0, isInitial=0, lapFrac=0, **kwargs):
    ''' Evaluate a custom hook function, if one is configured.

    The hook is located via outputParams['customFuncPath'], which may be
    a filesystem path to a .py module, the bare name of a module inside
    bnpy.callbacks, or an already-imported module object. Whichever of
    the callbacks onBatchComplete / onLapComplete / onAlgorithmComplete
    the module defines are invoked as appropriate for this lap.

    Returns
    -------
    None. Called only for the side effects of the callbacks.

    Raises
    ------
    ValueError
        If the resolved module defines none of the expected callbacks.
    '''
    cFuncPath = self.outputParams['customFuncPath']
    if cFuncPath is None or cFuncPath == 'None':
        return None
    cbName = str(cFuncPath)
    ElapsedTimeLogger.startEvent('callback', cbName)
    # try/finally guarantees the timer event is closed even when module
    # resolution or a callback raises; previously an exception here left
    # the 'callback' event open and corrupted subsequent timing logs.
    try:
        cFuncArgs_string = self.outputParams['customFuncArgs']
        cFuncModule = self._resolve_custom_func_module(cFuncPath)
        kwargs['nLap'] = self.algParams['nLap']
        kwargs['lapFrac'] = lapFrac
        kwargs['isFinal'] = isFinal
        kwargs['isInitial'] = isInitial
        if isInitial:
            # At initialization, force lap/iteration counters to zero.
            kwargs['lapFrac'] = 0
            kwargs['iterid'] = 0
        hasCBFuncs = hasattr(cFuncModule, 'onBatchComplete') or \
            hasattr(cFuncModule, 'onLapComplete') or \
            hasattr(cFuncModule, 'onAlgorithmComplete')
        if not hasCBFuncs:
            raise ValueError("Specified customFuncPath has no callbacks!")
        if hasattr(cFuncModule, 'onBatchComplete') and not isFinal:
            cFuncModule.onBatchComplete(args=cFuncArgs_string, **kwargs)
        if hasattr(cFuncModule, 'onLapComplete') \
                and isEvenlyDivisibleFloat(lapFrac, 1.0) and not isFinal:
            # Lap callback fires only on whole-lap boundaries.
            cFuncModule.onLapComplete(args=cFuncArgs_string, **kwargs)
        if hasattr(cFuncModule, 'onAlgorithmComplete') and isFinal:
            cFuncModule.onAlgorithmComplete(args=cFuncArgs_string, **kwargs)
    finally:
        ElapsedTimeLogger.stopEvent('callback', cbName)

def _resolve_custom_func_module(self, cFuncPath):
    ''' Resolve a customFuncPath specification into a module object.

    Accepts a filesystem path to a .py file, the bare name of a module
    in bnpy.callbacks, or a module object (returned unchanged).
    '''
    if not isinstance(cFuncPath, str):
        return cFuncPath  # directly passed in as object
    cFuncPath = cFuncPath.replace(".py", "")
    pathParts = cFuncPath.split(os.path.sep)
    if len(pathParts) > 1:
        # Absolute path provided: make its directory importable first.
        cFuncDir = os.path.expandvars(os.path.sep.join(pathParts[:-1]))
        sys.path.append(cFuncDir)
        cFuncModName = pathParts[-1]
        return __import__(cFuncModName, fromlist=[])
    # Treat as relative path to file in bnpy.callbacks
    cFuncModule = __import__('bnpy.callbacks.', fromlist=[cFuncPath])
    return getattr(cFuncModule, cFuncPath)
def update_global_params(self, SS, rho=None, doLogElapsedTime=False, **kwargs):
    ''' Run the M-step: update global parameters in place from suff stats.

    The allocation model is updated first, then the observation model.
    Returns None; both component models are mutated in place.
    '''
    timed = doLogElapsedTime
    if timed:
        ElapsedTimeLogger.startEvent('global', 'alloc')
    self.allocModel.update_global_params(SS, rho, **kwargs)
    if timed:
        ElapsedTimeLogger.stopEvent('global', 'alloc')
        ElapsedTimeLogger.startEvent('global', 'obs')
    self.obsModel.update_global_params(SS, rho, **kwargs)
    if timed:
        ElapsedTimeLogger.stopEvent('global', 'obs')
def calc_evidence(self, Data=None, SS=None, LP=None, scaleFactor=None,
                  todict=False, doLogElapsedTime=False, **kwargs):
    ''' Compute the evidence lower bound (ELBO) objective function.

    When only Data is supplied, local parameters and sufficient
    statistics are computed on the fly. The total is normalized by
    scaleFactor, taken from SS or the observation model when not given.

    Returns
    -------
    Scalar ELBO value, or a dict of per-term values (plus 'Ltotal')
    when todict=True.
    '''
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('global', 'ev')
    if Data is not None and LP is None and SS is None:
        LP = self.calc_local_params(Data, **kwargs)
        SS = self.get_global_suff_stats(Data, LP)
    evA = self.allocModel.calc_evidence(
        Data, SS, LP, todict=todict, **kwargs)
    evObs = self.obsModel.calc_evidence(
        Data, SS, LP, todict=todict, **kwargs)
    if scaleFactor is None:
        if hasattr(SS, 'scaleFactor'):
            scaleFactor = SS.scaleFactor
        else:
            scaleFactor = self.obsModel.getDatasetScale(SS)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('global', 'ev')
    if not todict:
        return (evA + evObs) / scaleFactor
    # Merge per-term dicts and normalize every term.
    evA.update(evObs)
    for key in evA:
        evA[key] /= scaleFactor
    # Identify unique keys, ignoring subdivided terms
    # eg Lalloc_top_term1 and Lalloc_top_term2 are not counted,
    # since we expect they are already aggregated in term Lalloc
    ukeys = {key.split('_')[0] for key in evA}
    evA['Ltotal'] = sum(evA[key] for key in ukeys)
    return evA
def get_global_suff_stats(self, Data, LP, doLogElapsedTime=False, **kwargs):
    ''' Summarize data and local parameters into per-component suff stats.

    This summary step is the necessary prep for the global (M-step)
    update.

    Returns
    -------
    SS : sufficient statistics object
    '''
    if doLogElapsedTime:
        ElapsedTimeLogger.startEvent('local', 'allocsummary')
    # Allocation model creates the suff-stats container...
    SS = self.allocModel.get_global_suff_stats(Data, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'allocsummary')
        ElapsedTimeLogger.startEvent('local', 'obssummary')
    # ...then the observation model augments it with data statistics.
    SS = self.obsModel.get_global_suff_stats(Data, SS, LP, **kwargs)
    if doLogElapsedTime:
        ElapsedTimeLogger.stopEvent('local', 'obssummary')
    return SS
def saveParams(self, lap, hmodel, SS=None, **kwargs):
    ''' Save current model to disk, at most once per lap.

    Appends the lap number and elapsed wall-clock time to snapshot log
    files, then writes the full model and/or topic-model summary
    depending on outputParams flags. No-op when this lap was already
    saved or when no output path is configured.
    '''
    # Skip duplicate saves for the same lap, and skip entirely when
    # there is nowhere to write.
    if lap in self.SavedIters or self.task_output_path is None:
        return
    ElapsedTimeLogger.startEvent("io", "saveparams")
    self.SavedIters.add(lap)
    prefix = ModelWriter.makePrefixForLap(lap)
    # Append this lap's id and elapsed time to the snapshot logs.
    with open(self.mkfile('snapshot_lap.txt'), 'a') as f:
        f.write(six.text_type('%.4f\n' % (lap)))
    with open(self.mkfile('snapshot_elapsed_time_sec.txt'), 'a') as f:
        f.write(six.text_type('%.3f\n' % (self.get_elapsed_time())))
    if self.outputParams['doSaveFullModel']:
        ModelWriter.save_model(
            hmodel, self.task_output_path, prefix,
            # Prior info only needs to be written once, at lap 0.
            doSavePriorInfo=np.allclose(lap, 0.0),
            doLinkBest=True,
            doSaveObsModel=self.outputParams['doSaveObsModel'])
    if self.outputParams['doSaveTopicModel']:
        ModelWriter.saveTopicModel(
            hmodel, SS, self.task_output_path, prefix, **kwargs)
    ElapsedTimeLogger.stopEvent("io", "saveparams")
def save_batch_local_params_to_memory(self, batchID, batchLP):
    ''' Store the DocTopicCount field of the provided local parameter
        dict into "memory" for later retrieval.

    Depending on algParams['doMemoizeLocalParams'], the data is either
    kept in RAM (==1) or sparsified and written to local disk (==2),
    in which case only the file path is remembered in LPmemory.
    '''
    # Keep only DocTopicCount; every other LP field is dropped
    # (equivalent to the old copy-then-delete-keys loop).
    batchLP = {k: v for k, v in batchLP.items() if k == 'DocTopicCount'}
    if len(batchLP) > 0:
        if self.algParams['doMemoizeLocalParams'] == 1:
            # Keep the dense local params in RAM.
            self.LPmemory[batchID] = batchLP
        elif self.algParams['doMemoizeLocalParams'] == 2:
            ElapsedTimeLogger.startEvent('io', 'savelocal')
            nnzPerDoc = self.algParams['nnzPerDocForStorage']
            spDTC = sparsifyResp(batchLP['DocTopicCount'], nnzPerDoc)
            # NOTE(review): the repeat below assumes sparsifyResp yields
            # exactly nnzPerDoc normalized entries per row, so scaling
            # by each row's original total restores counts — confirm.
            wc_D = batchLP['DocTopicCount'].sum(axis=1)
            wc_U = np.repeat(wc_D, nnzPerDoc)
            spDTC.data *= wc_U
            savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
            if os.path.exists('/ltmp/'):
                savepath = '/ltmp/%s/' % (savepath)
            else:
                savepath = '/tmp/%s/' % (savepath)
            # os.makedirs replaces distutils.dir_util.mkpath; distutils
            # was deprecated (PEP 632) and removed in Python 3.12.
            os.makedirs(savepath, exist_ok=True)
            savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
            # Only indptr[1] is stored: with a uniform per-row count the
            # full indptr can be rebuilt at load time.
            np.savez(savepath,
                     data=spDTC.data, indices=spDTC.indices,
                     D=spDTC.shape[0], K=spDTC.shape[1],
                     nnzPerDoc=spDTC.indptr[1])
            self.LPmemory[batchID] = savepath
            # Release the large arrays before stopping the timer.
            del batchLP
            del spDTC
            ElapsedTimeLogger.stopEvent('io', 'savelocal')