Example #1
    def load_batch_local_params_from_memory(self, batchID, doCopy=0):
        ''' Load local parameter dict stored in memory for provided batchID

        TODO: Fast-forward so that recent truncation changes are accounted for.

        Returns
        -------
        batchLP : dict of local parameters specific to batchID
        '''
        batchLP = self.LPmemory[batchID]
        if isinstance(batchLP, str):
            ElapsedTimeLogger.startEvent('io', 'loadlocal')
            batchLPpath = os.path.abspath(batchLP)
            assert os.path.exists(batchLPpath)
            F = np.load(batchLPpath, allow_pickle=True)
            indptr = np.arange(0, (F['D'] + 1) * F['nnzPerDoc'],
                               F['nnzPerDoc'])
            batchLP = dict()
            batchLP['DocTopicCount'] = scipy.sparse.csr_matrix(
                (F['data'], F['indices'], indptr),
                shape=(F['D'], F['K'])).toarray()
            ElapsedTimeLogger.stopEvent('io', 'loadlocal')
        if doCopy:
            # Duplicating to avoid changing the raw data stored in LPmemory
            # Usually for debugging only
            batchLP = copy.deepcopy(batchLP)
        return batchLP
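
The trick above is that every saved batch stores the same number of nonzeros per document, so the CSR indptr array never needs to be written to disk; it can be rebuilt with a single arange. A minimal standalone sketch of the same reconstruction (the names below are illustrative, not part of bnpy's API):

    import numpy as np
    import scipy.sparse

    D, K, nnz_per_doc = 3, 5, 2   # docs, topics, stored nonzeros per doc
    data = np.array([0.7, 0.3, 0.9, 0.1, 0.5, 0.5])
    indices = np.array([0, 2, 1, 4, 3, 0])
    # Constant row length means indptr is an arithmetic progression
    indptr = np.arange(0, (D + 1) * nnz_per_doc, nnz_per_doc)  # [0, 2, 4, 6]
    DocTopicCount = scipy.sparse.csr_matrix(
        (data, indices, indptr), shape=(D, K)).toarray()
    assert DocTopicCount.shape == (D, K)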
Example #2
    def eval_custom_func(self, isFinal=0, isInitial=0, lapFrac=0, **kwargs):
        ''' Evaluates a custom hook function
        '''

        cFuncPath = self.outputParams['customFuncPath']
        if cFuncPath is None or cFuncPath == 'None':
            return None

        cbName = str(cFuncPath)
        ElapsedTimeLogger.startEvent('callback', cbName)

        cFuncArgs_string = self.outputParams['customFuncArgs']
        nLapTotal = self.algParams['nLap']
        if isinstance(cFuncPath, str):
            cFuncPath = cFuncPath.replace(".py", "")
            pathParts = cFuncPath.split(os.path.sep)
            if len(pathParts) > 1:
                # Path with directories provided: import from that location
                cFuncDir = os.path.expandvars(os.path.sep.join(pathParts[:-1]))
                sys.path.append(cFuncDir)
                cFuncModName = pathParts[-1]
                cFuncModule = __import__(cFuncModName, fromlist=[])
            else:
                # Treat as the name of a module inside bnpy.callbacks
                cFuncModule = __import__(
                    'bnpy.callbacks', fromlist=[cFuncPath])
                cFuncModule = getattr(cFuncModule, cFuncPath)
        else:
            cFuncModule = cFuncPath  # directly passed in as object

        kwargs['nLap'] = nLapTotal
        kwargs['lapFrac'] = lapFrac
        kwargs['isFinal'] = isFinal
        kwargs['isInitial'] = isInitial
        if isInitial:
            kwargs['lapFrac'] = 0
            kwargs['iterid'] = 0

        hasCBFuncs = hasattr(cFuncModule, 'onBatchComplete') or \
            hasattr(cFuncModule, 'onLapComplete') or \
            hasattr(cFuncModule, 'onAlgorithmComplete')
        if not hasCBFuncs:
            raise ValueError("Specified customFuncPath has no callbacks!")
        if hasattr(cFuncModule, 'onBatchComplete') and not isFinal:
            cFuncModule.onBatchComplete(args=cFuncArgs_string, **kwargs)
        if hasattr(cFuncModule, 'onLapComplete') \
           and isEvenlyDivisibleFloat(lapFrac, 1.0) and not isFinal:
            cFuncModule.onLapComplete(args=cFuncArgs_string, **kwargs)
        if hasattr(cFuncModule, 'onAlgorithmComplete') \
           and isFinal:
            cFuncModule.onAlgorithmComplete(args=cFuncArgs_string, **kwargs)
        ElapsedTimeLogger.stopEvent('callback', cbName)
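
The import logic above supports both a filesystem path and a bare module name inside bnpy.callbacks. A hedged sketch of the path-based branch using importlib, the modern equivalent of appending to sys.path and calling __import__ (the function name and hook check below are illustrative):

    import importlib.util

    def load_callback_module(path_to_py_file):
        # Load a module directly from a .py file path
        spec = importlib.util.spec_from_file_location(
            'custom_callbacks', path_to_py_file)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        hooks = ('onBatchComplete', 'onLapComplete', 'onAlgorithmComplete')
        if not any(hasattr(module, h) for h in hooks):
            raise ValueError("Specified customFuncPath has no callbacks!")
        return module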
Example #3
    def calc_local_params(self,
                          Data,
                          LP=None,
                          doLogElapsedTime=False,
                          **kwargs):
        ''' Calculate local parameters specific to each data item.

            This is the E-step of the EM algorithm.
        '''
        if LP is None:
            LP = dict()
        if doLogElapsedTime:
            ElapsedTimeLogger.startEvent('local', 'obsupdate')
        # Calculate the "soft evidence" each component has for each item
        # Fills in LP['E_log_soft_ev'], N x K array
        LP = self.obsModel.calc_local_params(Data, LP, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('local', 'obsupdate')
            ElapsedTimeLogger.startEvent('local', 'allocupdate')
        # Combine with allocModel probs of each cluster
        # Fills in LP['resp'], N x K array whose rows sum to one
        LP = self.allocModel.calc_local_params(Data, LP, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('local', 'allocupdate')
        return LP
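
To make the two stages concrete: the observation model fills LP['E_log_soft_ev'] with per-item log-likelihoods, and the allocation model converts those into normalized responsibilities. A toy numpy version of that conversion, a plain row-wise softmax (a simplification of what a real allocModel does):

    import numpy as np

    log_soft_ev = np.random.randn(4, 3)   # N x K stand-in log-likelihoods
    shifted = log_soft_ev - log_soft_ev.max(axis=1, keepdims=True)
    resp = np.exp(shifted)
    resp /= resp.sum(axis=1, keepdims=True)
    assert np.allclose(resp.sum(axis=1), 1.0)   # rows sum to one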
Example #4
    def calc_evidence(self,
                      Data=None,
                      SS=None,
                      LP=None,
                      scaleFactor=None,
                      todict=False,
                      doLogElapsedTime=False,
                      **kwargs):
        ''' Compute evidence lower bound (ELBO) objective function.
        '''
        if doLogElapsedTime:
            ElapsedTimeLogger.startEvent('global', 'ev')

        if Data is not None and LP is None and SS is None:
            LP = self.calc_local_params(Data, **kwargs)
            SS = self.get_global_suff_stats(Data, LP)
        evA = self.allocModel.calc_evidence(Data,
                                            SS,
                                            LP,
                                            todict=todict,
                                            **kwargs)
        evObs = self.obsModel.calc_evidence(Data,
                                            SS,
                                            LP,
                                            todict=todict,
                                            **kwargs)
        if scaleFactor is None:
            if hasattr(SS, 'scaleFactor'):
                scaleFactor = SS.scaleFactor
            else:
                scaleFactor = self.obsModel.getDatasetScale(SS)

        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('global', 'ev')

        if todict:
            evA.update(evObs)
            for key in evA:
                evA[key] /= scaleFactor
            # Identify unique keys, ignoring subdivided terms
            # e.g. Lalloc_top_term1 and Lalloc_top_term2 are not counted,
            # since we expect they are already aggregated in term Lalloc
            ukeys = list(set([key.split('_')[0] for key in evA.keys()]))
            evA['Ltotal'] = sum([evA[key] for key in ukeys])
            return evA
        else:
            return (evA + evObs) / scaleFactor
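
The key-grouping step in the todict branch deserves a small illustration. Terms like 'Lalloc_top_term1' are assumed to already be folded into 'Lalloc', so only the prefix before the first underscore contributes to the total:

    evA = {'Lalloc': -10.0, 'Lalloc_top_term1': -4.0, 'Ldata': -25.0}
    ukeys = list(set(key.split('_')[0] for key in evA))   # ['Lalloc', 'Ldata']
    total = sum(evA[key] for key in ukeys)
    assert total == -35.0   # the '_term1' entry is not double-counted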
Example #5
    def saveParams(self, lap, hmodel, SS=None, **kwargs):
        ''' Save current model to disk
        '''
        if lap in self.SavedIters or self.task_output_path is None:
            return
        ElapsedTimeLogger.startEvent("io", "saveparams")
        self.SavedIters.add(lap)
        prefix = ModelWriter.makePrefixForLap(lap)
        with open(self.mkfile('snapshot_lap.txt'), 'a') as f:
            f.write(six.text_type('%.4f\n' % (lap)))
        with open(self.mkfile('snapshot_elapsed_time_sec.txt'), 'a') as f:
            f.write(six.text_type('%.3f\n' % (self.get_elapsed_time())))
        if self.outputParams['doSaveFullModel']:
            ModelWriter.save_model(
                hmodel, self.task_output_path, prefix,
                doSavePriorInfo=np.allclose(lap, 0.0),
                doLinkBest=True,
                doSaveObsModel=self.outputParams['doSaveObsModel'])
        if self.outputParams['doSaveTopicModel']:
            ModelWriter.saveTopicModel(
                hmodel, SS, self.task_output_path, prefix, **kwargs)
        ElapsedTimeLogger.stopEvent("io", "saveparams")
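
The SavedIters set makes repeated calls for the same lap cheap no-ops, which matters because a save can be triggered from several checkpoints per lap. The guard pattern in isolation (illustrative names, not bnpy's API):

    saved_laps = set()

    def save_once(lap):
        if lap in saved_laps:
            return False          # already written, skip the disk I/O
        saved_laps.add(lap)
        # ... write snapshot files here ...
        return True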
Example #6
    def save_batch_local_params_to_memory(self, batchID, batchLP):
        ''' Store certain fields of the provided local parameters dict
            into "memory" for later retrieval.
            Fields to save are determined by the memoLPkeys attribute of this alg.
        '''
        batchLP = dict(**batchLP)  # make a copy
        allkeys = list(batchLP.keys())
        for key in allkeys:
            if key != 'DocTopicCount':
                del batchLP[key]
        if len(batchLP) > 0:
            if self.algParams['doMemoizeLocalParams'] == 1:
                self.LPmemory[batchID] = batchLP
            elif self.algParams['doMemoizeLocalParams'] == 2:
                ElapsedTimeLogger.startEvent('io', 'savelocal')
                spDTC = sparsifyResp(batchLP['DocTopicCount'],
                                     self.algParams['nnzPerDocForStorage'])
                wc_D = batchLP['DocTopicCount'].sum(axis=1)
                wc_U = np.repeat(wc_D, self.algParams['nnzPerDocForStorage'])
                spDTC.data *= wc_U
                savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
                if os.path.exists('/ltmp/'):
                    savepath = '/ltmp/%s/' % (savepath)
                else:
                    savepath = '/tmp/%s/' % (savepath)
                # os.makedirs replaces the deprecated distutils mkpath
                os.makedirs(savepath, exist_ok=True)
                savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
                # Now actually save it!
                np.savez(savepath,
                         data=spDTC.data,
                         indices=spDTC.indices,
                         D=spDTC.shape[0],
                         K=spDTC.shape[1],
                         nnzPerDoc=spDTC.indptr[1])
                self.LPmemory[batchID] = savepath
                del batchLP
                del spDTC
                ElapsedTimeLogger.stopEvent('io', 'savelocal')
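
Mode 2 trades memory for disk by keeping only the nnzPerDocForStorage largest entries of each row. A standalone sketch of per-row top-L sparsification; bnpy's sparsifyResp presumably also renormalizes each row (which is why the code above multiplies the stored values back up by per-document word counts), so this reimplementation is an assumption for illustration only:

    import numpy as np
    import scipy.sparse

    def sparsify_rows(X, nnz_per_row):
        # Keep the nnz_per_row largest entries in each row of a dense array
        D, K = X.shape
        top_cols = np.argsort(X, axis=1)[:, -nnz_per_row:]   # D x L
        rows = np.repeat(np.arange(D), nnz_per_row)
        cols = top_cols.ravel()
        return scipy.sparse.csr_matrix(
            (X[rows, cols], (rows, cols)), shape=(D, K))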
Example #7
    def update_global_params(self, SS, rho=None,
                             doLogElapsedTime=False,
                             **kwargs):
        ''' Update (in-place) global parameters given provided suff stats.
            This is the M-step of EM.
        '''
        if doLogElapsedTime:
            ElapsedTimeLogger.startEvent('global', 'alloc')
        self.allocModel.update_global_params(SS, rho, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('global', 'alloc')
            ElapsedTimeLogger.startEvent('global', 'obs')
        self.obsModel.update_global_params(SS, rho, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('global', 'obs')
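
In stochastic variational inference, rho is the step size that blends the old global parameters with the estimate from the current (amplified) batch; rho=1.0 recovers a full-batch M-step. The update has the generic form below (a sketch of the convention, not bnpy's internal code):

    def blend_global_param(old_param, batch_estimate, rho):
        # rho in (0, 1]; smaller rho means more trust in the old value
        return (1.0 - rho) * old_param + rho * batch_estimate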
Example #8
    def get_global_suff_stats(self, Data, LP,
            doLogElapsedTime=False,
            **kwargs):
        ''' Calculate sufficient statistics for each component.

        These stats summarize the data and local parameters
        assigned to each component.

        This is necessary prep for the Global Step update.
        '''
        if doLogElapsedTime:
            ElapsedTimeLogger.startEvent('local', 'allocsummary')
        SS = self.allocModel.get_global_suff_stats(Data, LP, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('local', 'allocsummary')
            ElapsedTimeLogger.startEvent('local', 'obssummary')
        SS = self.obsModel.get_global_suff_stats(Data, SS, LP, **kwargs)
        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('local', 'obssummary')
        return SS
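
For a concrete picture of what such a summary contains, here is a toy numpy version for a mixture model: per-component counts and responsibility-weighted data sums, from which the M-step follows in closed form. bnpy wraps these in a SuffStatBag; the arrays below are stand-ins:

    import numpy as np

    X = np.random.randn(6, 2)            # N x D data
    resp = np.full((6, 3), 1.0 / 3.0)    # N x K responsibilities
    N_k = resp.sum(axis=0)               # expected count per component
    x_k = resp.T @ X                     # K x D weighted data sums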
Example #9
def _run_task_internal(jobname, taskid, nTask, ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)
        Executes learning for a particular job and particular taskid.

        Returns
        -------
        hmodel : bnpy HModel, fit to the data
        LP : Local parameter (LP) dict for the specific dataset
        RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of input dicts so that any modifications here
    # do not propagate back to the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(dataName, ReqArgs,
                                                      KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs, dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)
        if algName in OnlineDataAlgSet:
            OnlineDataArgs = KwArgs['OnlineDataPrefs']
            OnlineDataArgs['dataorderseed'] = dataorderseed

            DataArgs = getKwArgsForLoadData(Data, UnkArgs)
            OnlineDataArgs.update(DataArgs)  # add custom args
            Data = Data.to_iterator(**OnlineDataArgs)
    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name
    if doSaveToDisk:
        task_output_path = make_task_output_path(ReqArgs,
                                                 KwArgs,
                                                 taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(task_output_path, taskid,
                                          doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)

        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(Data,
                              hmodel,
                              ReqArgs,
                              KwArgs,
                              algseed=algseed,
                              task_output_path=task_output_path)
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if str(type(hmodel.allocModel)).count('TopicModel'):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(task_output_path, doSaveToDisk,
                                  doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(task_output_path, KwArgs['MoveNames'],
                                doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d' %
        (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
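
One detail worth highlighting is the per-task seeding: each task derives distinct but reproducible seeds from the job name and task id. One plausible way to do this (an assumption for illustration; not necessarily how createUniqueRandomSeed is implemented):

    import hashlib

    def make_task_seed(jobname, taskid):
        # Hash the job name and task id into a stable 32-bit seed
        digest = hashlib.md5(('%s-%d' % (jobname, taskid)).encode()).hexdigest()
        return int(digest[:8], 16)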
Example #10
    def fit(self, hmodel, DataIterator, SS=None):
        ''' Run stochastic variational inference to fit hmodel parameters to Data.

        Returns
        --------
        Info : dict of run information.

        Post Condition
        --------
        hmodel updated in place with improved global parameters.
        '''
        self.set_start_time_now()
        LP = None
        rho = 1.0  # Learning rate
        nBatch = float(DataIterator.nBatch)

        # Set-up progress-tracking variables
        iterid = -1
        lapFrac = np.maximum(0, self.algParams['startLap'] - 1.0 / nBatch)
        if lapFrac > 0:
            # When restarting an existing run, we need to start with the
            # last update for the final batch from the previous lap
            DataIterator.lapID = int(np.ceil(lapFrac)) - 1
            DataIterator.curLapPos = nBatch - 2
            iterid = int(nBatch * lapFrac) - 1

        # Save initial state
        self.saveParams(lapFrac, hmodel)

        # Custom func hook
        self.eval_custom_func(isInitial=1,
                              **makeDictOfAllWorkspaceVars(**vars()))
        ElapsedTimeLogger.writeToLogOnLapCompleted(lapFrac)

        if self.algParams['doMemoELBO']:
            SStotal = None
            SSPerBatch = dict()
        else:
            loss_running_sum = 0
            loss_per_batch = np.zeros(nBatch)
        while DataIterator.has_next_batch():

            # Grab new data
            Dchunk = DataIterator.get_next_batch()
            batchID = DataIterator.batchID
            Dchunk.batchID = batchID

            # Update progress-tracking variables
            iterid += 1
            lapFrac += 1.0 / nBatch
            self.lapFrac = lapFrac
            nLapsCompleted = lapFrac - self.algParams['startLap']
            self.set_random_seed_at_lap(lapFrac)

            # E step
            self.algParamsLP['batchID'] = batchID
            self.algParamsLP['lapFrac'] = lapFrac  # logging
            if batchID in self.LPmemory:
                batchLP = self.load_batch_local_params_from_memory(batchID)
            else:
                batchLP = None
            LP = hmodel.calc_local_params(Dchunk,
                                          batchLP,
                                          doLogElapsedTime=True,
                                          **self.algParamsLP)
            rho = (1 + iterid + self.rhodelay)**(-1.0 * self.rhoexp)
            if self.algParams['doMemoELBO']:
                # SS step. Scale at size of current batch.
                SS = hmodel.get_global_suff_stats(Dchunk,
                                                  LP,
                                                  doLogElapsedTime=True,
                                                  doPrecompEntropy=True)
                if self.algParams['doMemoizeLocalParams']:
                    self.save_batch_local_params_to_memory(batchID, LP)
                # Incremental updates for whole-dataset stats
                # Must happen before amplification.
                if batchID in SSPerBatch:
                    SStotal -= SSPerBatch[batchID]
                if SStotal is None:
                    SStotal = SS.copy()
                else:
                    SStotal += SS
                SSPerBatch[batchID] = SS.copy()

                # Scale up to size of whole dataset.
                if hasattr(Dchunk, 'nDoc'):
                    ampF = Dchunk.nDocTotal / float(Dchunk.nDoc)
                    SS.applyAmpFactor(ampF)
                else:
                    ampF = Dchunk.nObsTotal / float(Dchunk.nObs)
                    SS.applyAmpFactor(ampF)
                # M step with learning rate
                hmodel.update_global_params(SS, rho, doLogElapsedTime=True)
                # ELBO step
                assert not SStotal.hasAmpFactor()
                loss = -1 * hmodel.calc_evidence(
                    SS=SStotal,
                    doLogElapsedTime=True,
                    afterGlobalStep=not self.algParams['useSlackTermsInELBO'])
            else:
                # SS step. Scale at size of current batch.
                SS = hmodel.get_global_suff_stats(Dchunk,
                                                  LP,
                                                  doLogElapsedTime=True)

                # Scale up to size of whole dataset.
                if hasattr(Dchunk, 'nDoc'):
                    ampF = Dchunk.nDocTotal / float(Dchunk.nDoc)
                    SS.applyAmpFactor(ampF)
                else:
                    ampF = Dchunk.nObsTotal / float(Dchunk.nObs)
                    SS.applyAmpFactor(ampF)

                # M step with learning rate
                hmodel.update_global_params(SS, rho, doLogElapsedTime=True)

                # ELBO step
                assert SS.hasAmpFactor()
                cur_batch_loss = -1 * hmodel.calc_evidence(
                    Dchunk, SS, LP, doLogElapsedTime=True)
                if loss_per_batch[batchID] != 0:
                    loss_running_sum -= loss_per_batch[batchID]
                loss_running_sum += cur_batch_loss
                loss_per_batch[batchID] = cur_batch_loss
                loss = loss_running_sum / nBatch

            # Display progress
            self.updateNumDataProcessed(Dchunk.get_size())
            if self.isLogCheckpoint(lapFrac, iterid):
                self.printStateToLog(hmodel, loss, lapFrac, iterid, rho=rho)

            # Save diagnostics and params
            if self.isSaveDiagnosticsCheckpoint(lapFrac, iterid):
                self.saveDiagnostics(lapFrac, SS, loss)
            if self.isSaveParamsCheckpoint(lapFrac, iterid):
                self.saveParams(lapFrac, hmodel, tryToSparsifyOutput=1)
                # don't save SS here, since it's for one batch only
            self.eval_custom_func(**makeDictOfAllWorkspaceVars(**vars()))

            if self.isLastBatch(lapFrac):
                ElapsedTimeLogger.writeToLogOnLapCompleted(lapFrac)
            # .... end loop over data

        # Finished! Save, print and exit
        self.printStateToLog(hmodel, loss, lapFrac, iterid, isFinal=1)
        self.saveParams(lapFrac, hmodel, SS)
        self.eval_custom_func(isFinal=1,
                              **makeDictOfAllWorkspaceVars(**vars()))

        return self.buildRunInfo(Data=DataIterator, loss=loss, SS=SS)
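
The learning rate on each step is set by the Robbins-Monro style schedule rho = (1 + iterid + rhodelay)^(-rhoexp), matching the line in the loop above. Exponents in (0.5, 1] satisfy the standard step-size conditions for stochastic variational inference; a standalone view of the decay (the hyperparameter values below are typical choices, not bnpy-mandated defaults):

    rhodelay, rhoexp = 1.0, 0.55
    for iterid in range(5):
        rho = (1.0 + iterid + rhodelay) ** (-rhoexp)
        # iterid=0 -> rho ~ 0.68; the step size shrinks on every batch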