def create_model_with_new_comps(bigModel, bigSS, freshData, Q=None,
                                Plan=None, **kwargs):
    ''' Create a fresh model (and matching suff stats) for the target data.

    A copy of bigModel is re-initialized according to
    kwargs['creationRoutine'], optionally refined with several
    local/global update steps on freshData, and optionally cleaned up
    by deleting or merging components.

    Args
    -------
    bigModel : model that is copied to obtain the starting fresh model
    bigSS : not used by this function
    freshData : target dataset used for initialization and refinement
    Q : must not be None when creationRoutine == 'xspectral'
    Plan : read for key 'targetWordFreq' when
             creationRoutine == 'targetWordFreq'
    kwargs : options. Keys read here include
             creationRoutine, Kfresh, creationDoUpdateFresh,
             creationNumIters, birthDebug, cleanupDeleteEmpty,
             cleanupMinSize, cleanupDeleteToImproveFresh,
             cleanupMergeToImproveFresh.

    Returns
    -------
    freshModel : HModel with Kfresh components,
                   scale *may not* be consistent with target dataset
    freshSS : SuffStatBag with Kfresh components,
                   scale will be consistent with target dataset
    Info : dict of diagnostics (debug snapshots, 'evBound',
                   and any fields reported by the merge cleanup)

    Raises
    -------
    BirthProposalError
        if the refined proposal ends up with fewer than 2 components.
    '''
    Info = dict()
    freshModel = bigModel.copy()
    creationRoutine = kwargs['creationRoutine']

    if creationRoutine == 'targetWordFreq':
        freshModel.set_global_params(
            beta=np.ones(1), K=1,
            topics=Plan['targetWordFreq'][np.newaxis, :],
            wordcountTotal=freshData.word_count.sum())
    elif creationRoutine == 'findmissingtopics':
        freshModel = create_new_model_findmissingtopics(
            freshModel, freshData, bigModel, **kwargs)
    elif creationRoutine == 'xspectral':
        assert Q is not None
        freshModel = create_new_model_expandedspectral(
            freshModel, Q, freshData, bigModel, **kwargs)
    elif creationRoutine == 'spectralOnTarget':
        freshModel = create_new_model_spectralOnTarget(
            freshModel, freshData, bigModel, **kwargs)
    else:
        freshModel.init_global_params(freshData,
                                      K=kwargs['Kfresh'],
                                      initname=creationRoutine,
                                      **kwargs)

    logPhase('Creation')
    log('CreationRoutine: ' + creationRoutine, 'debug')
    log('Kfresh=%d' % (freshModel.obsModel.K), 'debug')

    if not kwargs['creationDoUpdateFresh']:
        # Create freshSS that would produce (nearly) same freshModel.obsModel
        #   after a call to update_global_params
        # freshSS has to exist before it can be zeroed: one local/summary
        # pass yields a bag with the fresh model's components and fields.
        # Its values are discarded by the zeroing on the next line.
        # NOTE(review): presumably this is the intended source of the
        # bag's structure -- confirm.
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshSS._Fields.setAllFieldsToZero()
        if hasattr(freshSS, 'WordCounts'):
            topics = freshSS.WordCounts
            priorvec = freshModel.obsModel.obsPrior.lamvec
            for k in range(freshSS.K):
                topics[k, :] = freshModel.obsModel.comp[k].lamvec - priorvec
            freshSS.setField('WordCounts', topics, dims=('K', 'D'))
        return freshModel, freshSS, Info

    # Record initial model for posterity
    if kwargs['birthDebug']:
        Info['freshModelInit'] = freshModel.copy()

    # Complete several iterations to improve this fresh proposal
    for step in range(kwargs['creationNumIters']):
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if step < 3 or (step + 1) % 10 == 0:
            logPosVector(
                freshSS.N, label='iter %3d' % (step + 1), level='debug')
        # Stop early once the per-component counts have stabilized.
        # prevN is always set by the time step > 1.
        if step > 1:
            maxDiff = np.max(np.abs(freshSS.N - prevN))
            if maxDiff < 1.0:
                break
        prevN = freshSS.N.copy()

    logPosVector(freshSS.N, label='after creation', level='moreinfo')
    if kwargs['birthDebug']:
        Info['freshModelRefined'] = freshModel.copy()

    if kwargs['cleanupDeleteEmpty']:
        Kbefore = freshSS.K
        freshModel, freshSS = BirthCleanup.delete_empty_comps(
            freshData, freshModel, freshSS, Korig=0, **kwargs)
        # Recompute local params and summaries so they match the
        # (possibly smaller) model returned by the cleanup.
        freshLP = freshModel.calc_local_params(freshData)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if freshSS.K < Kbefore:
            msg = 'after remove empty (size < %d)' % (kwargs['cleanupMinSize'])
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if kwargs['cleanupDeleteToImproveFresh']:
        freshModel, freshSS, ELBO = BirthCleanup.delete_comps_to_improve_ELBO(
            freshData, freshModel, LP=freshLP)
        Info['evBound'] = ELBO
        if kwargs['birthDebug']:
            Info['freshModelPostDelete'] = freshModel.copy()
    elif kwargs['cleanupMergeToImproveFresh']:
        Korig = freshSS.K
        # Keep merging until a round accepts no pairs or one comp remains.
        while freshSS.K > 1:
            mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
                freshModel, freshSS,
                preselect_routine='wholeELBO',
                doLimitNumPairs=0,
                returnScoreMatrix=1,
                **kwargs)
            freshLP = freshModel.calc_local_params(freshData)
            freshSS = freshModel.get_global_suff_stats(
                freshData, freshLP,
                doPrecompEntropy=1,
                doPrecompMergeEntropy=1,
                mPairIDs=mPairIDs)
            freshModel.update_global_params(freshSS)
            freshELBO = freshModel.calc_evidence(SS=freshSS)
            freshModel, freshSS, freshELBO, mergeInfo = \
                MergeMove.run_many_merge_moves(
                    freshModel, freshSS, freshELBO,
                    mPairIDs, M=MM,
                    isBirthCleanup=1,
                    logFunc=log)
            # Fold the merge report into Info instead of rebinding Info,
            # so debug snapshots recorded above are not thrown away.
            # NOTE(review): assumes the merge report is a dict -- confirm.
            Info.update(mergeInfo)
            if len(mergeInfo['AcceptedPairs']) == 0:
                break
        if freshSS.K < Korig:
            msg = 'after merges'
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if freshSS.K < 2:
        msg = "BIRTH failed. Fresh proposal does not prefer multiple comps."
        raise BirthProposalError(msg)

    return freshModel, freshSS, Info
def refine_expanded_model_with_VB_iters(xbigModel, freshData,
                                        xbigSS=None, Korig=0, **kwargs):
    ''' Run several local/global update steps on the expanded model.

    Each iteration computes local params and summaries for freshData,
    optionally drops small new components (index >= Korig), then updates
    the model's global params from xbigSS plus the fresh summaries.

    Args
    --------
    xbigModel : HModel to refine, updated by this function
    freshData : dataset used for every local step
    xbigSS : SuffStatBag, with K + Kfresh comps,
                 scale equal to bigData only
    Korig : index of the first component eligible for removal
    kwargs : options. Keys read here include refineNumIters, birthDebug,
                 cleanupDeleteEmpty, cleanupMinSize. All are also
                 forwarded to calc_local_params.

    Returns
    --------
    model : HModel, with K + Kfresh comps
                 scale equal to bigData + freshData
    freshSS : SuffStatBag, with K + Kfresh comps
                 scale equal to freshData
    freshLP : dict of local parameters for freshData
    xInfo : dict with key 'origIDs' (starting index of each surviving
                 comp), plus 'traceBeta', 'traceN', 'traceELBO'
                 when kwargs['birthDebug'] is on

    Updates (in-place)
    ----------
    xbigSS : SuffStatBag, with K + Kfresh comps
                 scale with equal to bigData only

    Raises
    --------
    BirthProposalError
        if an iteration leaves no components beyond the first Korig.
    '''
    logPhase('Refinement')

    Kstart = xbigSS.K
    # origIDs[j] : starting index of the comp currently at position j
    origIDs = list(range(Kstart))

    numIters = kwargs['refineNumIters']
    traceShape = (numIters, Kstart)
    traceBeta = np.zeros(traceShape)
    traceN = np.zeros(traceShape)
    traceELBO = np.zeros(numIters)

    xfreshLP = None
    for it in range(numIters):
        iterNum = it + 1
        xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
        xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)

        # Columns of removed comps are never written again, so stay zero.
        traceN[it, origIDs] = xfreshSS.N
        if kwargs['birthDebug']:
            activeProbs = xbigModel.allocModel.get_active_comp_probs()
            traceBeta[it, origIDs] = activeProbs
            traceELBO[it] = xbigModel.calc_evidence(
                freshData, xfreshSS, xfreshLP)

        if it < 3 or iterNum % 5 == 0:
            logPosVector(traceN[it, Korig:], label='iter %3d' % iterNum)

        # For all but last iteration, attempt removing empty topics
        if kwargs['cleanupDeleteEmpty'] and iterNum < numIters:
            # Walk from the highest index down so earlier removals do not
            # shift the positions still to be visited.
            for k in range(xfreshSS.K - 1, Korig - 1, -1):
                if xfreshSS.N[k] >= kwargs['cleanupMinSize']:
                    continue
                xfreshSS.removeComp(k)
                xbigSS.removeComp(xbigSS.K - 1)  # last in order!
                del origIDs[k]

        if xfreshSS.K == Korig:
            msg = "BIRTH failed. After refining, no comps > cleanupMinSize."
            raise BirthProposalError(msg)

        # Temporarily add the fresh summaries for the global update,
        # then subtract them again.
        xbigSS += xfreshSS
        xbigModel.allocModel.update_global_params(xbigSS)
        xbigModel.obsModel.update_global_params(xbigSS)
        xbigSS -= xfreshSS

    # Final local step and summaries under the refined global params.
    xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
    xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)

    log('Final Assignment Counts')
    logPosVector(xfreshSS.N[Korig:], label='final')

    xInfo = dict()
    if kwargs['birthDebug']:
        xInfo['traceBeta'] = traceBeta
        xInfo['traceN'] = traceN
        xInfo['traceELBO'] = traceELBO
    xInfo['origIDs'] = origIDs

    return xbigModel, xfreshSS, xfreshLP, xInfo
def delete_comps_from_expanded_model_to_improve_ELBO(Data,
                                                     xbigModel, xbigSS,
                                                     xfreshSS, xfreshLP=None,
                                                     Korig=0, **kwargs):
    ''' Attempts deleting components at indices K-1, K-2, ... Korig,
    keeping (and building on) any proposals that do not lower the ELBO.

    Args
    ---------
    Data : dataset forwarded to the candidate-construction helpers
    xbigModel : HModel with K comps
    xbigSS : SuffStatBag with K comps
    xfreshSS : SuffStatBag with K comps, used to evaluate the ELBO
    xfreshLP : local params, forwarded to the candidate helper
                 when kwargs['cleanupDeleteViaLP'] is on
    Korig : lowest component index eligible for deletion
    kwargs : options. Keys read here are cleanupDeleteViaLP and
                 cleanupRaiseErrorWhenAllDeleted.

    Returns
    ---------
    model : HModel with Knew comps
    bigSS : SuffStatBag with Knew comps
    freshSS : SuffStatBag with Knew comps
    ELBO : evidence lower bound for the returned model,
             as computed from freshSS
    origIDs : list, starting index of each surviving component

    Raises
    ---------
    BirthProposalError
        if only Korig comps remain and
        kwargs['cleanupRaiseErrorWhenAllDeleted'] is on.
    '''
    logPhase('Cleanup')
    K = xbigSS.K
    assert xbigSS.K == xfreshSS.K
    assert xbigModel.obsModel.K == K

    # Must be a real list: entries are deleted as comps are removed,
    # and a range object does not support item deletion.
    origIDs = list(range(K))

    # Evaluate before the early exit so both return paths have the
    # same five-element shape.
    xfreshELBO = xbigModel.calc_evidence(SS=xfreshSS)
    if K == 1:
        return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs

    # Highest index first, so accepted deletions never shift the
    # positions of the candidates still to be tried.
    for k in reversed(range(Korig, K)):
        if kwargs['cleanupDeleteViaLP']:
            rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP = \
                _make_xcandidate_LP(
                    xbigModel, Data,
                    xbigSS, xfreshSS, xfreshLP,
                    k, **kwargs)
        else:
            rbigModel, rbigSS, rfreshSS, rfreshELBO = _make_xcandidate(
                xbigModel, Data,
                xbigSS, xfreshSS,
                k)

        # If ELBO has not gotten worse, set current model to delete comp k
        if rfreshELBO >= xfreshELBO:
            log('Deletion accepted. prop %.5e > cur %.5e'
                % (rfreshELBO, xfreshELBO))
            # Logs the counts from before this deletion is applied.
            logPosVector(xfreshSS.N[Korig:])
            xbigSS = rbigSS
            xfreshSS = rfreshSS
            xbigModel = rbigModel
            xfreshELBO = rfreshELBO
            if kwargs['cleanupDeleteViaLP']:
                xfreshLP = rfreshLP
            del origIDs[k]

        if xfreshSS.K == 1:
            break
        # end loop over comps to delete

    if xbigSS.K == Korig and kwargs['cleanupRaiseErrorWhenAllDeleted']:
        log('FAILED. Deleting all new comps improves ELBO.')
        msg = "FAILED. After expansion, deleting all new comps improves ELBO."
        raise BirthProposalError(msg)
    return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs