def _make_xcandidate_LP(xbigModel, Data, xbigSS, xfreshSS, xfreshLP, k,
                        **kwargs):
    ''' Build a candidate model in which comp k is deleted via local params.

    Args
    -------
    xbigModel : current expanded model. Copied, not modified here.
    Data : dataset handed to the local and global steps.
    xbigSS : suff stats of the expanded model. Copied before a comp is removed.
    xfreshSS : not referenced in this function (kept for the call signature).
    xfreshLP : local params passed to _delete_comps_from_LP.
    k : index of the component to delete from the local params.
    kwargs : if 'cleanupDeleteNumIters' is present and truthy, that many
        extra local/global passes are run on the candidate.

    Returns
    -------
    rbigModel : copy of xbigModel updated with the reduced suff stats
    rbigSS : copy of xbigSS with its last comp removed
    rfreshSS : suff stats computed from the reduced local params
    rfreshELBO : value of rbigModel.calc_evidence(SS=rfreshSS)
    rfreshLP : local params after deleting comp k (and any extra passes)
    '''
    rfreshLP = _delete_comps_from_LP(Data, xbigModel, xfreshLP, k)
    rfreshSS = xbigModel.get_global_suff_stats(Data, rfreshLP,
                                               doPrecompEntropy=True)
    rbigModel = xbigModel.copy()
    rbigSS = xbigSS.copy()
    # just chop off the last one in stickbrk order
    rbigSS.removeComp(rbigSS.K - 1)

    qbigSS = rbigSS + rfreshSS
    rbigModel.update_global_params(qbigSS)
    # We might consider another pass to make sure the alloc params converge
    rbigModel.allocModel.update_global_params(qbigSS)

    nIters = kwargs.get('cleanupDeleteNumIters')
    if nIters:
        # range (not the Python-2-only xrange) so this runs under Python 3.
        for trial in range(nIters):
            rfreshLP = rbigModel.calc_local_params(Data, rfreshLP,
                                                   methodLP='memo',
                                                   nCoordAscentItersLP=10)
            rfreshSS = rbigModel.get_global_suff_stats(Data, rfreshLP,
                                                       doPrecompEntropy=1)
            qbigSS = rbigSS + rfreshSS
            rbigModel.update_global_params(qbigSS)
            rfreshELBO = rbigModel.calc_evidence(SS=rfreshSS)
            log('%d %.6e' % (trial, rfreshELBO))
    else:
        rfreshELBO = rbigModel.calc_evidence(SS=rfreshSS)
    return rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP
def expand_then_refine(freshModel, freshSS, freshData,
                       bigModel, bigSS, **kwargs):
    ''' Create expanded model with K + K' comps,
        then refine components K+1, K+2, ... K+K' via several VB iterations

    Guarantees that original comps of bigModel.obsModel are not altered.

    Required kwargs (read by this function): expandAdjustSuffStats,
    birthDebug, refineNumIters, cleanupDeleteToImprove.

    Returns
    -------
    xbigModel : HModel with K + Kfresh comps
               * allocModel has scale bigSS + freshSS
               * obsModel has scale bigSS + freshSS
    xbigSS : SuffStatBag with K + Kfresh comps
                * has scale bigSS + freshSS
    xfreshSS : SuffStatBag with K + Kfresh comps
                * has scale freshSS
    Info : dict with keys 'AInfo' (adjustment factors) and 'RInfo'
           (replacement factors), plus debug entries when birthDebug is set
    '''
    logPhase('Expansion')
    Korig = bigSS.K
    Info = dict()
    xbigModel = bigModel.copy()
    xbigSS = bigSS.copy(includeELBOTerms=False, includeMergeTerms=False)
    if kwargs['expandAdjustSuffStats'] \
            and hasattr(freshModel.allocModel, 'insertCompsIntoSuffStatBag'):
        xbigSS, AInfo, RInfo = \
            xbigModel.allocModel.insertCompsIntoSuffStatBag(xbigSS, freshSS)
        log('Specialized, model-specific expansion')
        log('rho[K+1] ... rho[K+Knew]')
        logProbVector(xbigModel.allocModel.rho[Korig:])
    else:
        xbigSS.insertComps(freshSS)
        AInfo = None
        RInfo = None

    # Create expanded model, K + Kfresh comps
    Kx = xbigSS.K
    if xbigModel.allocModel.K < Kx:
        xbigModel.allocModel.update_global_params(xbigSS)
    if xbigModel.obsModel.K < Kx:
        xbigModel.obsModel.update_global_params(xbigSS)
    # Remove the fresh contribution again from the newly inserted comps
    xbigSS.subtractSpecificComps(
        freshSS, list(range(bigSS.K, bigSS.K + freshSS.K)))
    if kwargs['birthDebug']:
        Info['xbigModelInit'] = xbigModel.copy()

    # Refine expanded model with VB iterations
    if kwargs['refineNumIters'] > 0:
        xbigModel, xfreshSS, xfreshLP, xInfo = \
            refine_expanded_model_with_VB_iters(
                xbigModel, freshData,
                xbigSS=xbigSS, Korig=bigSS.K, **kwargs)
    else:
        # No refinement: build xfreshSS by copying each sized field of
        # freshSS into the trailing (new) comps of a zeroed copy of xbigSS.
        xfreshSS = xbigSS.copy()
        xfreshSS.setAllFieldsToZero()
        for key in list(xfreshSS._FieldDims.keys()):
            if xfreshSS._FieldDims[key] is None:
                continue
            arr = getattr(xfreshSS, key)
            arr[bigSS.K:] = getattr(freshSS, key)
        xfreshLP = None
        xInfo = dict(origIDs=list())

    if kwargs['birthDebug']:
        Info['xbigModelRefined'] = xbigModel.copy()
        # Trace arrays exist only when the refinement routine ran; copying
        # them unconditionally raised KeyError when refineNumIters <= 0.
        for traceKey in ('traceN', 'traceBeta', 'traceELBO'):
            if traceKey in xInfo:
                Info[traceKey] = xInfo[traceKey]

    AInfo = _delete_from_AInfo(AInfo, xInfo['origIDs'], Kx)

    if hasattr(xfreshSS, 'nDoc'):
        assert xbigSS.nDoc == bigSS.nDoc
        assert xfreshSS.nDoc == freshData.nDoc

    if kwargs['cleanupDeleteToImprove']:
        Kx = xbigSS.K
        xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs = \
            BirthCleanup.delete_comps_from_expanded_model_to_improve_ELBO(
                freshData, xbigModel,
                xbigSS, xfreshSS,
                Korig=bigSS.K, xfreshLP=xfreshLP, **kwargs)
        AInfo = _delete_from_AInfo(AInfo, origIDs, Kx)
        if kwargs['birthDebug']:
            Info['xbigModelPostDelete'] = xbigModel.copy()
            Info['ELBOPostDelete'] = xfreshELBO

    if hasattr(xfreshSS, 'nDoc'):
        assert xbigSS.nDoc == bigSS.nDoc
        assert xfreshSS.nDoc == freshData.nDoc

    # Returned xbigSS carries the combined scale bigSS + freshSS
    xbigSS += xfreshSS
    Info['AInfo'] = AInfo
    Info['RInfo'] = RInfo
    return xbigModel, xbigSS, xfreshSS, Info
def refine_expanded_model_with_VB_iters(xbigModel, freshData,
                                        xbigSS=None, Korig=0, **kwargs):
    ''' Run repeated local/global update steps on the expanded model.

    Args
    --------
    xbigModel : HModel whose alloc/obs params are updated each iteration
    freshData : dataset used for every local step
    xbigSS : SuffStatBag, with K + Kfresh comps,
             scale equal to bigData only
    Korig : number of original comps; only comps at index >= Korig
            are candidates for removal
    kwargs : reads 'refineNumIters', 'birthDebug', 'cleanupDeleteEmpty'
             and 'cleanupMinSize'; also forwarded to calc_local_params

    Returns
    --------
    model : HModel, with K + Kfresh comps
            scale equal to bigData + freshData
    freshSS : SuffStatBag, with K + Kfresh comps
              scale equal to freshData
    freshLP : dict of local parameters for freshData
    xInfo : dict with 'origIDs' (ids of surviving comps) and, when
            birthDebug is set, 'traceBeta', 'traceN', 'traceELBO'

    Updates (in-place)
    ----------
    xbigSS : SuffStatBag, with K + Kfresh comps
             scale with equal to bigData only

    Raises
    ------
    BirthProposalError : if an iteration leaves no comps beyond Korig
    '''
    logPhase('Refinement')
    numIters = kwargs['refineNumIters']
    Kstart = xbigSS.K
    keptIDs = list(range(0, Kstart))

    traceBeta = np.zeros((numIters, Kstart))
    traceN = np.zeros((numIters, Kstart))
    traceELBO = np.zeros(numIters)

    xfreshLP = None
    for it in range(numIters):
        xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
        xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)

        # Columns of removed comps stay at zero in the trace arrays
        traceN[it, keptIDs] = xfreshSS.N
        if kwargs['birthDebug']:
            traceBeta[it, keptIDs] = \
                xbigModel.allocModel.get_active_comp_probs()
            traceELBO[it] = xbigModel.calc_evidence(
                freshData, xfreshSS, xfreshLP)

        if it < 3 or (it + 1) % 5 == 0:
            logPosVector(traceN[it, Korig:], label='iter %3d' % (it + 1))

        # For all but last iteration, attempt removing empty topics
        if kwargs['cleanupDeleteEmpty'] and it < numIters - 1:
            # Walk new comps from highest index down so that earlier
            # removals do not shift the indices still to be visited.
            for k in range(xfreshSS.K - 1, Korig - 1, -1):
                if xfreshSS.N[k] >= kwargs['cleanupMinSize']:
                    continue
                xfreshSS.removeComp(k)
                xbigSS.removeComp(xbigSS.K - 1)  # last in order!
                del keptIDs[k]

        if xfreshSS.K == Korig:
            raise BirthProposalError(
                "BIRTH failed. After refining, no comps > cleanupMinSize.")

        # Temporarily add the fresh stats for the global step, then undo
        xbigSS += xfreshSS
        xbigModel.allocModel.update_global_params(xbigSS)
        xbigModel.obsModel.update_global_params(xbigSS)
        xbigSS -= xfreshSS

    # Final local step so the returned LP/SS match the final global params
    xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
    xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)
    log('Final Assignment Counts')
    logPosVector(xfreshSS.N[Korig:], label='final')

    xInfo = dict()
    if kwargs['birthDebug']:
        xInfo['traceBeta'] = traceBeta
        xInfo['traceN'] = traceN
        xInfo['traceELBO'] = traceELBO
    xInfo['origIDs'] = keptIDs
    return xbigModel, xfreshSS, xfreshLP, xInfo
def create_model_with_new_comps(bigModel, bigSS, freshData,
                                Q=None, Plan=None, **kwargs):
    ''' Create a fresh model (and matching suff stats) for the target data.

    Args
    -------
    bigModel : HModel that is copied to seed the fresh model
    bigSS : not referenced in this function
    freshData : target dataset
    Q : required (not None) when creationRoutine == 'xspectral'
    Plan : dict; Plan['targetWordFreq'] is read when
           creationRoutine == 'targetWordFreq'
    kwargs : reads 'creationRoutine', 'Kfresh', 'creationDoUpdateFresh',
             'creationNumIters', 'birthDebug', 'cleanupDeleteEmpty',
             'cleanupMinSize', 'cleanupDeleteToImproveFresh',
             'cleanupMergeToImproveFresh'

    Returns
    -------
    freshModel : HModel with Kfresh components,
                   scale *may not* be consistent with target dataset
    freshSS : SuffStatBag with Kfresh components,
                   scale will be consistent with target dataset
    Info : dict of diagnostic entries gathered along the way

    Raises
    ------
    BirthProposalError : if fewer than 2 comps remain after cleanup
    '''
    Info = dict()
    freshModel = bigModel.copy()

    routine = kwargs['creationRoutine']
    if routine == 'targetWordFreq':
        freshModel.set_global_params(
            beta=np.ones(1), K=1,
            topics=Plan['targetWordFreq'][np.newaxis, :],
            wordcountTotal=freshData.word_count.sum())
    elif routine == 'findmissingtopics':
        freshModel = create_new_model_findmissingtopics(
            freshModel, freshData, bigModel, **kwargs)
    elif routine == 'xspectral':
        assert Q is not None
        freshModel = create_new_model_expandedspectral(
            freshModel, Q, freshData, bigModel, **kwargs)
    elif routine == 'spectralOnTarget':
        freshModel = create_new_model_spectralOnTarget(
            freshModel, freshData, bigModel, **kwargs)
    else:
        freshModel.init_global_params(freshData,
                                      K=kwargs['Kfresh'],
                                      initname=kwargs['creationRoutine'],
                                      **kwargs)

    logPhase('Creation')
    log('CreationRoutine: ' + kwargs['creationRoutine'], 'debug')
    log('Kfresh=%d' % (freshModel.obsModel.K), 'debug')

    if not kwargs['creationDoUpdateFresh']:
        # Create freshSS that would produce (nearly) same freshModel.obsModel
        # after a call to update_global_params.
        # freshSS was previously used here before ever being assigned;
        # one local/global summary step gives a bag to zero out and refill.
        # NOTE(review): assumes this step yields the intended fields/K.
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshSS._Fields.setAllFieldsToZero()
        if hasattr(freshSS, 'WordCounts'):
            topics = freshSS.WordCounts
            priorvec = freshModel.obsModel.obsPrior.lamvec
            for k in range(freshSS.K):
                topics[k, :] = freshModel.obsModel.comp[k].lamvec - priorvec
            freshSS.setField('WordCounts', topics, dims=('K', 'D'))
        return freshModel, freshSS, Info

    # Record initial model for posterity
    if kwargs['birthDebug']:
        Info['freshModelInit'] = freshModel.copy()

    # Complete several iterations to improve this fresh proposal
    for step in range(kwargs['creationNumIters']):
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if step < 3 or (step + 1) % 10 == 0:
            logPosVector(freshSS.N, label='iter %3d' % (step + 1),
                         level='debug')
        # Stop early once counts change by less than 1.0 between iterations
        if step > 1:
            maxDiff = np.max(np.abs(freshSS.N - prevN))
            if maxDiff < 1.0:
                break
        prevN = freshSS.N.copy()

    logPosVector(freshSS.N, label='after creation', level='moreinfo')
    if kwargs['birthDebug']:
        Info['freshModelRefined'] = freshModel.copy()

    if kwargs['cleanupDeleteEmpty']:
        Kbefore = freshSS.K
        freshModel, freshSS = BirthCleanup.delete_empty_comps(
            freshData, freshModel, freshSS, Korig=0, **kwargs)
        freshLP = freshModel.calc_local_params(freshData)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if freshSS.K < Kbefore:
            msg = 'after remove empty (size < %d)' % (kwargs['cleanupMinSize'])
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if kwargs['cleanupDeleteToImproveFresh']:
        freshModel, freshSS, ELBO = BirthCleanup.delete_comps_to_improve_ELBO(
            freshData, freshModel, LP=freshLP)
        Info['evBound'] = ELBO
        if kwargs['birthDebug']:
            Info['freshModelPostDelete'] = freshModel.copy()
    elif kwargs['cleanupMergeToImproveFresh']:
        Korig = freshSS.K
        while freshSS.K > 1:
            mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
                freshModel, freshSS,
                preselect_routine='wholeELBO',
                doLimitNumPairs=0,
                returnScoreMatrix=1,
                **kwargs)
            freshLP = freshModel.calc_local_params(freshData)
            freshSS = freshModel.get_global_suff_stats(
                freshData, freshLP,
                doPrecompEntropy=1,
                doPrecompMergeEntropy=1,
                mPairIDs=mPairIDs)
            freshModel.update_global_params(freshSS)
            freshELBO = freshModel.calc_evidence(SS=freshSS)
            freshModel, freshSS, freshELBO, mergeInfo = \
                MergeMove.run_many_merge_moves(
                    freshModel, freshSS, freshELBO,
                    mPairIDs, M=MM,
                    isBirthCleanup=1,
                    logFunc=log)
            # Merge the move's info into Info rather than rebinding Info,
            # so entries recorded earlier (e.g. 'freshModelInit') survive.
            Info.update(mergeInfo)
            if len(mergeInfo['AcceptedPairs']) == 0:
                break
        if freshSS.K < Korig:
            msg = 'after merges'
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if freshSS.K < 2:
        msg = "BIRTH failed. Fresh proposal does not prefer multiple comps."
        raise BirthProposalError(msg)

    return freshModel, freshSS, Info
def delete_comps_from_expanded_model_to_improve_ELBO(
        Data, xbigModel, xbigSS, xfreshSS,
        xfreshLP=None, Korig=0, **kwargs):
    ''' Attempts deleting components K, K-1, K-2, ... Korig,
         keeping (and building on) any proposals that improve the ELBO

    Args
    -------
    Data : dataset handed to the candidate builders
    xbigModel : expanded model with K comps
    xbigSS : SuffStatBag with K comps
    xfreshSS : SuffStatBag with K comps
    xfreshLP : local params, used when kwargs['cleanupDeleteViaLP'] is set
    Korig : lowest comp index considered for deletion
    kwargs : reads 'cleanupDeleteViaLP' and
             'cleanupRaiseErrorWhenAllDeleted'

    Returns
    ---------
    xbigModel : HModel with Knew comps
    xbigSS : SuffStatBag with Knew comps
    xfreshSS : SuffStatBag with Knew comps
    xfreshELBO : evidence lower bound for the returned model
    origIDs : list of original comp ids that were kept

    Raises
    ------
    BirthProposalError : if all comps beyond Korig were deleted and
        kwargs['cleanupRaiseErrorWhenAllDeleted'] is set
    '''
    logPhase('Cleanup')
    K = xbigSS.K
    assert xbigSS.K == xfreshSS.K
    assert xbigModel.obsModel.K == K

    # Must be a real list: entries are removed with `del` below, which a
    # Python 3 range object does not support.
    origIDs = list(range(K))
    xfreshELBO = xbigModel.calc_evidence(SS=xfreshSS)
    if K == 1:
        # Same 5-tuple shape as the normal return; the caller unpacks five.
        return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs

    for k in reversed(range(Korig, K)):
        if kwargs['cleanupDeleteViaLP']:
            rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP = \
                _make_xcandidate_LP(
                    xbigModel, Data,
                    xbigSS, xfreshSS, xfreshLP,
                    k, **kwargs)
        else:
            rbigModel, rbigSS, rfreshSS, rfreshELBO = _make_xcandidate(
                xbigModel, Data, xbigSS, xfreshSS, k)

        # If ELBO has improved, set current model to delete component k
        if rfreshELBO >= xfreshELBO:
            log('Deletion accepted.  prop %.5e > cur %.5e'
                % (rfreshELBO, xfreshELBO))
            logPosVector(xfreshSS.N[Korig:])
            xbigSS = rbigSS
            xfreshSS = rfreshSS
            xbigModel = rbigModel
            xfreshELBO = rfreshELBO
            if kwargs['cleanupDeleteViaLP']:
                xfreshLP = rfreshLP
            del origIDs[k]
        if xfreshSS.K == 1:
            break
    # end loop over comps to delete

    if xbigSS.K == Korig and kwargs['cleanupRaiseErrorWhenAllDeleted']:
        log('FAILED. Deleting all new comps improves ELBO.')
        msg = "FAILED. After expansion, deleting all new comps improves ELBO."
        raise BirthProposalError(msg)
    return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs
def run_birth_move(bigModel, bigSS, freshData, Q=None, Plan=None, **kwargsIN):
    ''' Run birth move on provided target data, creating up to Kfresh new comps

    Args
    -------
    bigModel : current HModel; returned unchanged if the move fails
    bigSS : current SuffStatBag (None leads to a "SKIPPED" result)
    freshData : target dataset for the proposal
    Q : forwarded to BirthCreate.create_model_with_new_comps
    Plan : dict describing the target (may be None); keys read here are
           'ktarget', 'targetUID' and 'count'
    kwargsIN : move options; copied locally because 'Kfresh' may be
               reduced to respect 'Kmax'

    Returns
    -------
    bigmodel : expanded model on success, else the input bigModel
    bigSS : expanded suff stats on success, else the input bigSS
    MoveInfo : dict; 'didAddNew' tells whether the proposal was accepted
    '''
    logPhase('Target Data')
    # Plan defaults to None, so guard the membership test.
    if Plan is not None and 'ktarget' in Plan:
        ktarget = Plan['ktarget']
        # bigSS may be None (handled as SKIPPED below); do not touch its
        # attributes here just for the sake of a log line.
        if 'targetUID' in Plan and bigSS is not None:
            know = np.flatnonzero(bigSS.uIDs == Plan['targetUID'])
            if know.size == 1:
                sizeNow = bigSS.getCountVec()[know[0]]
            else:
                sizeNow = 0
            log('target comp = %d. Size now %d. Size at selection %d.'
                % (Plan['targetUID'], sizeNow, Plan['count']),
                'moreinfo')
        else:
            log('ktarget= %d.' % (ktarget), 'moreinfo')
    log(freshData.get_stats_summary(), 'debug')

    kwargs = dict(**kwargsIN)  # make local copy!
    origids = dict(bigModel=id(bigModel), bigSS=id(bigSS))

    try:
        if bigSS is None:
            msg = "SKIPPED. SS must be valid SuffStatBag, not None."
            raise BirthProposalError(msg)

        # Never propose more comps than Kmax allows
        if bigSS.K + kwargs['Kfresh'] > kwargs['Kmax']:
            kwargs['Kfresh'] = kwargs['Kmax'] - bigSS.K

        if kwargs['Kfresh'] < 1:
            msg = "SKIPPED. Reached upper limit of Kmax=%d comps."
            msg = msg % (kwargs['Kmax'])
            raise BirthProposalError(msg)

        # Determine baseline ELBO
        if kwargs['birthVerifyELBOIncrease']:
            curbigModel = bigModel.copy()
            nStep = 3
            curfreshLP = None
            for step in range(nStep):
                doELBO = (step == nStep - 1)  # only on last step
                curfreshLP = curbigModel.calc_local_params(
                    freshData, curfreshLP, **kwargs)
                curfreshSS = curbigModel.get_global_suff_stats(
                    freshData, curfreshLP, doPrecompEntropy=doELBO)
                if not doELBO:  # all but the last step
                    curbigModel.update_global_params(bigSS + curfreshSS)
            curELBO = curbigModel.calc_evidence(SS=curfreshSS)

        # Create freshModel, freshSS, both with Kfresh comps
        #  freshSS has scale freshData
        #  freshModel has arbitrary scale
        freshModel, freshSS, freshInfo = \
            BirthCreate.create_model_with_new_comps(
                bigModel, bigSS, freshData, Q=Q, Plan=Plan, **kwargs)

        # Visualize, if desired
        if 'doVizBirth' in kwargs and kwargs['doVizBirth']:
            VizBirth.viz_birth_proposal(bigModel, freshModel, Plan,
                                        curELBO=None, propELBO=None,
                                        **kwargs)
            input('>>>')  # waits for the user before closing the figures
            from matplotlib import pylab
            pylab.close('all')

        # Create xbigModel and xbigSS, with K + Kfresh comps
        #  freshData can be assigned to any of the K+Kfresh comps
        #  so, any of the K+Kfresh comps may be changed
        #  but original comps won't lose influence of bigSS
        # * xbigSS has scale bigData + freshData
        # * xbigModel has scale bigData + freshData
        if kwargs['expandOrder'] == 'expandThenRefine':
            xbigModel, xbigSS, xfreshSS, xInfo = \
                BirthRefine.expand_then_refine(
                    freshModel, freshSS, freshData,
                    bigModel, bigSS, **kwargs)
        else:
            raise NotImplementedError('TODO')

        if kwargs['birthVerifyELBOIncrease']:
            logPhase('Evaluation')
            assert xfreshSS.hasELBOTerms()
            propELBO = xbigModel.calc_evidence(SS=xfreshSS)
            didPass, ELBOmsg = make_acceptance_decision(curELBO, propELBO)
            log(ELBOmsg)
        else:
            didPass = True
            ELBOmsg = ''
            propELBO = None  # needed for kwarg for viz_birth_proposal
            curELBO = None

        Kcur = bigSS.K
        Ktotal = xbigSS.K
        birthCompIDs = list(range(Kcur, Ktotal))

        # Reject. Abandon the move.
        if not didPass:
            msg = "BIRTH REJECTED. Did not explain target better than current."
            raise BirthProposalError(msg)

        assert xbigModel.obsModel.K == xbigSS.K
        # Create dict of info about this birth move
        msg = 'BIRTH ACCEPTED. %d fresh comps.' % (len(birthCompIDs))
        log(msg, 'info')

        MoveInfo = dict(
            didAddNew=True,
            msg=msg,
            AdjustInfo=xInfo['AInfo'],
            ReplaceInfo=xInfo['RInfo'],
            modifiedCompIDs=[],
            birthCompIDs=birthCompIDs,
            Korig=bigSS.K,
        )
        MoveInfo.update(xInfo)
        MoveInfo.update(freshInfo)

        assert not xbigSS.hasELBOTerms()
        assert not xbigSS.hasMergeTerms()
        xfreshSS.removeELBOTerms()
        if kwargs['birthRetainExtraMass']:
            MoveInfo['extraSS'] = xfreshSS
            MoveInfo['modifiedCompIDs'] = list(range(Ktotal))
        else:
            # Restore xbigSS to same scale as original "big" dataset
            xbigSS -= xfreshSS
            assert np.allclose(xbigSS.N.sum(), bigSS.N.sum())

        # Carry over the original bag's merge/ELBO terms, padded with
        # empty entries for the newly born comps.
        if bigSS.hasMergeTerms():
            MergeTerms = bigSS._MergeTerms.copy()
            MergeTerms.insertEmptyComps(Ktotal - Kcur)
            xbigSS.restoreMergeTerms(MergeTerms)
        if bigSS.hasELBOTerms():
            ELBOTerms = bigSS._ELBOTerms.copy()
            ELBOTerms.insertEmptyComps(Ktotal - Kcur)
            if xInfo['AInfo'] is not None:
                for key in xInfo['AInfo']:
                    if hasattr(ELBOTerms, key):
                        arr = getattr(ELBOTerms, key) \
                            + bigSS.nDoc * xInfo['AInfo'][key]
                        ELBOTerms.setField(key, arr, dims='K')
            if xInfo['RInfo'] is not None:
                for key in xInfo['RInfo']:
                    if hasattr(ELBOTerms, key):
                        ELBOTerms.setField(
                            key, bigSS.nDoc * xInfo['RInfo'][key], dims=None)
            xbigSS.restoreELBOTerms(ELBOTerms)

        return xbigModel, xbigSS, MoveInfo
    except BirthProposalError as e:
        # We execute this code when birth fails for any reason, including:
        #  * user-specified Kmax limit reached
        #  * cleanup phase removed all new components

        # Verify guarantees that input model and input suff stats haven't
        # changed
        assert origids['bigModel'] == id(bigModel)
        assert origids['bigSS'] == id(bigSS)

        # Write reason for failure to log
        log(str(e), 'moreinfo')

        # Return failure info
        MoveInfo = dict(didAddNew=False,
                        msg=str(e),
                        modifiedCompIDs=[],
                        birthCompIDs=[])
        return bigModel, bigSS, MoveInfo