def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    ''' Calculate the sufficient statistics for global parameter updates.

    Only adds stats relevant for this allocModel.
    Other stats are added by the obsModel.

    Args
    -------
    Data : bnpy data object
        Only its ``dim`` attribute is read here.
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag (default None, treated as False)
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    **kwargs : accepted and ignored

    Returns
    -------
    SS : SuffStats for K components, with field
        N : vector of length-K,
            effective number of observations assigned to each comp
        When doPrecompEntropy is truthy, also carries the ELBO term
        ElogqZ : result of self.E_logqZ(LP), registered with dims 'K'
    '''
    Nvec = np.sum(LP['resp'], axis=0)
    SS = SuffStatBag(K=Nvec.size, D=Data.dim)
    SS.setField('N', Nvec, dims=('K'))
    # Test truthiness instead of "is not None": an explicit False/0 flag
    # must disable the entropy precomputation, not trigger it.
    if doPrecompEntropy:
        ElogqZ_vec = self.E_logqZ(LP)
        SS.setELBOTerm('ElogqZ', ElogqZ_vec, dims=('K'))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    """ Calculate the sufficient statistics for global parameter updates.

    Only adds stats relevant for this allocModel.
    Other stats are added by the obsModel.

    Args
    -------
    Data : bnpy data object
        Only its ``dim`` attribute is read here.
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag (default None, treated as False)
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    **kwargs : accepted and ignored

    Returns
    -------
    SS : SuffStats for K components, with field
        N : vector of length-K,
            effective number of observations assigned to each comp
        When doPrecompEntropy is truthy, also carries the ELBO term
        ElogqZ : result of self.E_logqZ(LP), registered with dims "K"
    """
    Nvec = np.sum(LP["resp"], axis=0)
    SS = SuffStatBag(K=Nvec.size, D=Data.dim)
    SS.setField("N", Nvec, dims=("K"))
    # A plain truthiness test lets callers switch the precomputation off
    # with False/0; the previous "is not None" check treated those as on.
    if doPrecompEntropy:
        ElogqZ_vec = self.E_logqZ(LP)
        SS.setELBOTerm("ElogqZ", ElogqZ_vec, dims=("K"))
    return SS
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    ''' Summarize the local parameters of one data slice.

    Args
    -------
    Data : data object; doc_range, dim and nDoc are read here
    LP : local param dict; must hold 'resp' and either
        'TransCount' or 'respPair'
    doPrecompEntropy : if truthy (or if LP already holds 'Htable'),
        store ELBO terms 'Htable' and 'Hstart'
    doPrecompMergeEntropy : if truthy, store merge terms for mPairIDs
    mPairIDs : candidate merge pairs, or None
    trackDocUsage : if truthy, store selection term 'DocUsageCount'

    Returns
    -------
    SS : SuffStatBag
        with fields StartStateCount, TransStateCount, N and nDoc
    '''
    nPairs = 0 if mPairIDs is None else len(mPairIDs)

    respMat = LP['resp']
    nStates = respMat.shape[1]

    # Rows of resp sitting at the first position of every sequence.
    firstIDs = Data.doc_range[:-1]
    startCounts = np.sum(respMat[firstIDs], axis=0)
    stateCounts = np.sum(respMat, axis=0)

    # Prefer pre-aggregated transition counts when LP provides them.
    transKey = 'TransCount' if 'TransCount' in LP else 'respPair'
    transCounts = np.sum(LP[transKey], axis=0)

    SS = SuffStatBag(K=nStates, D=Data.dim, M=nPairs)
    SS.setField('StartStateCount', startCounts, dims=('K'))
    SS.setField('TransStateCount', transCounts, dims=('K', 'K'))
    SS.setField('N', stateCounts, dims=('K'))
    SS.setField('nDoc', Data.nDoc, dims=None)

    if doPrecompEntropy or 'Htable' in LP:
        # The memoized dict carries both entropy tables.
        memo = calcELBO_NonlinearTerms(Data=Data,
                                       LP=LP, returnMemoizedDict=1)
        SS.setELBOTerm('Htable', memo['Htable'], dims=('K', 'K'))
        SS.setELBOTerm('Hstart', memo['Hstart'], dims=('K'))

    if doPrecompMergeEntropy:
        mergeHstart, mergeHtable = \
            HMMUtil.PrecompMergeEntropy_SpecificPairs(LP, Data, mPairIDs)
        SS.setMergeTerm('Hstart', mergeHstart, dims=('M'))
        SS.setMergeTerm('Htable', mergeHtable, dims=('M', 2, 'K'))
        SS.mPairIDs = np.asarray(mPairIDs)

    if trackDocUsage:
        # For each state, count the sequences in which its summed
        # responsibility exceeds 0.01.
        usage = np.zeros(nStates)
        bounds = Data.doc_range
        for seqID in range(Data.nDoc):
            seqResp = LP['resp'][bounds[seqID]:bounds[seqID + 1]]
            usage += np.sum(seqResp, axis=0) > 0.01
        SS.setSelectionTerm('DocUsageCount', usage, dims='K')
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Build the allocation-model sufficient statistics.

    Only adds stats relevant for this allocModel.
    Other stats are added by the obsModel.

    Args
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        precompute the ELBO entropy term (memoized learning, moVB)
    doPrecompMergeEntropy : boolean flag
        precompute entropy terms for all pair-wise merges of components
    mPairIDs : not used by this implementation

    Returns
    -------
    SS : SuffStats for K components, with field
        N : vector of length-K,
            effective number of observations assigned to each comp
    '''
    resp = LP['resp']
    counts = np.sum(resp, axis=0)

    SS = SuffStatBag(K=counts.size, D=Data.dim)
    SS.setField('N', counts, dims=('K'))

    if doPrecompEntropy:
        SS.setELBOTerm('ElogqZ', self.E_logqZ(LP), dims=('K'))

    if doPrecompMergeEntropy:
        # Strictly upper-triangular K x K table; entry (i, j) with i < j
        # is sum_n r*log(r + EPS) for r = resp[:, i] + resp[:, j].
        # Example with 3 components {0,1,2}:
        #   [ 0 H(0,1) H(0,2)
        #     0   0    H(1,2)
        #     0   0      0    ]
        nComp = self.K
        mergeH = np.zeros((nComp, nComp))
        for a in range(nComp):
            partners = np.arange(a + 1, nComp)
            merged = resp[:, a:a + 1] + resp[:, partners]
            mergeH[a, partners] = np.sum(
                merged * np.log(merged + EPS), axis=0)
        SS.setMergeTerm('ElogqZ', mergeH, dims=('K', 'K'))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    ''' Calculate sufficient statistics.

    Admixture models have no suff stats for allocation, so only the
    document count (and, on request, ELBO terms) is recorded.

    Args
    -------
    Data : data object; vocab_size and nDoc are read here
    LP : local param dict with 2D field 'word_variational'
    doPrecompEntropy : if truthy, store the four ELBO terms
        ElogpZ, ElogqZ, ElogpPi and ElogqPi

    Returns
    -------
    SS : SuffStatBag with field nDoc
    '''
    _unused, K = LP['word_variational'].shape
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)

    if not doPrecompEntropy:
        return SS

    # (term name, method computing it, dims) in the order they are stored.
    termSpecs = (
        ('ElogpZ', self.E_log_pZ, 'K'),
        ('ElogqZ', self.E_log_qZ, 'K'),
        ('ElogpPi', self.E_log_pPI, None),
        ('ElogqPi', self.E_log_qPI, None),
    )
    for termName, calcTerm, termDims in termSpecs:
        SS.setELBOTerm(termName, calcTerm(Data, LP), dims=termDims)
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    ''' Calculate sufficient statistics.

    Args
    -------
    Data : data object; get_dim() and nDoc are read here
    LP : local param dict with fields 'resp' (2D), 'ElogV', 'Elog1mV'
    doPrecompEntropy : if truthy, store ELBO terms ElogqZ and VZlocal

    Returns
    -------
    SS : SuffStatBag with fields
        nDoc : Data.nDoc
        sumLogVd : sum over axis 0 of LP['ElogV']
        sumLog1mVd : sum over axis 0 of LP['Elog1mV']
    '''
    _unused, K = LP['resp'].shape
    SS = SuffStatBag(K=K, D=Data.get_dim())
    SS.setField('nDoc', Data.nDoc, dims=None)

    # Each field is the column-wise total of one LP array.
    for fieldName, lpKey in (('sumLogVd', 'ElogV'),
                             ('sumLog1mVd', 'Elog1mV')):
        SS.setField(fieldName, np.sum(LP[lpKey], axis=0), dims='K')

    if not doPrecompEntropy:
        return SS

    entropyTerm = self.E_logqZ(Data, LP)
    localTerm = self.E_logpVZ_logqV(Data, LP)
    SS.setELBOTerm('ElogqZ', entropyTerm, dims='K')
    SS.setELBOTerm('VZlocal', localTerm, dims=None)
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Summarize responsibilities into allocation sufficient statistics.

    Only adds stats relevant for this allocModel.
    Other stats are added by the obsModel.

    Args
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        precompute the ELBO entropy term (memoized learning, moVB)
    doPrecompMergeEntropy : boolean flag
        precompute entropy terms for every pair of components,
        used for optional merge moves
    mPairIDs : not used by this implementation

    Returns
    -------
    SS : SuffStats for K components, with field
        N : vector of length-K,
            effective number of observations assigned to each comp
    '''
    respMat = LP['resp']
    Nvec = np.sum(respMat, axis=0)
    SS = SuffStatBag(K=Nvec.size, D=Data.dim)
    SS.setField('N', Nvec, dims=('K'))

    if doPrecompEntropy:
        entropyVec = self.E_logqZ(LP)
        SS.setELBOTerm('ElogqZ', entropyVec, dims=('K'))

    if doPrecompMergeEntropy:
        # Row kk holds the merge terms of component kk with every
        # higher-indexed component; the diagonal and lower triangle
        # stay zero.
        K = self.K
        Hmerge = np.zeros((K, K))
        for kk in range(K):
            laterIDs = np.arange(kk + 1, K)
            respSum = respMat[:, kk:kk + 1] + respMat[:, laterIDs]
            rowTerms = np.sum(respSum * np.log(respSum + EPS), axis=0)
            Hmerge[kk, laterIDs] = rowTerms
        SS.setMergeTerm('ElogqZ', Hmerge, dims=('K', 'K'))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
    ''' Compute sufficient stats for provided dataset and local params.

    Args
    -------
    Data : data object; nNodes, dim, getSparseSrcNodeMat() and
        getSparseRcvNodeMat() are used here
    LP : local param dict with field 'resp'
        (3D; documented in the original notes as nEdges x K x K)
    doPrecompEntropy : if truthy, store the scalar ELBO term 'Hresp'

    Returns
    -------
    SS : SuffStatBag
        Updated fields
        * NodeStateCount : 2D array, nNodes x K
        * N : 2D array, size K x K
    '''
    respArr = LP['resp']
    K = respArr.shape[-1]
    SS = SuffStatBag(K=K, D=Data.dim, V=Data.nNodes)

    # Counts per node acting as source: collapse the last axis of resp,
    # then aggregate edges onto nodes with the sparse source matrix.
    srcCounts = Data.getSparseSrcNodeMat() * respArr.sum(axis=2)
    # Counts per node acting as receiver: same idea over axis 1.
    rcvCounts = Data.getSparseRcvNodeMat() * respArr.sum(axis=1)

    # Source and receiver counts together give the per-node total.
    SS.setField('NodeStateCount', srcCounts + rcvCounts, dims=('V', 'K'))

    # Total mass assigned to each pair of clusters.
    SS.setField('N', np.sum(respArr, axis=0), dims=('K', 'K'))

    if doPrecompEntropy:
        # Entropy is tracked as one scalar rather than a K x K table.
        SS.setELBOTerm('Hresp', calcLentropyAsScalar(LP), dims=None)
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    ''' Create sufficient stats needed for global param updates.

    Args
    -------
    Data : bnpy data object
        doc_range and dim are read here.
    LP : Dictionary containing the local parameters. Expected to contain:
        resp : Data.nObs x K array
        respPair : Data.nObs x K x K array
            (from the def. of respPair, note respPair[0,:,:] is undefined)
    doPrecompEntropy : boolean flag (default None, treated as False)
        if truthy, store the ELBO term 'Elogqz'
    **kwargs : accepted and ignored

    Returns
    -------
    SS : SuffStatBag with fields
        StartStateCount : A vector of length K, the sum of resp over
            the first position of every sequence
            (rows Data.doc_range[:-1])
        TransStateCount : A K x K matrix where
            TransStateCount[j,k] = sum_n respPair[n,j,k]
        N : A vector of length K with entry k being
            sum_{n=1}^Data.nObs resp[n,k]

    The first two of these are used by FiniteHMM.update_global_params,
    and the third is used by ObsModel.update_global_params.

    (see the documentation for information about resp and respPair)
    '''
    resp = LP['resp']
    respPair = LP['respPair']
    K = resp.shape[1]
    startLocIDs = Data.doc_range[:-1]

    StartStateCount = np.sum(resp[startLocIDs], axis=0)
    N = np.sum(resp, axis=0)
    TransStateCount = np.sum(respPair, axis=0)

    SS = SuffStatBag(K=K, D=Data.dim)
    SS.setField('StartStateCount', StartStateCount, dims=('K'))
    SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
    SS.setField('N', N, dims=('K'))

    # Truthiness test: an explicit False/0 must skip the entropy term.
    # The earlier "is not None" check computed it for any explicit value.
    if doPrecompEntropy:
        entropy = self.elbo_entropy(Data, LP)
        SS.setELBOTerm('Elogqz', entropy, dims=None)
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    """ Count expected number of times each topic is used across all docs.

    Args
    -------
    Data : data object; vocab_size and nDoc are read here
    LP : local param dict with fields
        word_variational : 2D array whose second dimension is K
        E_logPi : 2D array, summed over axis 0 here
        DocTopicFrac : optional 2D array; left unmodified
    doPrecompEntropy : if truthy, store ELBO terms ElogpZ,
        ElogqPiConst, ElogqPiActive and ElogqPiUnused
    doPrecompMergeEntropy : if truthy, store merge terms ElogpZ,
        ElogqPiActive and sumLogPiActive
    mPairIDs : not used by this implementation

    Returns
    -------
    SS : SuffStatBag with fields nDoc, sumLogPiActive, sumLogPiUnused
        and, when LP has 'DocTopicFrac', Nmajor
    """
    wv = LP["word_variational"]
    _, K = wv.shape
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField("nDoc", Data.nDoc, dims=None)

    # The first K entries are stored as the active topics and the final
    # entry as the unused remainder (presumably E_logPi has K+1 columns;
    # confirm against the local step).
    sumLogPi = np.sum(LP["E_logPi"], axis=0)
    SS.setField("sumLogPiActive", sumLogPi[:K], dims="K")
    SS.setField("sumLogPiUnused", sumLogPi[-1], dims=None)

    if "DocTopicFrac" in LP:
        # Threshold a copy: zeroing entries of LP["DocTopicFrac"] itself
        # would silently alter the caller's local parameters.
        Nmajor = LP["DocTopicFrac"].copy()
        Nmajor[Nmajor < 0.05] = 0
        SS.setField("Nmajor", np.sum(Nmajor, axis=0), dims="K")

    if doPrecompEntropy:
        # ---------------- Z terms
        SS.setELBOTerm("ElogpZ", self.E_logpZ(Data, LP), dims="K")
        # ---------------- Pi terms
        # Note: no terms needed for ElogpPI
        # SS already has the sumLogPi fields, which suffice for that term
        ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
        SS.setELBOTerm("ElogqPiConst", ElogqPiC, dims=None)
        SS.setELBOTerm("ElogqPiActive", ElogqPiA, dims="K")
        SS.setELBOTerm("ElogqPiUnused", ElogqPiU, dims=None)

    if doPrecompMergeEntropy:
        ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
        SS.setMergeTerm("ElogpZ", ElogpZMat, dims=("K", "K"))
        SS.setMergeTerm("ElogqPiActive", ElogqPiMat, dims=("K", "K"))
        SS.setMergeTerm("sumLogPiActive", sLgPiMat, dims=("K", "K"))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
    ''' Compute sufficient stats for provided dataset and local params.

    Args
    -------
    Data : data object; nNodes, dim and nEdges are read here
    LP : local param dict with field 'resp'. When 'NodeStateCount' is
        missing, LP is rebuilt via self.initLPFromResp(Data, LP).
        Fields 'N_fg' and (if doPrecompEntropy) 'Lentropy_fg' and
        'Lentropy_bg' are read; 'Ldata_bg' is optional.
    doPrecompEntropy : if truthy, store ELBO terms Hresp and Hresp_bg
    **kwargs : accepted and ignored

    Returns
    -------
    SS : SuffStatBag with K components and fields
        * NodeStateCount : registered with dims ('V', 'K')
        * N : LP['N_fg'], registered with dims ('K',)
        * scaleFactor : Data.nEdges
    '''
    V = Data.nNodes
    K = LP['resp'].shape[-1]
    SS = SuffStatBag(K=K, D=Data.dim, V=V)
    if 'NodeStateCount' not in LP:
        assert 'resp' in LP
        LP = self.initLPFromResp(Data, LP)
    SS.setField('NodeStateCount', LP['NodeStateCount'], dims=('V', 'K'))
    if np.allclose(LP['resp'].sum(axis=1).min(), 1.0):
        # If the LP fully represents all present edges,
        # then the NodeStateCount should as well.
        # Compare the TOTAL count with 2 * nEdges (presumably each edge
        # adds one source and one receiver count). An element-wise
        # comparison of the V x K table with that scalar could only
        # pass if every single entry equalled 2 * nEdges.
        assert np.allclose(SS.NodeStateCount.sum(), Data.nEdges * 2)
    SS.setField('N', LP['N_fg'], dims=('K', ))
    SS.setField('scaleFactor', Data.nEdges, dims=None)

    if 'Ldata_bg' in LP:
        SS.setELBOTerm('Ldata_bg', LP['Ldata_bg'], dims=None)

    if doPrecompEntropy:
        Hresp_fg = LP['Lentropy_fg']  # = -1 * calcRlogR(LP['resp'])
        Hresp_bg = LP['Lentropy_bg']
        SS.setELBOTerm('Hresp', Hresp_fg, dims='K')
        SS.setELBOTerm('Hresp_bg', Hresp_bg, dims=None)
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Count expected number of times each topic is used across all docs.

    Args
    -------
    Data : data object; vocab_size and nDoc are read here
    LP : local param dict with fields
        word_variational : 2D array whose second dimension is K
        E_logPi : 2D array, summed over axis 0 here
        DocTopicFrac : optional 2D array; left unmodified
    doPrecompEntropy : if truthy, store ELBO terms ElogpZ,
        ElogqPiConst, ElogqPiActive and ElogqPiUnused
    doPrecompMergeEntropy : if truthy, store merge terms ElogpZ,
        ElogqPiActive and sumLogPiActive
    mPairIDs : not used by this implementation

    Returns
    -------
    SS : SuffStatBag with fields nDoc, sumLogPiActive, sumLogPiUnused
        and, when LP has 'DocTopicFrac', Nmajor
    '''
    wv = LP['word_variational']
    _, K = wv.shape
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)

    # First K entries go to the active topics, the last entry to the
    # unused remainder (presumably E_logPi has K+1 columns; confirm).
    sumLogPi = np.sum(LP['E_logPi'], axis=0)
    SS.setField('sumLogPiActive', sumLogPi[:K], dims='K')
    SS.setField('sumLogPiUnused', sumLogPi[-1], dims=None)

    if 'DocTopicFrac' in LP:
        # Work on a copy so the thresholding below does not overwrite
        # the caller's LP['DocTopicFrac'] array in place.
        Nmajor = LP['DocTopicFrac'].copy()
        Nmajor[Nmajor < 0.05] = 0
        SS.setField('Nmajor', np.sum(Nmajor, axis=0), dims='K')

    if doPrecompEntropy:
        # ---------------- Z terms
        SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
        # ---------------- Pi terms
        # Note: no terms needed for ElogpPI
        # SS already has the sumLogPi fields, which suffice for that term
        ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
        SS.setELBOTerm('ElogqPiConst', ElogqPiC, dims=None)
        SS.setELBOTerm('ElogqPiActive', ElogqPiA, dims='K')
        SS.setELBOTerm('ElogqPiUnused', ElogqPiU, dims=None)

    if doPrecompMergeEntropy:
        ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
        SS.setMergeTerm('ElogpZ', ElogpZMat, dims=('K', 'K'))
        SS.setMergeTerm('ElogqPiActive', ElogqPiMat, dims=('K', 'K'))
        SS.setMergeTerm('sumLogPiActive', sLgPiMat, dims=('K', 'K'))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Count expected number of times each topic is used across all docs.

    Args
    -------
    Data : data object; vocab_size and nDoc are read here
    LP : local param dict with fields
        word_variational : 2D array whose second dimension is K
        E_logPi : 2D array, summed over axis 0 here
        DocTopicFrac : optional 2D array; left unmodified
    doPrecompEntropy : if truthy, store ELBO terms ElogpZ,
        ElogqPiConst, ElogqPiActive and ElogqPiUnused
    doPrecompMergeEntropy : if truthy, store merge terms ElogpZ,
        ElogqPiActive and sumLogPiActive
    mPairIDs : not used by this implementation

    Returns
    -------
    SS : SuffStatBag with fields nDoc, sumLogPiActive, sumLogPiUnused
        and, when LP has 'DocTopicFrac', Nmajor
    '''
    wv = LP['word_variational']
    _, K = wv.shape
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)

    # Active topics take the first K entries; the trailing entry is kept
    # separately as the unused remainder (presumably E_logPi has K+1
    # columns; confirm against the local step).
    sumLogPi = np.sum(LP['E_logPi'], axis=0)
    SS.setField('sumLogPiActive', sumLogPi[:K], dims='K')
    SS.setField('sumLogPiUnused', sumLogPi[-1], dims=None)

    if 'DocTopicFrac' in LP:
        # Copy before thresholding; masking LP['DocTopicFrac'] directly
        # would change the caller's local parameters as a side effect.
        Nmajor = LP['DocTopicFrac'].copy()
        Nmajor[Nmajor < 0.05] = 0
        SS.setField('Nmajor', np.sum(Nmajor, axis=0), dims='K')

    if doPrecompEntropy:
        # Z terms
        SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
        # Pi terms
        # Note: no terms needed for ElogpPI
        # SS already has the sumLogPi fields, which suffice for that term
        ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
        SS.setELBOTerm('ElogqPiConst', ElogqPiC, dims=None)
        SS.setELBOTerm('ElogqPiActive', ElogqPiA, dims='K')
        SS.setELBOTerm('ElogqPiUnused', ElogqPiU, dims=None)

    if doPrecompMergeEntropy:
        ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
        SS.setMergeTerm('ElogpZ', ElogpZMat, dims=('K', 'K'))
        SS.setMergeTerm('ElogqPiActive', ElogqPiMat, dims=('K', 'K'))
        SS.setMergeTerm('sumLogPiActive', sLgPiMat, dims=('K', 'K'))
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False,
                          doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Count expected number of times each topic is used across all docs.

    Args
    -------
    Data : data object; vocab_size and nDoc are read here
    LP : local param dict with fields 'DocTopicCount' (2D, second
        dimension K) and 'E_logPi'
    doPrecompEntropy : if truthy, store ELBO terms for Z, the
        factorial terms and Pi
    doPrecompMergeEntropy : if truthy, store the merge terms
    mPairIDs : forwarded to self.memo_factorial_term_for_merge

    Returns
    -------
    SS : SuffStatBag with fields nDoc, sumLogPiActive, sumLogPiUnused
    '''
    nTopics = LP['DocTopicCount'].shape[1]
    SS = SuffStatBag(K=nTopics, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)

    logPiTotals = np.sum(LP['E_logPi'], axis=0)
    SS.setField('sumLogPiActive', logPiTotals[:nTopics], dims='K')
    SS.setField('sumLogPiUnused', logPiTotals[-1], dims=None)

    if doPrecompEntropy:
        # Terms that depend on the assignments Z.
        SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
        factData, factZ = self.E_logfactorialZ(Data, LP)
        SS.setELBOTerm('logFactData', factData, dims=None)
        SS.setELBOTerm('logFactZ', factZ, dims='K')

        # Terms that depend on Pi. Nothing extra is stored for ElogpPI;
        # the sumLogPi fields above already cover it.
        qPiConst, qPiActive, qPiUnused = self.E_logqPi_Memoized_from_LP(LP)
        SS.setELBOTerm('ElogqPiConst', qPiConst, dims=None)
        SS.setELBOTerm('ElogqPiActive', qPiActive, dims='K')
        SS.setELBOTerm('ElogqPiUnused', qPiUnused, dims=None)

    if doPrecompMergeEntropy:
        pairDims = ('K', 'K')
        mElogpZ, mSumLogPi, mElogqPi = self.memo_elbo_terms_for_merge(LP)
        SS.setMergeTerm('ElogpZ', mElogpZ, dims=pairDims)
        SS.setMergeTerm('ElogqPiActive', mElogqPi, dims=pairDims)
        SS.setMergeTerm('sumLogPiActive', mSumLogPi, dims=pairDims)
        mFactZ = self.memo_factorial_term_for_merge(LP, mPairIDs)
        SS.setMergeTerm('logFactZ', mFactZ, dims=pairDims)
    return SS
def calcSummaryStats(Dslice, LP=None, alpha=None,
                     doPrecompEntropy=False,
                     cslice=(0, None),
                     **kwargs):
    r""" Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Dslice : bnpy data object
        dim and nDoc are read here.
    LP : local param dict with fields
        DocTopicCount : 2D array whose second dimension is K
        theta : required when doPrecompEntropy is true
        nnzPerRow : optional; the value 1 selects the zero-entropy branch
    alpha : forwarded to L_alloc
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    cslice : pair (start, stop)
        When stop is None the document count is Dslice.nDoc,
        otherwise it is stop - start.

    Returns
    -------
    SS : SuffStatBag with K components
        * nDoc : scalar float
            Counts total documents available in provided data.

        Also has optional ELBO fields when doPrecompEntropy is True
        * L_alloc : value returned by L_alloc(Dslice, LP, alpha=alpha)
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
            (stored as scalar 0.0 when LP['nnzPerRow'] == 1)
    """
    K = LP['DocTopicCount'].shape[1]
    SS = SuffStatBag(K=K, D=Dslice.dim)

    # Choose the document count once. Previously an unconditional
    # setField('nDoc', Dslice.nDoc) after this branch overwrote the
    # cslice-based count, which made the cslice argument ineffective.
    if cslice[1] is None:
        nDoc = Dslice.nDoc
    else:
        nDoc = cslice[1] - cslice[0]
    SS.setField('nDoc', nDoc, dims=None)

    if doPrecompEntropy:
        assert 'theta' in LP
        Lalloc = L_alloc(Dslice, LP, alpha=alpha)
        SS.setELBOTerm('L_alloc', Lalloc, dims=None)

        if 'nnzPerRow' in LP and LP['nnzPerRow'] == 1:
            # One nonzero entry per row: entropy recorded as zero.
            SS.setELBOTerm('Hvec', 0.0, dims=None)
        else:
            Hvec = L_entropy(Dslice, LP, returnVector=1)
            SS.setELBOTerm('Hvec', Hvec, dims='K')
    return SS
def calcSummaryStats(Dslice, LP=None,
                     alpha=None,
                     alphaEbeta=None,
                     doTrackTruncationGrowth=0,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mergePairSelection=None,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    r""" Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Dslice : bnpy data object
        dim and nDoc are read here.
    LP : local param dict with fields
        DocTopicCount : 2D array whose second dimension is K
        theta, thetaRem : read when the cached fields below are absent
        NOTE: 'digammaSumTheta', 'ElogPi' and 'ElogPiRem' are computed
        and stored into LP in place when they are missing.
    alphaEbeta : indexable by topic id; required for merge terms
    doTrackTruncationGrowth : if truthy, the remainder mass is stored as
        a length-K vector 'sumLogPiRemVec' instead of scalar 'sumLogPiRem'
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        precompute merge terms for the pairs in mPairIDs
    mergePairSelection : optional; if it contains 'corr', doc-topic
        correlation selection terms are stored
    mPairIDs : list of (kA, kB) pairs, or None
    trackDocUsage : if truthy, store 'DocUsageCount' and 'SumPi'

    Returns
    -------
    SS : SuffStatBag with K components
        Relevant fields
        * nDoc : scalar float
            Counts total documents available in provided data.
        * sumLogPi : 1D array, size K
            Entry k equals \sum_{d in docs} E[ \log \pi_{dk} ]
        * sumLogPiRem : scalar float
            Equals the sum over docs of LP['ElogPiRem'].

        Also has optional ELBO fields when doPrecompEntropy is True
        * Hresp : 1D array, size K (or a scalar float)
            Vector of entropy contributions from each comp.
            Hresp[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
        * slackTheta, gammalnTheta and related remainder terms

    Raises
    -------
    NotImplementedError
        if doPrecompMergeEntropy is truthy and mPairIDs is None
    """
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)

    K = LP['DocTopicCount'].shape[1]
    if 'digammaSumTheta' not in LP:
        digammaSumTheta = digamma(LP['theta'].sum(axis=1) + LP['thetaRem'])
        LP['digammaSumTheta'] = digammaSumTheta  # Used for merges

    if 'ElogPi' not in LP:
        LP['ElogPiRem'] = digamma(LP['thetaRem']) - LP['digammaSumTheta']
        LP['ElogPi'] = digamma(LP['theta']) - \
            LP['digammaSumTheta'][:, np.newaxis]

    SS = SuffStatBag(K=K, D=Dslice.dim, M=M)
    SS.setField('nDoc', Dslice.nDoc, dims=None)
    SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
    if 'ElogPiEmptyComp' in LP:
        sumLogPiEmptyComp = np.sum(LP['ElogPiEmptyComp']) - \
            np.sum(LP['ElogPiOrigComp'])
        SS.setField('sumLogPiEmptyComp', sumLogPiEmptyComp, dims=None)
    if doTrackTruncationGrowth:
        # Place the whole remainder mass in the last of the K slots.
        remvec = np.zeros(K)
        remvec[K - 1] = np.sum(LP['ElogPiRem'])
        SS.setField('sumLogPiRemVec', remvec, dims='K')
    else:
        SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(Data=Dslice,
                                        LP=LP, returnMemoizedDict=1)
        # NOTE(review): exact-type check kept on purpose to preserve
        # behavior; a numpy float64 scalar takes the vector branch.
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

        SS.setELBOTerm('slackTheta', Mdict['slackTheta'], dims='K')
        SS.setELBOTerm('gammalnTheta', Mdict['gammalnTheta'], dims='K')
        if 'ElogPiEmptyComp' in LP:
            SS.setELBOTerm('slackThetaEmptyComp',
                           Mdict['slackThetaEmptyComp'])
            SS.setELBOTerm('gammalnThetaEmptyComp',
                           Mdict['gammalnThetaEmptyComp'])
            SS.setELBOTerm('HrespEmptyComp', Mdict['HrespEmptyComp'])
        else:
            SS.setELBOTerm('gammalnSumTheta',
                           Mdict['gammalnSumTheta'], dims=None)
            SS.setELBOTerm('slackThetaRem',
                           Mdict['slackThetaRem'], dims=None)
            SS.setELBOTerm('gammalnThetaRem',
                           Mdict['gammalnThetaRem'].sum(), dims=None)

    if doPrecompMergeEntropy:
        if mPairIDs is None:
            raise NotImplementedError("TODO: all pairs for merges")
        m_Hresp = calcHrespForSpecificMergePairs(LP, Dslice, mPairIDs)
        if m_Hresp is not None:
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))

        m_sumLogPi = np.zeros(M)
        m_gammalnTheta = np.zeros(M)
        m_slackTheta = np.zeros(M)
        for m, (kA, kB) in enumerate(mPairIDs):
            # Merged topic: per-document theta is the sum of both topics.
            theta_vec = LP['theta'][:, kA] + LP['theta'][:, kB]
            ElogPi_vec = digamma(theta_vec) - LP['digammaSumTheta']
            m_gammalnTheta[m] = np.sum(gammaln(theta_vec))
            m_sumLogPi[m] = np.sum(ElogPi_vec)
            # slack = (Ndm - theta_dm) * E[log pi_dm]
            # ElogPi_vec is scaled in place; its sum was taken above.
            slack_vec = ElogPi_vec
            slack_vec *= -1 * (alphaEbeta[kA] + alphaEbeta[kB])
            m_slackTheta[m] = np.sum(slack_vec)
        SS.setMergeTerm('gammalnTheta', m_gammalnTheta, dims=('M'))
        SS.setMergeTerm('sumLogPi', m_sumLogPi, dims=('M'))
        SS.setMergeTerm('slackTheta', m_slackTheta, dims=('M'))

        # Uncomment this for verification of merge calculations.
        # for (kA, kB) in mPairIDs:
        #      self.verifySSForMergePair(Data, SS, LP, kA, kB)
        # .... end merge computations

    # Selection terms (computes doc-topic correlation)
    if mergePairSelection is not None:
        if mergePairSelection.count('corr') > 0:
            Tmat = LP['DocTopicCount']
            SS.setSelectionTerm('DocTopicPairMat',
                                np.dot(Tmat.T, Tmat), dims=('K', 'K'))
            SS.setSelectionTerm(
                'DocTopicSum', np.sum(Tmat, axis=0), dims='K')

    if trackDocUsage:
        # Track num of times a topic appears nontrivially in a doc
        DocUsage = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
        # Row-normalized theta, summed over documents.
        Pi = LP['theta'] / LP['theta'].sum(axis=1)[:, np.newaxis]
        SumPi = np.sum(Pi, axis=0)
        SS.setSelectionTerm('SumPi', SumPi, dims='K')
    return SS
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False, mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    r""" Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
        Its 'dim' attribute is used when present, else 'vocab_size'.
    LP : local param dict with either field
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
        or, in the sparse assignment case, fields
        spR : sparse matrix with K columns
        nnzPerRow : number of nonzero entries per row
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.
    mPairIDs : list of candidate merge pairs, or None
    mergePairSelection : not used by this implementation
    trackDocUsage : if truthy, store selection term 'DocUsageCount'
        (reads LP['resp'], so it needs the dense representation)

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when doPrecompEntropy is True
        * Hresp : 1D array, size K (or a scalar float)
            Vector of entropy contributions from each comp,
            as returned by calcELBO_NonlinearTerms.
            For dense resp the merge analogue below is
            -1 * \sum_{n=1}^N r[n] log r[n] per candidate pair.

        Also has optional Merge field when doPrecompMergeEntropy is True
        * Hresp : 1D array, size M
            Each term is scalar entropy of merge candidate

    Raises
    -------
    ValueError
        if merge terms are requested but LP has neither 'resp' nor 'spR'
    """
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0

    if 'resp' in LP:
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]

    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))

    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        # NOTE(review): exact-type check kept to preserve behavior;
        # a numpy float64 scalar takes the vector branch.
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            # With a single nonzero per row no merge term is stored.
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(
                    spR_csr=LP['spR'],
                    nnzPerRow=LP['nnzPerRow'],
                    mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))

    if trackDocUsage:
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')
    return SS
def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None,
                          doPrecompMergeEntropy=None,
                          mPairIDs=None,
                          trackDocUsage=0,
                          preselectroutine=None,
                          **kwargs):
    ''' Calculate sufficient statistics.

    Args
    -------
    Data : data object; get_dim() and nDoc are read here
    LP : local param dict with fields 'resp' (2D), 'ElogPi',
        'ElogPiRem'; merge terms also read 'theta' and
        'digammaSumTheta'; selection terms read 'DocTopicCount'
    doPrecompEntropy : if truthy, store the ELBO terms
    doPrecompMergeEntropy : if truthy, store merge terms for mPairIDs
    mPairIDs : list of (kA, kB) pairs, or None
    trackDocUsage : if truthy, store selection term 'DocUsageCount'
    preselectroutine : optional; if it contains 'corr', doc-topic
        correlation selection terms are stored

    Returns
    -------
    SS : SuffStatBag with fields nDoc, sumLogPi and sumLogPiRem

    Raises
    -------
    NotImplementedError
        if doPrecompMergeEntropy is truthy and mPairIDs is None
    '''
    _unused, K = LP['resp'].shape
    SS = SuffStatBag(K=K, D=Data.get_dim())
    SS.setField('nDoc', Data.nDoc, dims=None)
    SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
    SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

    if doPrecompEntropy:
        SS.setELBOTerm('ElogqZ', self.E_logqZ(Data, LP), dims='K')

        slackActive, slackRem = self.slack_NminusTheta(LP)
        SS.setELBOTerm('slackNminusTheta', slackActive, dims='K')
        SS.setELBOTerm('slackNminusTheta_Rem', slackRem, dims=None)

        glnSum, glnActive, glnRem = self.c_Dir_theta__parts(LP)
        SS.setELBOTerm('gammalnSumTheta', glnSum, dims=None)
        SS.setELBOTerm('gammalnTheta', glnActive, dims='K')
        SS.setELBOTerm('gammalnTheta_Rem', glnRem, dims=None)

    # Merge term caching
    if doPrecompMergeEntropy:
        if mPairIDs is None:
            raise NotImplementedError("TODO: all pairs for merges")
        pairDims = ('K', 'K')
        mergeElogqZ = self.calcElogqZForMergePairs(LP['resp'], Data, mPairIDs)
        SS.setMergeTerm('ElogqZ', mergeElogqZ, dims=pairDims)

        alphaEbeta = self.alpha_E_beta()
        tableShape = (SS.K, SS.K)
        mSumLogPi = np.zeros(tableShape)
        mGammaln = np.zeros(tableShape)
        mSlack = np.zeros(tableShape)
        for kA, kB in mPairIDs:
            # Merged topic: per-document theta is the sum of both topics.
            mergedTheta = LP['theta'][:, kA] + LP['theta'][:, kB]
            mergedElogPi = digamma(mergedTheta) - LP['digammaSumTheta']
            mGammaln[kA, kB] = np.sum(gammaln(mergedTheta))
            mSumLogPi[kA, kB] = np.sum(mergedElogPi)
            pairWeight = alphaEbeta[kA] + alphaEbeta[kB]
            mSlack[kA, kB] = -1 * np.sum(mergedElogPi * pairWeight)
        SS.setMergeTerm('gammalnTheta', mGammaln, dims=pairDims)
        SS.setMergeTerm('sumLogPi', mSumLogPi, dims=pairDims)
        SS.setMergeTerm('slackNminusTheta', mSlack, dims=pairDims)

    # Selection terms (doc-topic correlation)
    if preselectroutine is not None and preselectroutine.count('corr') > 0:
        docTopic = LP['DocTopicCount']
        SS.setSelectionTerm('DocTopicPairMat',
                            np.dot(docTopic.T, docTopic), dims=('K', 'K'))
        SS.setSelectionTerm('DocTopicSum',
                            np.sum(docTopic, axis=0), dims='K')

    if trackDocUsage:
        # Number of documents in which each topic's count exceeds 0.01.
        usageCount = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', usageCount, dims='K')
    return SS