def estimatePiAndDiv_ManyDocs(
        Data, obsModel, Mu,
        Pi=None,
        k=None,
        alpha=1.0,
        optim_method='frankwolfe',
        doActiveOnly=True,
        DivDataVec=None,
        smoothVec='lam',
        maxiter=100,
        minDiv=None):
    ''' Estimate doc-topic probs for many docs, with corresponding divergence

    For every document d in Data, fits a probability vector over the first
    k topics of Mu (via Frank-Wolfe or gradient descent), writes it into
    row d of Pi in place, and records the negative log-likelihood-style
    divergence of the document from its best convex combination of topics.

    Parameters
    ----------
    Data : bag-of-words dataset
        Must provide nDoc, doc_range, word_id, word_count, and
        getDocTypeCountMatrix() (all used below).
    obsModel : observation model
        Only obsModel.Prior.lam is read, and only when smoothVec == 'lam'.
    Mu : list of 1D arrays, or 2D array, with K topic rows
        Each row is a topic's distribution over the vocabulary.
        NOTE(review): rows are presumably normalized topic-word
        probabilities — confirm against the caller.
    Pi : 2D array, size D x K, optional
        Initial doc-topic values, updated in place. Defaults to all ones.
    k : int, optional
        Number of leading topics of Mu to use. Defaults to K = len(Mu).
    alpha : float
        Dirichlet-style concentration forwarded to the per-doc optimizers.
    optim_method : str
        'frankwolfe' uses estimatePiForDoc_frankwolfe on the active topic
        subset; any other value uses estimatePiForDoc_graddescent on all
        k topics.
    doActiveOnly : bool
        If True, per document only topics with Pi[d, j] > .01 (plus topic
        k-1, always forced in) are re-optimized.
    DivDataVec : 1D array, size D, optional
        Extra per-document term added to minDiv before clipping.
    smoothVec : str or 1D array
        If a string containing 'lam', smooth divergences with
        obsModel.Prior.lam; if an ndarray, smooth with that vector;
        otherwise no smoothing.
    maxiter : int
        Iteration cap forwarded to the Frank-Wolfe optimizer.
    minDiv : 1D array, size D, optional
        Output buffer for divergences; allocated if None.

    Returns
    -------
    Pi : 2D array, size D x K
    minDiv : 1D array, size D
        minDiv[d] : divergence from closest convex combination of topics in Mu
    '''
    K = len(Mu)
    if k is None:
        k = K
    # Use only the first k topics; stack list-of-rows into a 2D array.
    if isinstance(Mu, list):
        topics = np.vstack(Mu[:k])
    else:
        topics = Mu[:k]
    if Pi is None:
        # NOTE(review): rows of this default are NOT normalized (each sums
        # to K, not 1); callers apparently pass a normalized Pi when that
        # matters — confirm intended usage.
        Pi = np.ones((Data.nDoc, K))
    if minDiv is None:
        minDiv = np.zeros(Data.nDoc)
    for d in range(Data.nDoc):
        # Slice out document d's word ids and counts (CSR-style layout).
        start_d = Data.doc_range[d]
        stop_d = Data.doc_range[d + 1]
        wids_d = Data.word_id[start_d:stop_d]
        wcts_d = Data.word_count[start_d:stop_d]
        if doActiveOnly:
            # Restrict optimization to topics already carrying mass (> .01),
            # always forcing in topic k-1 — presumably the most recently
            # added topic; verify against the caller.
            activeIDs_d = np.flatnonzero(Pi[d, :k] > .01)
            if activeIDs_d[-1] != k - 1:
                activeIDs_d = np.append(activeIDs_d, k - 1)
        else:
            activeIDs_d = np.arange(k)
        assert activeIDs_d.size >= 1
        assert activeIDs_d.size <= k
        topics_d = topics[activeIDs_d, :]
        assert topics_d.shape[0] <= k
        # Warm start: give the forced-in last active topic 10% of the mass,
        # shrink the others to 90%, then renormalize to the simplex.
        initpiVec_d = Pi[d, activeIDs_d].copy()
        initpiVec_d[-1] = 0.1
        initpiVec_d[:-1] *= 0.9
        initpiVec_d /= initpiVec_d.sum()
        assert np.allclose(initpiVec_d.sum(), 1.0)
        if optim_method == 'frankwolfe':
            piVec_d = estimatePiForDoc_frankwolfe(
                ids_U=wids_d,
                cts_U=wcts_d,
                topics_KV=topics_d,
                initpiVec_K=initpiVec_d,
                alpha=alpha,
                seed=(k * 101 + d),  # deterministic per (k, doc) pair
                maxiter=maxiter,
                returnFuncValAndInfo=False,
                verbose=False)
            # Rescale the simplex solution so the updated entries carry the
            # total mass previously held by the active topics excluding the
            # forced-in last one. NOTE(review): this preserves the row sum
            # only when topic k-1 previously held ~zero mass — presumably
            # the incremental-topic-addition use case; confirm.
            piVec_d *= Pi[d, activeIDs_d[:-1]].sum()
            Pi[d, activeIDs_d] = piVec_d
        else:
            Pi[d, :k], _, _ = estimatePiForDoc_graddescent(
                ids_d=wids_d,
                cts_d=wcts_d,
                topics=topics,
                alpha=alpha,
                scale=1.0,
                piInit=None)
            # Gradient-descent path returns a full normalized doc-topic row.
            assert np.allclose(Pi[d, :k].sum(), 1.0)
        # Divergence: negative weighted log-prob of the doc's words under
        # its fitted mixture of topics.
        minDiv[d] = -1 * np.inner(
            wcts_d, np.log(np.dot(Pi[d, :k], topics[:, wids_d])))
    # Sanity cross-check of the per-doc loop via one dense D x V computation.
    # NOTE(review): this is O(D * V) work done purely for the assert, and is
    # stripped under `python -O`.
    minDiv_check = -1 * np.sum(
        Data.getDocTypeCountMatrix() * np.log(np.dot(Pi[:, :k], topics)),
        axis=1)
    assert np.allclose(minDiv, minDiv_check)
    # Optional smoothing: subtract a lam-weighted (or user-supplied-vector
    # weighted) log-likelihood term from every document's divergence.
    if isinstance(smoothVec, str) and smoothVec.count('lam'):
        minDiv -= np.dot(np.log(np.dot(Pi[:, :k], topics)), obsModel.Prior.lam)
    elif isinstance(smoothVec, np.ndarray):
        minDiv -= np.dot(np.log(np.dot(Pi[:, :k], topics)), smoothVec)
    if DivDataVec is not None:
        minDiv += DivDataVec
    # Divergences should be non-negative up to float error; clip the tiny
    # negative residue to exactly zero, in place.
    assert np.min(minDiv) > -1e-6
    np.maximum(minDiv, 0, out=minDiv)
    return Pi, minDiv
topics_KV=topics_KV, alpha=alpha, seed=d) fwDTC_K, fwLtrace = calcLocalParamsWithELBOTraceForSingleDoc( initDocTopicProb_K=fwpi_K, logLik_UK=logLik_UK, cts_U=cts_U, alphaEbeta_K=alphaEbeta_K, convThrLP=convThrLP, nCoordAscentItersLP=nCoordAscentItersLP) pylab.plot(fwLtrace, 'b-', label='frankwolfeMAP', linewidth=2); assert isMonotonicIncreasing(fwLtrace) natpi_K, _, _ = estimatePiForDoc_graddescent( ids_U=ids_U, cts_U=cts_U, topics_KV=topics_KV, alpha=alpha, ) natDTC_K, natLtrace = calcLocalParamsWithELBOTraceForSingleDoc( initDocTopicProb_K=natpi_K, logLik_UK=logLik_UK, cts_U=cts_U, alphaEbeta_K=alphaEbeta_K, convThrLP=convThrLP, nCoordAscentItersLP=nCoordAscentItersLP) pylab.plot(natLtrace, 'g-', label='naturalMAP', linewidth=2); assert isMonotonicIncreasing(natLtrace) restartDTC_K, restartLtrace = calcLocalParamsWithELBOTraceForSingleDoc( logLik_UK=logLik_UK, cts_U=cts_U,