def MakeData(self, N=10000):
    ''' Create a toy 2D dataset of 4 equally-sized Gaussian clusters.

    Each cluster is zero-mean with a highly elongated covariance;
    clusters 1-3 are rotations of cluster 0's covariance by k*pi/4.

    Parameters
    ----------
    N : int
        Number of samples drawn from each of the 4 components.

    Post Condition
    --------------
    Attributes set on self:
        Sigma : 2 x 2 x 4 array of component covariances
        Data : XData object holding the 4N x 2 stacked samples
        trueresp : 4N x 4 one-hot matrix of true component assignments
    '''
    baseCov = np.asarray([[100, 0], [0, 0.01]])
    K = 4
    Sigma = np.zeros((2, 2, K))
    Sigma[:, :, 0] = baseCov
    for k in range(1, K):
        Sigma[:, :, k] = RandUtil.rotateCovMat(baseCov, theta=k * np.pi / 4)
    self.Sigma = Sigma

    sampleChunks = []
    respChunks = []
    for k in range(Sigma.shape[2]):
        sampleChunks.append(RandUtil.mvnrand([0, 0], Sigma[:, :, k], N))
        onehot = np.zeros((N, K))
        onehot[:, k] = 1.0
        respChunks.append(onehot)
    self.Data = XData(X=np.vstack(sampleChunks))
    self.trueresp = np.vstack(respChunks)
def MakeData(self, N=10000):
    ''' Build a synthetic 2D dataset from 4 rotated-covariance Gaussians.

    NOTE(review): this method appears to be duplicated elsewhere in this
    file — confirm whether one copy can be removed.

    Parameters
    ----------
    N : int
        Samples to draw per component (4 components total).

    Post Condition
    --------------
    self.Sigma : 2 x 2 x 4 covariance stack.
    self.Data : XData with all 4N samples stacked row-wise.
    self.trueresp : 4N x 4 hard-assignment (one-hot) matrix.
    '''
    S1 = np.asarray([[100, 0], [0, 0.01]])
    nComp = 4
    Sigma = np.zeros((2, 2, nComp))
    # Component 0 uses the base covariance; the rest are rotations of it.
    Sigma[:, :, 0] = S1
    Sigma[:, :, 1] = RandUtil.rotateCovMat(S1, theta=np.pi / 4)
    Sigma[:, :, 2] = RandUtil.rotateCovMat(S1, theta=2 * np.pi / 4)
    Sigma[:, :, 3] = RandUtil.rotateCovMat(S1, theta=3 * np.pi / 4)
    self.Sigma = Sigma

    allX = []
    allResp = []
    for compID in range(Sigma.shape[2]):
        allX.append(RandUtil.mvnrand([0, 0], Sigma[:, :, compID], N))
        hardAssign = np.zeros((N, nComp))
        hardAssign[:, compID] = 1.0
        allResp.append(hardAssign)
    self.Data = XData(X=np.vstack(allX))
    self.trueresp = np.vstack(allResp)
def CreateToyDataFromMixModel(cls, seed=101, nDocTotal=None, nWordsPerDoc=None,
                              nWordsPerDocFunc=None, beta=None, topics=None,
                              **kwargs):
    ''' Generates BagOfWordsData dataset via mixture generative model.

    Each document gets exactly one topic (drawn from beta); all of the
    document's words are then drawn from that topic's word distribution.

    Returns
    -------
    Data : BagOfWordsData object
    '''
    from bnpy.util import RandUtil
    PRNG = np.random.RandomState(seed)
    K = topics.shape[0]
    V = topics.shape[1]
    # Normalize each topic row so it is a valid distribution over words.
    topics = topics / topics.sum(axis=1)[:, np.newaxis]
    assert K == beta.size

    doc_range = np.zeros(nDocTotal + 1)
    resp = np.zeros((nDocTotal, K))
    compIDs = list(range(K))
    idLists = []
    countLists = []
    # nSeen tracks the start index of the current doc within the
    # flattened, corpus-wide word_id / word_count arrays.
    nSeen = 0
    for d in range(nDocTotal):
        # Single topic assignment for this document.
        k = RandUtil.choice(compIDs, beta, PRNG)
        resp[d, k] = 1
        # countVec : V-vector, countVec[v] counts occurrences of word v.
        countVec = RandUtil.multinomial(nWordsPerDoc, topics[k, :], PRNG)
        activeIDs = np.flatnonzero(countVec > 0)
        activeCounts = countVec[activeIDs]
        assert np.allclose(activeCounts.sum(), nWordsPerDoc)
        idLists.append(activeIDs)
        countLists.append(activeCounts)
        doc_range[d] = nSeen
        nSeen += activeIDs.size

    # Flatten per-doc lists into corpus-wide arrays.
    word_id = np.hstack(idLists)
    word_count = np.hstack(countLists)
    doc_range[-1] = word_count.size

    TrueParams = dict(K=K, topics=topics, beta=beta, resp=resp)
    return BagOfWordsData(
        word_id, word_count, doc_range, V, TrueParams=TrueParams)
def CreateToyDataFromLDAModel(cls, seed=101, nDocTotal=None, nWordsPerDoc=None,
                              nWordsPerDocFunc=None, topic_prior=None,
                              topics=None, alpha=None, proba_K=None, **kwargs):
    ''' Generates BagOfWordsData dataset via LDA generative model.

    Each document draws topic proportions from a Dirichlet(topic_prior),
    assigns its tokens to topics, then draws words topic-by-topic.

    Returns
    -------
    Data : BagOfWordsData object
    '''
    if topic_prior is None:
        topic_prior = alpha * proba_K
    from bnpy.util import RandUtil
    K = topics.shape[0]
    V = topics.shape[1]
    # Normalize each topic row into a valid distribution over the vocab.
    topics = topics / topics.sum(axis=1)[:, np.newaxis]
    assert K == topic_prior.size

    doc_range = np.zeros(nDocTotal + 1)
    Pi = np.zeros((nDocTotal, K))
    idLists = []
    countLists = []
    respLists = []
    # nSeen tracks the start index of the current doc within the
    # flattened, corpus-wide word_id / word_count arrays.
    nSeen = 0
    for d in range(nDocTotal):
        # Per-doc seed: must be int-typed and nonzero for every d so each
        # document is independently reproducible.
        docseed = (seed * d + seed) % (100000000)
        PRNG = np.random.RandomState(docseed)
        # Document-specific topic appearance probabilities.
        Pi[d, :] = PRNG.dirichlet(topic_prior)
        if nWordsPerDocFunc is not None:
            nWordsPerDoc = nWordsPerDocFunc(PRNG)
        # Npercomp : K-vector, Npercomp[k] counts tokens assigned topic k.
        Npercomp = RandUtil.multinomial(nWordsPerDoc, Pi[d, :], PRNG)
        # countVec : V-vector, countVec[v] counts occurrences of word v.
        countVec = np.zeros(V)
        for k in range(K):
            countVec += RandUtil.multinomial(
                Npercomp[k], topics[k, :], PRNG)
        activeIDs = np.flatnonzero(countVec > 0)
        activeCounts = countVec[activeIDs]
        assert np.allclose(activeCounts.sum(), nWordsPerDoc)
        idLists.append(activeIDs)
        countLists.append(activeCounts)
        doc_range[d] = nSeen
        nSeen += activeIDs.size
        # Expected (unnormalized) responsibilities for this doc's words.
        respLists.append(
            (topics[:, activeIDs] * Pi[d, :][:, np.newaxis]).T)

    # Flatten per-doc lists into corpus-wide arrays.
    word_id = np.hstack(idLists)
    word_count = np.hstack(countLists)
    doc_range[-1] = word_count.size

    # Normalize responsibilities row-wise so each token's resp sums to 1.
    resp = np.vstack(respLists)
    resp /= resp.sum(axis=1)[:, np.newaxis]
    TrueParams = dict(
        K=K, topics=topics,
        beta=topic_prior / topic_prior.sum(),
        topic_prior=topic_prior, resp=resp)
    return BagOfWordsData(
        word_id, word_count, doc_range, V, TrueParams=TrueParams)