def testLikelihoodOnModelDerivedExample(self):
    """Print cross-validated train-set and query-set log likelihoods.

    A dataset is drawn with ``sampleFromModel()``. For each fold a fresh
    model state is created, trained on the training rows, and then used to
    query the held-out rows; the two log likelihoods are printed. Nothing is
    asserted, so the output has to be inspected by hand.
    """
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test
    modelState, _, _, _, X, W = sampleFromModel()
    D = X.shape[0]
    T, K, Q, F, P = modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

    # Create the cross-validation folds.
    # The fold size is the integer ceiling of D / folds. Doing it in integer
    # arithmetic keeps it tied to `folds` (not a literal 5) and guarantees the
    # np.arange() calls below yield integer arrays that are valid row indices.
    folds = 5
    foldSize = (D + folds - 1) // folds
    querySize = foldSize
    trainSize = D - querySize

    for fold in range(folds):
        start = fold * foldSize
        end = start + trainSize

        # Indices wrap around modulo D, so the training window and the
        # query window together cover every document exactly once.
        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        # Start every fold from a freshly initialised model
        modelState = newVbModelState(K, Q, F, P, T)
        modelState, queryState = train(modelState, X_train, W_train, iterations=100, logInterval=10, plotInterval=100)
        trainSetLikely = log_likelihood(modelState, X_train, W_train, queryState)

        queryState = query(modelState, X_query, W_query, iterations=50, epsilon=0.001, logInterval=10, plotInterval=100)
        querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)

        print("Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainSetLikely, querySetLikely))

    print("End of Test")
def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    """Train on a fully hand-built dataset where K equals Q.

    Prints the per-document squared reconstruction error of the word counts
    after a single training iteration and again after a 130-iteration run.
    Nothing is asserted.
    """
    print ("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 6    # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Topic proportions: three draws of 1/3 per document, possibly landing on
    # the same topic. The feature draws into `randomFeatures` are discarded
    # (X is rebuilt from the pseudo-inverse below) but are kept because they
    # advance the random stream.
    lmda = np.zeros((D, K))
    randomFeatures = np.zeros((D, F))
    featureDraws = int(F / 3)
    for doc in range(D):
        for _ in range(3):
            lmda[doc, rd.randint(K)] += 1. / 3
        for _ in range(featureDraws):
            randomFeatures[doc, rd.randint(F)] += 1

    # Features are derived from the topic proportions via pinv(A)
    A = rd.random((K, F))
    X = ssp.csr_matrix(lmda.dot(la.pinv(A).T))

    # Word counts: expected counts scaled by document length, then truncated
    tpcs = lmda
    docLens = rd.poisson(avgWordsPerDoc, (D,))
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    def reconstructCounts(trained, inferred):
        # Rebuild integer word counts from the inferred topic distribution
        topicDist = rowwise_softmax(safe_log(inferred.expLmda))
        scaled = topicDist.dot(trained.vocab) * inferred.docLen[:, np.newaxis]
        return np.array(scaled, dtype=np.int32)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    trainedState, queryState = train (modelState, X, W, logInterval=1, iterations=1)
    W_inf = reconstructCounts(trainedState, queryState)
    priorReconsError = np.sum(np.square(W - W_inf)) / D

    trainedState, queryState = train (modelState, X, W, logInterval=1, plotInterval=100, iterations=130)
    W_inf = reconstructCounts(trainedState, queryState)

    print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    finalReconsError = np.sum(np.square(W - W_inf)) / D
    print ("Model Driven: Final Reconstruction Error: %f" % (finalReconsError,))

    print("End of Test")
def _testInferenceFromHandcraftedExample(self):
    """Train on a partially hand-built dataset and print error summaries.

    U, Y and V are drawn at random, features and word counts are generated
    from them and the six-topic vocabulary, and the model is trained for ten
    iterations. The differences between generated and inferred quantities
    are printed; nothing is asserted.
    """
    print ("Partially hand-crafted example")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 10   # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # Determine what A, U, Y and V should be
    U = rd.random((K, Q))
    Y = rd.random((Q, P))
    V = rd.random((F, P))
    A = U.dot(Y).dot(V.T)  # NOTE: not referenced again in this test

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    indicatorFlags = [int(rd.random() < 0.3) for _ in range(D * P)]
    X_low = np.array(indicatorFlags).reshape(D, P)
    X = ssp.csr_matrix(X_low.dot(V.T))

    lmda_low = X_low.dot(Y.T)
    print ("lmda_low.mean() = %f" % (lmda_low.mean()))
    tpcs = rowwise_softmax (lmda_low)

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)
    trainedState, queryState = train (modelState, X, W, logInterval=1, plotInterval=10, iterations=10)

    tpcs_inf = rowwise_softmax(queryState.lmda)
    scaledCounts = tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis]
    W_inf = np.array(scaledCounts, dtype=np.int32)

    print("Handcrafted Test-Case")
    print("=====================================================================")
    print("Average, squared, per-element difference between true and estimated:")

    topicError = np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs)
    print(" Topic Distribution: %f" % (topicError,))

    vocabError = np.sum(np.square(U.dot(vocab) - trainedState.vocab)) / len(vocab)
    print(" Vocab Distribution: %f" % (vocabError,))

    print("Average absolute difference between true and reconstructed documents")
    documentError = np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense())
    print(" Documents: %f" % (documentError,))

    print("End of Test")
def testLikelihoodOnModelDerivedExample(self):
    """Print cross-validated train-set and query-set log likelihoods.

    A dataset is drawn with ``sampleFromModel()``. For each fold a fresh
    model state is created, trained on the training rows, and then used to
    query the held-out rows; the two log likelihoods are printed. Nothing is
    asserted, so the output has to be inspected by hand.
    """
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test
    modelState, _, _, _, X, W = sampleFromModel()
    D = X.shape[0]
    T, K, Q, F, P = modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

    # Create the cross-validation folds.
    # The fold size is the integer ceiling of D / folds. Doing it in integer
    # arithmetic keeps it tied to `folds` (not a literal 5) and guarantees the
    # np.arange() calls below yield integer arrays that are valid row indices.
    folds = 5
    foldSize = (D + folds - 1) // folds
    querySize = foldSize
    trainSize = D - querySize

    for fold in range(folds):
        start = fold * foldSize
        end = start + trainSize

        # Indices wrap around modulo D, so the training window and the
        # query window together cover every document exactly once.
        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        # Start every fold from a freshly initialised model
        modelState = newVbModelState(K, Q, F, P, T)
        modelState, queryState = train(modelState,
                                       X_train,
                                       W_train,
                                       iterations=100,
                                       logInterval=10,
                                       plotInterval=100)
        trainSetLikely = log_likelihood(modelState, X_train, W_train,
                                        queryState)

        queryState = query(modelState,
                           X_query,
                           W_query,
                           iterations=50,
                           epsilon=0.001,
                           logInterval=10,
                           plotInterval=100)
        querySetLikely = log_likelihood(modelState, X_query, W_query,
                                        queryState)

        print(
            "Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f"
            % (fold, trainSetLikely, querySetLikely))

    print("End of Test")
def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    """Train on a fully hand-built dataset where K equals Q.

    Prints the per-document squared reconstruction error of the word counts
    after a single training iteration and again after a 130-iteration run.
    Nothing is asserted.
    """
    print("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 6    # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Topic proportions: three draws of 1/3 per document, possibly landing on
    # the same topic. The feature draws into `randomFeatures` are discarded
    # (X is rebuilt from the pseudo-inverse below) but are kept because they
    # advance the random stream.
    lmda = np.zeros((D, K))
    randomFeatures = np.zeros((D, F))
    featureDraws = int(F / 3)
    for doc in range(D):
        for _ in range(3):
            lmda[doc, rd.randint(K)] += 1. / 3
        for _ in range(featureDraws):
            randomFeatures[doc, rd.randint(F)] += 1

    # Features are derived from the topic proportions via pinv(A)
    A = rd.random((K, F))
    X = ssp.csr_matrix(lmda.dot(la.pinv(A).T))

    # Word counts: expected counts scaled by document length, then truncated
    tpcs = lmda
    docLens = rd.poisson(avgWordsPerDoc, (D, ))
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    def reconstructCounts(trained, inferred):
        # Rebuild integer word counts from the inferred topic distribution
        topicDist = rowwise_softmax(safe_log(inferred.expLmda))
        scaled = topicDist.dot(trained.vocab) * inferred.docLen[:, np.newaxis]
        return np.array(scaled, dtype=np.int32)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    trainedState, queryState = train(modelState,
                                     X,
                                     W,
                                     logInterval=1,
                                     iterations=1)
    W_inf = reconstructCounts(trainedState, queryState)
    priorReconsError = np.sum(np.square(W - W_inf)) / D

    trainedState, queryState = train(modelState,
                                     X,
                                     W,
                                     logInterval=1,
                                     plotInterval=100,
                                     iterations=130)
    W_inf = reconstructCounts(trainedState, queryState)

    print("Model Driven: Prior Reconstruction Error: %f" %
          (priorReconsError, ))
    finalReconsError = np.sum(np.square(W - W_inf)) / D
    print("Model Driven: Final Reconstruction Error: %f" %
          (finalReconsError, ))

    print("End of Test")
def sampleFromModel(D=200, T=100, K=10, Q=6, F=12, P=8, avgWordsPerDoc=500):
    '''
    Generate a synthetic dataset by sampling from the model.

    Params:
    D - number of sample documents (each with associated features)
    T - vocabulary size, the number of "terms" (historically documented as
        needing to be a square number, though nothing here depends on it)
    K - observed topics
    Q - latent topics
    F - observed features
    P - latent features
    avgWordsPerDoc - Poisson mean used to draw each document's length

    Returns:
    modelState - the object returned by newVbModelState(K, Q, F, P, T)
    tpcs       - the matrix of per-document topic distributions
    vocab      - the K x T matrix of per-topic word distributions
    docLens    - the vector of document lengths (float32)
    X          - the D x F side-information matrix (sparse CSR)
    W          - the D x T word-count matrix (sparse CSR, int32)
    '''

    # Generate vocab: every topic row is one symmetric Dirichlet draw
    beta = 0.1
    dirichletParam = np.full((T, ), beta)
    vocab = np.zeros((K, T))
    for topicRow in vocab:
        topicRow[:] = rd.dirichlet(dirichletParam)

    def absDraw(mean, rowScale, colScale):
        # Element-wise absolute value of one matrix_normal draw around `mean`
        numRows, numCols = mean.shape
        return np.abs(
            matrix_normal(mean, rowScale * np.eye(numCols),
                          colScale * np.eye(numRows)))

    # Generate U, then Y, then V, then A (the draw order is significant for
    # the random stream)
    tau = 0.1
    tauSq = tau**2  # For the K-dimensions we use tau squared
    U = absDraw(np.zeros((K, Q)), 5.0, tauSq)
    Y = absDraw(np.zeros((Q, P)), 5.0, 5.0)
    V = absDraw(np.zeros((F, P)), 5.0, 5.0)
    A = absDraw(U.dot(Y).dot(V.T), 5.0, tauSq)

    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the twitter example: twitter is binary, this
    # may not be)
    featuresDist = [1. / P] * P
    maxNonZeroFeatures = 3

    X_low = np.zeros((D, P), dtype=np.float32)
    for featureRow in X_low:
        featureRow[:] = rd.multinomial(maxNonZeroFeatures, featuresDist)
    X = ssp.csr_matrix(np.round(X_low.dot(V.T)))

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax(X.dot(A.T))

    docLens = rd.poisson(avgWordsPerDoc, (D, )).astype(np.float32)
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    # Initialise the model
    modelState = newVbModelState(K, Q, F, P, T)

    # Return the initialised model, the true parameter values, and the
    # generated observations
    return modelState, tpcs, vocab, docLens, X, W
def _testInferenceFromHandcraftedExample(self):
    """Train on a partially hand-built dataset and print error summaries.

    U, Y and V are drawn at random, features and word counts are generated
    from them and the six-topic vocabulary, and the model is trained for ten
    iterations. The differences between generated and inferred quantities
    are printed; nothing is asserted.
    """
    print("Partially hand-crafted example")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 10   # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # Determine what A, U, Y and V should be
    U = rd.random((K, Q))
    Y = rd.random((Q, P))
    V = rd.random((F, P))
    A = U.dot(Y).dot(V.T)  # NOTE: not referenced again in this test

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    indicatorFlags = [int(rd.random() < 0.3) for _ in range(D * P)]
    X_low = np.array(indicatorFlags).reshape(D, P)
    X = ssp.csr_matrix(X_low.dot(V.T))

    lmda_low = X_low.dot(Y.T)
    print("lmda_low.mean() = %f" % (lmda_low.mean()))
    tpcs = rowwise_softmax(lmda_low)

    docLens = rd.poisson(avgWordsPerDoc, (D, ))
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)
    trainedState, queryState = train(modelState,
                                     X,
                                     W,
                                     logInterval=1,
                                     plotInterval=10,
                                     iterations=10)

    tpcs_inf = rowwise_softmax(queryState.lmda)
    scaledCounts = tpcs_inf.dot(
        trainedState.vocab) * queryState.docLen[:, np.newaxis]
    W_inf = np.array(scaledCounts, dtype=np.int32)

    print("Handcrafted Test-Case")
    print(
        "====================================================================="
    )
    print(
        "Average, squared, per-element difference between true and estimated:"
    )

    topicError = np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs)
    print(" Topic Distribution: %f" % (topicError, ))

    vocabError = np.sum(np.square(U.dot(vocab) -
                                  trainedState.vocab)) / len(vocab)
    print(" Vocab Distribution: %f" % (vocabError, ))

    print(
        "Average absolute difference between true and reconstructed documents"
    )
    documentError = np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense())
    print(" Documents: %f" % (documentError, ))

    print("End of Test")
def sampleFromModel(D=200, T=100, K=10, Q=6, F=12, P=8, avgWordsPerDoc = 500):
    '''
    Generate a synthetic dataset by sampling from the model.

    Params:
    D - number of sample documents (each with associated features)
    T - vocabulary size, the number of "terms" (historically documented as
        needing to be a square number, though nothing here depends on it)
    K - observed topics
    Q - latent topics
    F - observed features
    P - latent features
    avgWordsPerDoc - Poisson mean used to draw each document's length

    Returns:
    modelState - the object returned by newVbModelState(K, Q, F, P, T)
    tpcs       - the matrix of per-document topic distributions
    vocab      - the K x T matrix of per-topic word distributions
    docLens    - the vector of document lengths (float32)
    X          - the D x F side-information matrix (sparse CSR)
    W          - the D x T word-count matrix (sparse CSR, int32)
    '''

    # Generate vocab: every topic row is one symmetric Dirichlet draw
    beta = 0.1
    dirichletParam = np.full((T,), beta)
    vocab = np.zeros((K,T))
    for topicRow in vocab:
        topicRow[:] = rd.dirichlet(dirichletParam)

    def absDraw(mean, rowScale, colScale):
        # Element-wise absolute value of one matrix_normal draw around `mean`
        numRows, numCols = mean.shape
        return np.abs(matrix_normal(mean, rowScale * np.eye(numCols), colScale * np.eye(numRows)))

    # Generate U, then Y, then V, then A (the draw order is significant for
    # the random stream)
    tau = 0.1
    tauSq = tau**2  # For the K-dimensions we use tau squared
    U = absDraw(np.zeros((K,Q)), 5.0, tauSq)
    Y = absDraw(np.zeros((Q,P)), 5.0, 5.0)
    V = absDraw(np.zeros((F,P)), 5.0, 5.0)
    A = absDraw(U.dot(Y).dot(V.T), 5.0, tauSq)

    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the twitter example: twitter is binary, this
    # may not be)
    featuresDist = [1. / P] * P
    maxNonZeroFeatures = 3

    X_low = np.zeros((D,P), dtype=np.float32)
    for featureRow in X_low:
        featureRow[:] = rd.multinomial(maxNonZeroFeatures, featuresDist)
    X = ssp.csr_matrix(np.round(X_low.dot(V.T)))

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax (X.dot(A.T))

    docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
    expectedCounts = tpcs.dot(vocab) * docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(expectedCounts, dtype=np.int32))  # truncate word counts to integers

    # Initialise the model
    modelState = newVbModelState(K, Q, F, P, T)

    # Return the initialised model, the true parameter values, and the
    # generated observations
    return modelState, tpcs, vocab, docLens, X, W