def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    """Train on a fully hand-built corpus where K equals Q and report errors.

    Topic proportions are assembled by dropping three lumps of 1/3 mass on
    randomly chosen topics per document; the features are then derived
    from those proportions through the pseudo-inverse of a random (K, F)
    matrix. The squared word-count reconstruction error (divided by the
    number of documents) is printed after 1 and after 130 training
    iterations.
    """
    print ("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: tied to the code that generates the vocabulary
    K = 6    # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # Six-topic vocabulary supplied by the project helper.
    vocab = makeSixTopicVocab(T)

    # Hand-build the topic proportions. The X filled in here is replaced
    # further down, but its random draws are kept so that the random
    # stream (and hence every later sample) is unchanged.
    lmda = np.zeros((D, K))
    X = np.zeros((D, F))
    featureDraws = int(F / 3)
    for d in range(D):
        for _ in range(3):
            lmda[d, rd.randint(K)] += 1. / 3
        for _ in range(featureDraws):
            X[d, rd.randint(F)] += 1

    # Derive features that map onto lmda through a random matrix A.
    A = rd.random((K, F))
    X = ssp.csr_matrix(lmda.dot(la.pinv(A).T))
    tpcs = lmda

    # Expected word counts, scaled by document length and truncated to ints.
    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = ssp.csr_matrix(np.array(W, dtype=np.int32))

    def reconstructionError(model, inferred):
        """Squared difference between W and its reconstruction, per document."""
        tpcs_inf = rowwise_softmax(safe_log(inferred.expLmda))
        W_inf = np.array(
            tpcs_inf.dot(model.vocab) * inferred.docLen[:, np.newaxis],
            dtype=np.int32)
        return np.sum(np.square(W - W_inf)) / D

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    trainedState, queryState = train(modelState, X, W, logInterval=1, iterations=1)
    priorReconsError = reconstructionError(trainedState, queryState)

    trainedState, queryState = train(modelState, X, W, logInterval=1, plotInterval=100, iterations=130)
    finalReconsError = reconstructionError(trainedState, queryState)

    print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    print ("Model Driven: Final Reconstruction Error: %f" % (finalReconsError,))

    print("End of Test")
def testLikelihoodOnModelDerivedExample(self):
    """Print cross-validated train/query log-likelihoods on sampled data.

    Data is drawn via ``self._sampleFromModel()``. For each of ``folds``
    folds a fresh model is trained on ``D - foldSize`` documents and then
    queried on the following ``foldSize`` documents; document indices wrap
    around modulo ``D``. Nothing is asserted: the likelihoods are printed.
    """
    print("Cross-validated likelihoods on model-derived example")
    rd.seed(0xBADB055)  # Global init for repeatable test

    modelState, _, _, _, X, W = self._sampleFromModel()
    D = X.shape[0]
    T, K, Q, F, P = modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

    # Create the cross-validation folds. The fold size is the integer
    # ceiling of D / folds, computed with floor division so that it is
    # always an int (usable as an index bound) and follows `folds`
    # instead of a second hard-coded copy of the fold count.
    folds = 5
    foldSize = -(-D // folds)
    querySize = foldSize
    trainSize = D - querySize

    for fold in range(folds):
        start = fold * foldSize
        end = start + trainSize

        # Indices wrap around so every fold has the same sizes.
        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        # Start every fold from a freshly initialised model.
        modelState = newVbModelState(K, Q, F, P, T)
        modelState, queryState = train(modelState, X_train, W_train, iterations=100, logInterval=10, plotInterval=100)
        trainSetLikely = log_likelihood(modelState, X_train, W_train, queryState)

        queryState = query(modelState, X_query, W_query, iterations=50, epsilon=0.001, logInterval=10, plotInterval=100)
        querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)

        print("Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainSetLikely, querySetLikely))
        print("")

    print("End of Test")
def _testInferenceFromHandcraftedExample(self):
    """Train on a partially hand-crafted corpus and print recovery errors.

    Random factors U (K, Q), Y (Q, P) and V (F, P) are drawn; binary
    low-dimensional features ``X_low`` (each entry 1 with probability 0.3)
    give the observed features ``X_low.dot(V.T)`` and the topic scores
    ``X_low.dot(Y.T)``. Word counts are the softmaxed scores times the
    vocabulary, scaled by Poisson document lengths and truncated to
    integers. After 10 training iterations the method prints the mean
    squared per-element differences for topics and vocabulary, and the
    relative absolute reconstruction error of the documents.
    """
    print ("Partially hand-crafted example")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 10   # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)

    avgWordsPerDoc = 500

    # Determine what U, Y and V should be. The implied feature-to-topic
    # map is U.dot(Y).dot(V.T); it is not needed to generate the data, so
    # it is no longer materialised here.
    U = rd.random((K, Q))
    Y = rd.random((Q, P))
    V = rd.random((F, P))

    # The six-topic vocabulary comes from the project helper.
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    X_low = np.array([1 if rd.random() < 0.3 else 0 for _ in range(D * P)]).reshape(D, P)
    X = ssp.csr_matrix(X_low.dot(V.T))

    lmda_low = X_low.dot(Y.T)
    print ("lmda_low.mean() = %f" % (lmda_low.mean()))
    tpcs = rowwise_softmax(lmda_low)

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, plotInterval=10, iterations=10)

    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)

    # np.mean divides by the number of compared elements, which is what
    # the printed heading promises. Dividing by len(...) only divided by
    # the row count (and, for the vocabulary, by Q rows although the
    # compared matrices have K rows).
    topicError = np.mean(np.square(tpcs.dot(U.T) - tpcs_inf))
    vocabError = np.mean(np.square(U.dot(vocab) - trainedState.vocab))

    # Densify once and reuse for both the numerator and the denominator.
    W_dense = W.todense()
    docError = np.sum(np.abs(W_dense - W_inf)) / np.sum(W_dense)

    print("Handcrafted Test-Case")
    print("=====================================================================")
    print("Average, squared, per-element difference between true and estimated:")
    print(" Topic Distribution: %f" % (topicError,))
    print(" Vocab Distribution: %f" % (vocabError,))
    print("Average absolute difference between true and reconstructed documents")
    print(" Documents: %f" % (docError,))

    print("End of Test")
def _testInferenceOnModelDerivedData(self):
    """Train on data sampled from the model and print reconstruction errors.

    The corpus comes from ``self._sampleFromModel()``. The squared
    word-count reconstruction error (divided by the number of documents)
    is printed after 1 and after 200 training iterations, followed by the
    true and inferred topic rows of the document at index 4.
    """
    print("Model derived example")
    rd.seed(0xBADB055)  # Global init for repeatable test

    modelState, tpcs, _, _, X, W = self._sampleFromModel()
    D = X.shape[0]

    def infer(model, inferred):
        """Return (inferred topic proportions, per-document squared error)."""
        # Plain np.log is used here; the original author's note questions
        # whether a "safe" log is needed at this point.
        proportions = rowwise_softmax(np.log(inferred.expLmda))
        counts = np.array(
            proportions.dot(model.vocab) * inferred.docLen[:, np.newaxis],
            dtype=np.int32)
        return proportions, np.sum(np.square(W - counts)) / D

    # Baseline: a single training iteration.
    trainedState, queryState = train(modelState, X, W, logInterval=1, iterations=1)
    _, priorReconsError = infer(trainedState, queryState)

    # Full run.
    trainedState, queryState = train(modelState, X, W, logInterval=1, plotInterval=50, iterations=200)
    tpcs_inf, finalReconsError = infer(trainedState, queryState)

    print("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError, ))
    print("Model Driven: Final Reconstruction Error: %f" % (finalReconsError, ))

    # Eyeball one document: true versus inferred topic proportions.
    print(str(tpcs[4, :]))
    print(str(tpcs_inf[4, :]))

    print("End of Test")
def testLikelihoodOnModelDerivedExample(self):
    """Report train-set and query-set log-likelihoods over rotating folds.

    A corpus is sampled with ``self._sampleFromModel()``. Each of the
    ``folds`` rounds trains a newly created model on ``trainSize``
    consecutive documents (wrapping modulo ``D``), queries it on the next
    ``querySize`` documents, and prints both log-likelihoods. The method
    makes no assertions.
    """
    print("Cross-validated likelihoods on model-derived example")
    rd.seed(0xBADB055)  # Global init for repeatable test

    modelState, _, _, _, X, W = self._sampleFromModel()
    D, T, K, Q, F, P = X.shape[
        0], modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

    # Create the cross-validation folds.
    folds = 5
    # Integer ceiling of D / folds. Floor division keeps the result an
    # int, so np.arange below always yields valid integer indices, and
    # the divisor tracks `folds` rather than repeating the literal 5.
    foldSize = -(-D // folds)
    querySize = foldSize
    trainSize = D - querySize

    for fold in range(folds):
        start = fold * foldSize
        end = start + trainSize

        # Modulo D wraps the index windows around the end of the corpus.
        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        # A fresh model per fold, so no fold sees another fold's fit.
        modelState = newVbModelState(K, Q, F, P, T)
        modelState, queryState = train(modelState,
                                       X_train,
                                       W_train,
                                       iterations=100,
                                       logInterval=10,
                                       plotInterval=100)
        trainSetLikely = log_likelihood(modelState, X_train, W_train,
                                        queryState)

        queryState = query(modelState,
                           X_query,
                           W_query,
                           iterations=50,
                           epsilon=0.001,
                           logInterval=10,
                           plotInterval=100)
        querySetLikely = log_likelihood(modelState, X_query, W_query,
                                        queryState)

        print(
            "Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f"
            % (fold, trainSetLikely, querySetLikely))
        print("")

    print("End of Test")
def _testInferenceOnModelDerivedData(self):
    """Fit the model to self-sampled data and print how well it reconstructs.

    Two training runs are made from the same initial state: one iteration
    (the "prior" baseline) and 200 iterations (the "final" result). For
    each, the word counts are rebuilt from the inferred topics and the
    summed squared difference to the true counts, divided by the document
    count, is recorded. Both errors and the true/inferred topic rows of
    document 4 are printed.
    """
    print("Model derived example")
    rd.seed(0xBADB055)  # Global init for repeatable test

    modelState, tpcs, _, _, X, W = self._sampleFromModel()
    D = X.shape[0]

    # Keyword arguments for the baseline run followed by the full run.
    schedule = (
        dict(logInterval=1, iterations=1),
        dict(logInterval=1, plotInterval=50, iterations=200),
    )

    reconsErrors = []
    tpcs_inf = None
    for trainArgs in schedule:
        trainedState, queryState = train(modelState, X, W, **trainArgs)

        # Uses plain np.log (the original note asks "why safe?").
        tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
        expectedCounts = tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis]
        W_inf = np.array(expectedCounts, dtype=np.int32)
        reconsErrors.append(np.sum(np.square(W - W_inf)) / D)

    priorReconsError, finalReconsError = reconsErrors

    print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    print ("Model Driven: Final Reconstruction Error: %f" % (finalReconsError,))

    # tpcs_inf now holds the proportions from the final (200-iteration) run.
    print (str(tpcs[4, :]))
    print (str(tpcs_inf[4, :]))

    print("End of Test")
def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    """Exercise training on a fully hand-made data set with K == Q.

    Every document receives three increments of 1/3 on randomly picked
    topics; features are obtained by multiplying those proportions with
    the transposed pseudo-inverse of a random (K, F) matrix. The model is
    trained for 1 and then 130 iterations from the same initial state, and
    the per-document squared reconstruction error of each run is printed.
    """
    print("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    # Problem dimensions.
    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6  # Topics: fixed by the vocabulary-generating code
    K = 6  # Observed topics
    P = 8  # Features
    F = 12  # Observed features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    vocab = makeSixTopicVocab(T)

    # Topic proportions, built one document at a time. The feature matrix
    # populated alongside is discarded below; its random draws are retained
    # so the random sequence stays exactly as before.
    lmda = np.zeros((D, K))
    X = np.zeros((D, F))
    for doc in range(D):
        for _ in range(3):
            topic = rd.randint(K)
            lmda[doc, topic] += 1. / 3
        for _ in range(int(F / 3)):
            feature = rd.randint(F)
            X[doc, feature] += 1

    A = rd.random((K, F))
    pinvA = la.pinv(A)
    X = ssp.csr_matrix(lmda.dot(pinvA.T))
    tpcs = lmda

    # Word counts: expected term mass times document length, truncated.
    docLens = rd.poisson(avgWordsPerDoc, (D, ))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)
    W = ssp.csr_matrix(W)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    # Baseline run first, then the full run.
    schedule = (
        dict(logInterval=1, iterations=1),
        dict(logInterval=1, plotInterval=100, iterations=130),
    )
    reconsErrors = []
    for trainArgs in schedule:
        trainedState, queryState = train(modelState, X, W, **trainArgs)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        scaled = tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis]
        W_inf = np.array(scaled, dtype=np.int32)
        reconsErrors.append(np.sum(np.square(W - W_inf)) / D)

    priorReconsError, finalReconsError = reconsErrors

    print("Model Driven: Prior Reconstruction Error: %f" %
          (priorReconsError, ))
    print("Model Driven: Final Reconstruction Error: %f" %
          (finalReconsError, ))

    print("End of Test")
def _testInferenceFromHandcraftedExample(self):
    """Fit the model to a partly hand-crafted corpus and print error metrics.

    Three random factors are drawn: U (K, Q), Y (Q, P) and V (F, P).
    Binary latent features (1 with probability 0.3) are mapped to observed
    features through ``V.T`` and to topic scores through ``Y.T``; the
    softmaxed scores, multiplied by the vocabulary and by Poisson document
    lengths, are truncated to integer word counts. The model is trained
    for 10 iterations, after which the mean squared per-element error of
    the topic and vocabulary estimates and the relative absolute error of
    the reconstructed documents are printed.
    """
    print("Partially hand-crafted example")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6  # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 10  # Observed topics
    P = 8  # Features
    F = 12  # Observed features
    D = 200  # Sample documents (each with associated features)

    avgWordsPerDoc = 500

    # Random factors. Their product U.dot(Y).dot(V.T) was previously
    # computed into an unused local; only the factors themselves are used.
    U = rd.random((K, Q))
    Y = rd.random((Q, P))
    V = rd.random((F, P))

    # Six-topic vocabulary from the project helper.
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    X_low = np.array([1 if rd.random() < 0.3 else 0
                      for _ in range(D * P)]).reshape(D, P)
    X = ssp.csr_matrix(X_low.dot(V.T))

    lmda_low = X_low.dot(Y.T)
    print("lmda_low.mean() = %f" % (lmda_low.mean()))
    tpcs = rowwise_softmax(lmda_low)

    docLens = rd.poisson(avgWordsPerDoc, (D, ))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)
    (trainedState, queryState) = train(modelState,
                                       X,
                                       W,
                                       logInterval=1,
                                       plotInterval=10,
                                       iterations=10)

    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                     queryState.docLen[:, np.newaxis],
                     dtype=np.int32)

    # The heading below advertises per-element averages, so average over
    # every compared element. The earlier len(...) divisors counted rows
    # only, and len(vocab) is Q even though K rows are being compared.
    topicError = np.mean(np.square(tpcs.dot(U.T) - tpcs_inf))
    vocabError = np.mean(np.square(U.dot(vocab) - trainedState.vocab))

    # Convert the sparse counts to dense form a single time.
    W_dense = W.todense()
    docError = np.sum(np.abs(W_dense - W_inf)) / np.sum(W_dense)

    print("Handcrafted Test-Case")
    print(
        "====================================================================="
    )
    print(
        "Average, squared, per-element difference between true and estimated:"
    )
    print(" Topic Distribution: %f" % (topicError, ))
    print(" Vocab Distribution: %f" % (vocabError, ))
    print(
        "Average absolute difference between true and reconstructed documents"
    )
    print(" Documents: %f" % (docError, ))

    print("End of Test")