def testInferenceFromModelDerivedExample(self):
    """
    Generate a synthetic corpus directly from the model's own generative
    story (U, V, A sampled from matrix-normals, topics via softmax of X.A),
    then train and report the per-document squared reconstruction error
    after 1 iteration (prior) and after 200 iterations (final).
    """
    rd.seed(0xDEADD0C5)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    K = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    F = 8    # Features
    P = 6    # Size of principle subspace
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # Generate vocab: one sparse Dirichlet draw per topic over the T terms
    beta = 0.01
    betaVec = np.ndarray((T,))
    betaVec.fill(beta)
    vocab = np.zeros((K, T))
    for k in range(K):
        vocab[k, :] = rd.dirichlet(betaVec)

    # Generate U, then V, then A
    vStdev = 5.0
    uStdev = 5.0  # NOTE(review): not squared below, unlike vStdev**2 / aStdev**2 — confirm intended
    aStdev = 2.0
    tau = 0.1

    U = matrix_normal(np.zeros((F, P)), np.eye(P), uStdev * np.eye(F))
    V = matrix_normal(np.zeros((P, K)), tau**2 * np.eye(K), vStdev**2 * np.eye(P))
    # BUG FIX: A was sampled twice with the identical call (the second draw,
    # after the feature loop, silently discarded the first and shifted the
    # RNG stream). Sample it exactly once.
    A = matrix_normal(U.dot(V), tau**2 * np.eye(K), aStdev**2 * np.eye(F))

    # Generate the input features. Assume the features are multinomial and sparse
    # (almost matches the twitter example: twitter is binary, this may not be)
    featuresDist = [1. / F] * F
    maxNonZeroFeatures = 3

    X = np.zeros((D, F))
    for d in range(D):
        X[d, :] = rd.multinomial(maxNonZeroFeatures, featuresDist)

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax(X.dot(A))

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, F, T, P)

    # One iteration only: measures how well the prior alone reconstructs W
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=1)
    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)
    priorReconsError = np.sum(np.abs(W - W_inf)**2) / D

    # Full training run
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=200)
    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)

    print("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    print("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.abs(W - W_inf)**2) / D,))
    print("End of Test")
def _testInferenceFromHandcraftedExample(self):
    """
    Build a corpus from a hand-crafted six-topic vocabulary: derive A, U, V
    by PCA-style decomposition of a random feature-to-topic matrix, sample
    sparse binary features, generate word counts, then train and print how
    closely the inferred topics / vocab / documents match the truth.
    """
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    K = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    F = 8    # Features
    D = 200  # Sample documents (each with associated features)
    avgWordsPerDoc = 500

    # Determine what A, U and V should be by doing PCA on
    # a random matrix A, then recomputing it given the decomposition
    A = rd.random((F, K)) * 10
    (U, S, _) = la.svd(A)

    # NOTE(review): P counts cumulative sums exceeding 75% of the spectrum's
    # total — i.e. the tail past the crossing point. Confirm this is the
    # intended subspace size rather than the crossing index itself.
    cdf = [sum(S[:f]) for f in range(1, F + 1)]
    P = len([i for i in range(F) if cdf[i] > 0.75 * sum(S)])
    if P == F:
        raise ValueError("Can't reduce the dimension")

    # Keep the leading P directions, then least-squares V so that A ~= U.V
    U = U[:, :P]
    V = np.ndarray((P, K))
    for col in range(K):
        (soln, _, _, _) = la.lstsq(U, A[:, col])
        V[:, col] = soln
    A = U.dot(V)

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    X = np.array([1 if rd.random() < 0.3 else 0 for _ in range(D * F)]).reshape(D, F)
    # X = ssp.csr_matrix(X)

    lmda = X.dot(A)
    print("lmda.mean() = %f" % (lmda.mean()))
    tpcs = rowwise_softmax(lmda)

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, F, T, P)
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=200)

    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)

    print("Handcrafted Test-Case")
    print("=====================================================================")
    print("Average, squared, per-element difference between true and estimated:")
    print("  Topic Distribution: %f" % (np.sum((tpcs - tpcs_inf)**2) / len(tpcs),))
    print("  Vocab Distribution: %f" % (np.sum((vocab - trainedState.vocab)**2) / len(vocab),))
    print("Average absolute difference between true and reconstructed documents")
    print("  Documents: %f" % (np.sum(np.abs(W - W_inf)) / np.sum(W),))
    print("End of Test")