Example #1
0
 def testInferenceFromModelDerivedExample(self):
     """End-to-end smoke test: sample documents from the model's own
     generative process, then check that train() can approximately
     reconstruct them.

     Prints the per-document squared reconstruction error after a single
     iteration (the "prior" error) and after 200 iterations. No assertions
     are made; this test is inspected by eye.
     """
     rd.seed(0xDEADD0C5) # Global init for repeatable test

     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     K = 6   # Topics: This cannot be changed without changing the code that generates the vocabulary
     F = 8   # Features
     P = 6   # Size of principle subspace
     D = 200 # Sample documents (each with associated features)
     avgWordsPerDoc = 500

     # Generate vocab: each topic is an independent symmetric-Dirichlet
     # draw over the T terms
     beta = 0.01
     betaVec = np.full((T,), beta)
     vocab = np.zeros((K,T))
     for k in range(K):
         vocab[k,:] = rd.dirichlet(betaVec)

     # Generate U, then V, then A ~ MatrixNormal(UV, ...)
     vStdev = 5.0
     uStdev = 5.0
     aStdev = 2.0
     tau    = 0.1

     U = matrix_normal(np.zeros((F,P)), np.eye(P), uStdev * np.eye(F))
     V = matrix_normal(np.zeros((P,K)), tau**2 * np.eye(K), vStdev**2 * np.eye(P))
     # NOTE(review): the original code drew A twice with identical
     # arguments, silently discarding the first sample (and advancing the
     # global RNG). A is now drawn exactly once.
     A = matrix_normal (U.dot(V), tau**2 * np.eye(K), aStdev**2 * np.eye(F))

     # Generate the input features. Assume the features are multinomial and sparse
     # (almost matches the twitter example: twitter is binary, this may not be)
     featuresDist  = [1. / F] * F
     maxNonZeroFeatures = 3

     X = np.zeros((D,F))
     for d in range(D):
         X[d,:] = rd.multinomial(maxNonZeroFeatures, featuresDist)

     # Use the features and the matrix A to generate the topics and documents
     tpcs = rowwise_softmax (X.dot(A))

     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers

     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, F, T, P)

     # A single iteration gives the reconstruction error under
     # (approximately) the prior, used as a baseline below
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(queryState.lmda)
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.abs(W - W_inf)**2) / D

     # Full training run, restarted from the same initial model state
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=200)
     tpcs_inf = rowwise_softmax(queryState.lmda)
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)

     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.abs(W - W_inf)**2) / D,))

     print("End of Test")
Example #2
0
    def _testInferenceFromHandcraftedExample(self):
        """Disabled (leading underscore) end-to-end check on handcrafted data.

        Builds a low-rank feature-to-topic matrix A = UV by truncating the
        SVD of a random matrix, samples documents from a six-topic vocabulary,
        trains the model, and prints (but does not assert) the per-element
        differences between the true and the inferred distributions.
        """
        rd.seed(0xC0FFEE) # Global init for repeatable test

        T = 100 # Vocabulary size, the number of "terms". Must be a square number
        K = 6   # Topics: This cannot be changed without changing the code that generates the vocabulary
        F = 8   # Features
        D = 200 # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # Determine what A, U and V should be by doing PCA on
        # a random matrix A, then recomputing it given the decomposition
        A = rd.random((F,K)) * 10
        (U, S, _) = la.svd (A)

        # P = how many of the F running spectrum sums exceed 75% of the
        # total spectrum (note len(S) may be < F; the tail sums repeat)
        spectrumTotal = sum(S)
        runningSums   = [sum(S[:i + 1]) for i in range(F)]
        P = sum(1 for r in runningSums if r > 0.75 * spectrumTotal)

        if P == F: raise ValueError("Can't reduce the dimension")

        # Truncate U and solve for V column-by-column in the least-squares sense
        U = U[:,:P]
        V = np.empty((P,K))
        for k in range(K):
            V[:,k] = la.lstsq(U, A[:,k])[0]

        A = U.dot(V)

        # The vocabulary. Presented graphically there are two with horizontal bands
        # (upper lower); two with vertical bands (left, right);  and two with
        # horizontal bands (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X: each entry is 1 with probability 0.3,
        # then our topic proportions ("tpcs") and our word counts W
        X = np.reshape([int(rd.random() < 0.3) for _ in range(D*F)], (D, F))

        lmda = X.dot(A)
        print ("lmda.mean() = %f" % (lmda.mean()))
        tpcs = rowwise_softmax (lmda)

        docLens = rd.poisson(avgWordsPerDoc, (D,))
        # Expected word counts, truncated to integers
        W = np.array(tpcs.dot(vocab) * docLens[:, np.newaxis], dtype=np.int32)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, F, T, P)
        (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=200)

        tpcs_inf = rowwise_softmax(queryState.lmda)
        W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)

        print("Handcrafted Test-Case")
        print("=====================================================================")
        print("Average, squared, per-element difference between true and estimated:")
        print("    Topic Distribution:    %f" % (np.sum((tpcs - tpcs_inf)**2) / len(tpcs),))
        print("    Vocab Distribution:    %f" % (np.sum((vocab - trainedState.vocab)**2) / len(vocab),))
        print("Average absolute difference between true and reconstructed documents")
        print("    Documents:             %f" % (np.sum(np.abs(W - W_inf)) / np.sum(W),))


        print("End of Test")