def _testInferenceFromHandcraftedExample(self):
     print ("Partially hand-crafted example")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     Q = 6   # Latent topics: this cannot be changed without changing the code that generates the vocabulary
     K = 10  # Observed topics
     P = 8   # Latent features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # Determine what A, U, Y and V should be
     U = rd.random((K,Q))
     Y = rd.random((Q,P))
     V = rd.random((F,P))
     A = U.dot(Y).dot(V.T)
     
     # The vocabulary. Presented graphically, there are two topics with horizontal
     # bands (upper, lower); two with vertical bands (left, right); and two
     # defined by region (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     X_low = np.array([1 if rd.random() < 0.3 else 0 for _ in range(D*P)]).reshape(D,P)
     X     = ssp.csr_matrix(X_low.dot(V.T))
     
     lmda_low = X_low.dot(Y.T)
     print ("lmda_low.mean() = %f" % (lmda_low.mean()))
     tpcs = rowwise_softmax (lmda_low)
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 10, iterations=10)
     
     tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
             
     print("Handcrafted Test-Case")
     print("=====================================================================")
     print("Average, squared, per-element difference between true and estimated:")
     print("    Topic Distribution:    %f" % (np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs),))
     print("    Vocab Distribution:    %f" % (np.sum(np.square(U.dot(vocab) - trainedState.vocab)) / len(vocab),))
     print("Average absolute difference between true and reconstructed documents")
     print("    Documents:             %f" % (np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense()),))
     
     
     print("End of Test")
 def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
     print ("Fully handcrafted example, K=Q")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     Q = 6   # Latent topics: this cannot be changed without changing the code that generates the vocabulary
     K = 6   # Observed topics
     P = 8   # Latent features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # The vocabulary. Presented graphically, there are two topics with horizontal
     # bands (upper, lower); two with vertical bands (left, right); and two
     # defined by region (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     lmda = np.zeros((D,K))
     X    = np.zeros((D,F))
     for d in range(D):
         for _ in range(3):
             lmda[d,rd.randint(K)] += 1./3
         for _ in range(int(F/3)):
             X[d,rd.randint(F)] += 1
     
     A = rd.random((K,F))
     X = lmda.dot(la.pinv(A).T)
     X = ssp.csr_matrix(X)
     
     tpcs = lmda
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 100, iterations=130)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print("End of Test")
Example #3
    def _testInferenceOnModelDerivedData(self):
        print("Model derived example")

        rd.seed(0xBADB055)  # Global init for repeatable test
        modelState, tpcs, _, _, X, W = self._sampleFromModel()
        D = X.shape[0]

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           iterations=1)
        tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))  # expLmda should be strictly positive, so plain np.log suffices (cf. safe_log in the other tests)
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)
        priorReconsError = np.sum(np.square(W - W_inf)) / D

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=50,
                                           iterations=200)
        tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Model Driven: Prior Reconstruction Error: %f" %
              (priorReconsError, ))
        print("Model Driven: Final Reconstruction Error: %f" %
              (np.sum(np.square(W - W_inf)) / D, ))

        print(str(tpcs[4, :]))
        print(str(tpcs_inf[4, :]))

        print("End of Test")
 def _testInferenceOnModelDerivedData(self):
     print("Model derived example")
     
     rd.seed(0xBADB055) # Global init for repeatable test
     modelState, tpcs, _, _, X, W = self._sampleFromModel()
     D = X.shape[0]
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(np.log(queryState.expLmda)) # expLmda should be strictly positive, so plain np.log suffices (cf. safe_log in the other tests)
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 50, iterations=200)
     tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print (str(tpcs[4,:]))
     print (str(tpcs_inf[4,:]))
     
     print("End of Test")       
 def _sampleFromModel(self, D=200, T=100, K=10, Q=6, F=12, P=8, avgWordsPerDoc = 500):
     '''
     Create a test dataset according to the model
     
     Params:
         T - Vocabulary size, the number of "terms". Must be a square number
         Q - Latent topics
         K - Observed topics
         P - Latent features
         F - Observed features
         D - Sample documents (each with associated features)
         avgWordsPerDoc - average number of words per document generated (Poisson)
     
     Returns:
         modelState - a model state object configured for training
         tpcs       - the matrix of per-document topic distributions
         vocab      - the matrix of per-topic word distributions
         docLens    - the vector of document lengths
         X          - the DxF side information matrix
         W          - the DxT word matrix
     '''
     
     # Generate vocab
     beta = 0.1
     betaVec = np.ndarray((T,))
     betaVec.fill(beta)
     vocab = np.zeros((K,T))
     for k in range(K):
         vocab[k,:] = rd.dirichlet(betaVec)
     
     # Generate U, then V, then A
     tau = 0.1
     tsq = tau * tau
     (vSdRow, vSdCol) = (5.0, 5.0)
     (uSdRow, uSdCol) = (5.0, tsq) # For the K-dimensions we use tsq
     (ySdRow, ySdCol) = (5.0, 5.0)
     (aSdRow, aSdCol) = (5.0, tsq)
     
     U = matrix_normal(np.zeros((K,Q)),   uSdRow * np.eye(Q), uSdCol * np.eye(K))
     Y = matrix_normal(np.zeros((Q,P)),   ySdRow * np.eye(P), ySdCol * np.eye(Q))
     V = matrix_normal(np.zeros((F,P)),   vSdRow * np.eye(P), vSdCol * np.eye(F))
     A = matrix_normal(U.dot(Y).dot(V.T), aSdRow * np.eye(F), aSdCol * np.eye(K))
     
     # Generate the input features. Assume the features are multinomial and sparse
     # (not quite a perfect match for the Twitter example: Twitter features are
     # binary, whereas these counts may not be)
     featuresDist  = [1. / P] * P
     maxNonZeroFeatures = 3
     
     X_low = np.zeros((D,P), dtype=np.float32)
     for d in range(D):
         X_low[d,:] = rd.multinomial(maxNonZeroFeatures, featuresDist)
     X = np.round(X_low.dot(V.T))
     X = ssp.csr_matrix(X)
     
     # Use the features and the matrix A to generate the topics and documents
     tpcs = rowwise_softmax (X.dot(A.T))
     
     docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     # Initialise the model
     modelState = newVbModelState(K, Q, F, P, T)
     
     # Return the initialised model, the true parameter values, and the
     # generated observations
     return modelState, tpcs, vocab, docLens, X, W
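
_sampleFromModel draws U, Y, V and A with a matrix_normal helper whose implementation (and argument convention) is not shown on this page. As a rough illustration only, a matrix-normal draw MN(M, rowCov, colCov) can be generated from a standard normal matrix via the Cholesky factors of the two covariances; the project's own matrix_normal may follow a different argument order.

import numpy as np
import numpy.random as rd

def sample_matrix_normal(M, rowCov, colCov):
    # Illustrative sketch only: draw X ~ MN(M, rowCov, colCov) as
    # X = M + L_r Z L_c^T, where rowCov = L_r L_r^T, colCov = L_c L_c^T,
    # and Z is an i.i.d. standard normal matrix the same shape as M.
    Lr = np.linalg.cholesky(rowCov)      # (n x n), covariance among rows
    Lc = np.linalg.cholesky(colCov)      # (p x p), covariance among columns
    Z = rd.standard_normal(M.shape)      # (n x p)
    return M + Lr.dot(Z).dot(Lc.T)
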
Example #6
    def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
        print("Fully handcrafted example, K=Q")
        rd.seed(0xC0FFEE)  # Global init for repeatable test

        T = 100  # Vocabulary size, the number of "terms". Must be a square number
        Q = 6  # Latent topics: this cannot be changed without changing the code that generates the vocabulary
        K = 6  # Observed topics
        P = 8  # Latent features
        F = 12  # Observed features
        D = 200  # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # The vocabulary. Presented graphically, there are two topics with horizontal
        # bands (upper, lower); two with vertical bands (left, right); and two
        # defined by region (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X, then our topic proportions ("tpcs")
        # then our word counts W
        lmda = np.zeros((D, K))
        X = np.zeros((D, F))
        for d in range(D):
            for _ in range(3):
                lmda[d, rd.randint(K)] += 1. / 3
            for _ in range(int(F / 3)):
                X[d, rd.randint(F)] += 1

        A = rd.random((K, F))
        X = lmda.dot(la.pinv(A).T)
        X = ssp.csr_matrix(X)

        tpcs = lmda

        docLens = rd.poisson(avgWordsPerDoc, (D, ))
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, Q, F, P, T)

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           iterations=1)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)
        priorReconsError = np.sum(np.square(W - W_inf)) / D

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=100,
                                           iterations=130)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Model Driven: Prior Reconstruction Error: %f" %
              (priorReconsError, ))
        print("Model Driven: Final Reconstruction Error: %f" %
              (np.sum(np.square(W - W_inf)) / D, ))

        print("End of Test")
Example #7
def sampleFromModel(D=200, T=100, K=10, Q=6, F=12, P=8, avgWordsPerDoc=500):
    '''
    Create a test dataset according to the model
    
    Params:
        T - Vocabulary size, the number of "terms". Must be a square number
        Q - Latent topics
        K - Observed topics
        P - Latent features
        F - Observed features
        D - Sample documents (each with associated features)
        avgWordsPerDoc - average number of words per document generated (Poisson)
    
    Returns:
        modelState - a model state object configured for training
        tpcs       - the matrix of per-document topic distributions
        vocab      - the matrix of per-topic word distributions
        docLens    - the vector of document lengths
        X          - the DxF side information matrix
        W          - the DxT word matrix
    '''

    # Generate vocab
    beta = 0.1
    betaVec = np.ndarray((T, ))
    betaVec.fill(beta)
    vocab = np.zeros((K, T))
    for k in range(K):
        vocab[k, :] = rd.dirichlet(betaVec)

    # Generate U, then V, then A
    tau = 0.1
    tsq = tau * tau
    (vSdRow, vSdCol) = (5.0, 5.0)
    (uSdRow, uSdCol) = (5.0, tsq)  # For the K-dimensions we use tsq
    (ySdRow, ySdCol) = (5.0, 5.0)
    (aSdRow, aSdCol) = (5.0, tsq)

    U = np.abs(
        matrix_normal(np.zeros((K, Q)), uSdRow * np.eye(Q),
                      uSdCol * np.eye(K)))
    Y = np.abs(
        matrix_normal(np.zeros((Q, P)), ySdRow * np.eye(P),
                      ySdCol * np.eye(Q)))
    V = np.abs(
        matrix_normal(np.zeros((F, P)), vSdRow * np.eye(P),
                      vSdCol * np.eye(F)))
    A = np.abs(
        matrix_normal(
            U.dot(Y).dot(V.T), aSdRow * np.eye(F), aSdCol * np.eye(K)))

    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the Twitter example: Twitter features are
    # binary, whereas these counts may not be)
    featuresDist = [1. / P] * P
    maxNonZeroFeatures = 3

    X_low = np.zeros((D, P), dtype=np.float32)
    for d in range(D):
        X_low[d, :] = rd.multinomial(maxNonZeroFeatures, featuresDist)
    X = np.round(X_low.dot(V.T))
    X = ssp.csr_matrix(X)

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax(X.dot(A.T))

    docLens = rd.poisson(avgWordsPerDoc, (D, )).astype(np.float32)
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    # Initialise the model
    modelState = newVbModelState(K, Q, F, P, T)

    # Return the initialised model, the true parameter values, and the
    # generated observations
    return modelState, tpcs, vocab, docLens, X, W
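
Both samplers above form W by scaling the expected topic-word proportions by the Poisson document lengths and truncating to integers, which slightly distorts the realised document lengths. A hedged alternative, not part of the project, is to sample genuine counts from a per-document multinomial:

import numpy as np
import numpy.random as rd
import scipy.sparse as ssp

def sample_word_counts(tpcs, vocab, docLens):
    # Draw each document's word counts from a multinomial over its mixed
    # topic-word distribution, rather than rounding the expected counts.
    probs = tpcs.dot(vocab)                        # D x T, rows sum to ~1
    rows = [rd.multinomial(int(n), p / p.sum())    # renormalise defensively
            for n, p in zip(docLens, probs)]
    return ssp.csr_matrix(np.vstack(rows))
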
Example #8
    def _testInferenceFromHandcraftedExample(self):
        print("Partially hand-crafted example")
        rd.seed(0xC0FFEE)  # Global init for repeatable test

        T = 100  # Vocabulary size, the number of "terms". Must be a square number
        Q = 6  # Latent topics: this cannot be changed without changing the code that generates the vocabulary
        K = 10  # Observed topics
        P = 8  # Latent features
        F = 12  # Observed features
        D = 200  # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # Determine what A, U, Y and V should be
        U = rd.random((K, Q))
        Y = rd.random((Q, P))
        V = rd.random((F, P))
        A = U.dot(Y).dot(V.T)

        # The vocabulary. Presented graphically, there are two topics with horizontal
        # bands (upper, lower); two with vertical bands (left, right); and two
        # defined by region (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X, then our topic proportions ("tpcs")
        # then our word counts W
        X_low = np.array([1 if rd.random() < 0.3 else 0
                          for _ in range(D * P)]).reshape(D, P)
        X = ssp.csr_matrix(X_low.dot(V.T))

        lmda_low = X_low.dot(Y.T)
        print("lmda_low.mean() = %f" % (lmda_low.mean()))
        tpcs = rowwise_softmax(lmda_low)

        docLens = rd.poisson(avgWordsPerDoc, (D, ))
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, Q, F, P, T)
        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=10,
                                           iterations=10)

        tpcs_inf = rowwise_softmax(queryState.lmda)
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Handcrafted Test-Case")
        print(
            "====================================================================="
        )
        print(
            "Average, squared, per-element difference between true and estimated:"
        )
        print("    Topic Distribution:    %f" %
              (np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs), ))
        print("    Vocab Distribution:    %f" %
              (np.sum(np.square(U.dot(vocab) - trainedState.vocab)) /
               len(vocab), ))
        print(
            "Average absolute difference between true and reconstructed documents"
        )
        print("    Documents:             %f" %
              (np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense()), ))

        print("End of Test")