Code Example #1
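These listings are test methods lifted from a larger module, so their imports are not shown. A plausible module header, assuming the usual aliases (the project-local helpers makeSixTopicVocab, rowwise_softmax, safe_log, matrix_normal, newVbModelState, train, query and log_likelihood would come from the package under test):

    from math import ceil

    import numpy as np
    import numpy.random as rd
    import numpy.linalg as la
    import scipy.sparse as ssp
    import matplotlib.pyplot as plt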
 def _testInferenceFromHandcraftedExample(self):
     print ("Partially hand-crafted example")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
    Q = 6   # Latent topics: this cannot be changed without changing the code that generates the vocabulary
    K = 10  # Observed topics
    P = 8   # Latent features
    F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # Determine what A, U, Y and V should be
     U = rd.random((K,Q))
     Y = rd.random((Q,P))
     V = rd.random((F,P))
     A = U.dot(Y).dot(V.T)
     
    # The vocabulary. Presented graphically, there are two topics with horizontal
    # bands (upper, lower); two with vertical bands (left, right); and two with
    # concentric square bands (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     X_low = np.array([1 if rd.random() < 0.3 else 0 for _ in range(D*P)]).reshape(D,P)
     X     = ssp.csr_matrix(X_low.dot(V.T))
     
     lmda_low = X_low.dot(Y.T)
     print ("lmda_low.mean() = %f" % (lmda_low.mean()))
     tpcs = rowwise_softmax (lmda_low)
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, plotInterval=10, iterations=10)
     
    tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda)) # safe_log guards against zeros in expLmda
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
             
     print("Handcrafted Test-Case")
     print("=====================================================================")
     print("Average, squared, per-element difference between true and estimated:")
     print("    Topic Distribution:    %f" % (np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs),))
     print("    Vocab Distribution:    %f" % (np.sum(np.square(U.dot(vocab) - trainedState.vocab)) / len(vocab),))
     print("Average absolute difference between true and reconstructed documents")
     print("    Documents:             %f" % (np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense()),))
     
     
     print("End of Test")
Code Example #2
 def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
     print ("Fully handcrafted example, K=Q")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
    Q = 6   # Latent topics: this cannot be changed without changing the code that generates the vocabulary
    K = 6   # Observed topics
    P = 8   # Latent features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
    # The vocabulary. Presented graphically, there are two topics with horizontal
    # bands (upper, lower); two with vertical bands (left, right); and two with
    # concentric square bands (inside, outside)
     vocab = makeSixTopicVocab(T)
     
    # Create our topic proportions ("tpcs"), then derive our (sparse)
    # features X from them, then our word counts W
    lmda = np.zeros((D,K))
    for d in range(D):
        for _ in range(3):
            lmda[d,rd.randint(K)] += 1./3
    
    # Solve for X such that X.dot(A.T) recovers lmda, via the pseudo-inverse of A
    A = rd.random((K,F))
    X = lmda.dot(la.pinv(A).T)
    X = ssp.csr_matrix(X)
     
     tpcs = lmda
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
    (trainedState, queryState) = train(modelState, X, W, logInterval=1, plotInterval=100, iterations=130)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print("End of Test")
Code Example #3
 def _sampleFromModel(self, D=200, T=100, K=10, Q=6, F=12, P=8, avgWordsPerDoc = 500):
     '''
     Create a test dataset according to the model
     
     Params:
        T - Vocabulary size, the number of "terms". Must be a square number
        Q - Latent topics
        K - Observed topics
        P - Latent features
        F - Observed features
        D - Sample documents (each with associated features)
        avgWordsPerDoc - average number of words per document generated (Poisson)
    
    Returns:
        modelState - a model state object configured for training
        tpcs       - the matrix of per-document topic distributions
        vocab      - the matrix of per-topic word distributions
        docLens    - the vector of document lengths
        X          - the DxF side information matrix
        W          - the DxT word matrix
     '''
     
     # Generate vocab
     beta = 0.1
    betaVec = np.full((T,), beta)
     vocab = np.zeros((K,T))
     for k in range(K):
         vocab[k,:] = rd.dirichlet(betaVec)
     
     # Generate U, then V, then A
     tau = 0.1
     tsq = tau * tau
     (vSdRow, vSdCol) = (5.0, 5.0)
    (uSdRow, uSdCol) = (5.0, tsq) # For the K-dimensions we use tsq
    (ySdRow, ySdCol) = (5.0, 5.0)
    (aSdRow, aSdCol) = (5.0, tsq)
     
     U = matrix_normal(np.zeros((K,Q)),   uSdRow * np.eye(Q), uSdCol * np.eye(K))
     Y = matrix_normal(np.zeros((Q,P)),   ySdRow * np.eye(P), ySdCol * np.eye(Q))
     V = matrix_normal(np.zeros((F,P)),   vSdRow * np.eye(P), vSdCol * np.eye(F))
     A = matrix_normal(U.dot(Y).dot(V.T), aSdRow * np.eye(F), aSdCol * np.eye(K))
     
    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the Twitter example: Twitter features are
    # binary, whereas these may not be)
     featuresDist  = [1. / P] * P
     maxNonZeroFeatures = 3
     
     X_low = np.zeros((D,P), dtype=np.float32)
     for d in range(D):
         X_low[d,:] = rd.multinomial(maxNonZeroFeatures, featuresDist)
     X = np.round(X_low.dot(V.T))
     X = ssp.csr_matrix(X)
     
     # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax(X.dot(A.T))
     
     docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     # Initialise the model
     modelState = newVbModelState(K, Q, F, P, T)
     
     # Return the initialised model, the true parameter values, and the
     # generated observations
     return modelState, tpcs, vocab, docLens, X, W
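matrix_normal is also project-local. Judging by the shapes in the calls above (a K x Q mean with a Q x Q covariance passed second and a K x K covariance passed third), one sampler consistent with those calls would be the standard Cholesky construction; this is a hedged sketch, not necessarily the project's implementation:

    def matrix_normal(mean, colCov, rowCov):
        # Draw mean + Lr @ Z @ Lc.T, where Lr Lr.T = rowCov,
        # Lc Lc.T = colCov, and Z is i.i.d. standard normal.
        Lc = np.linalg.cholesky(colCov)
        Lr = np.linalg.cholesky(rowCov)
        Z  = rd.standard_normal(mean.shape)
        return mean + Lr.dot(Z).dot(Lc.T)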
Code Example #4
 def testLikelihoodOnModelDerivedExample(self):
     print("Cross-validated likelihoods on model-derived example")
     
     TRAIN_ITERS=1000
     QUERY_ITERS=100
     
     rd.seed(0xBADB055) # Global init for repeatable test
     modelState, _, _, _, X, W = self._sampleFromModel()
     D, T, K, Q, F, P = X.shape[0], modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P
     
     # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
     querySize = foldSize
     trainSize = D - querySize
     
     beforeLikelies = np.zeros((folds,))
     afterLikelies  = np.zeros((folds,))
     
     for fold in range(folds):
         start = fold * foldSize
         end   = start + trainSize
         
         trainSet = np.arange(start,end) % D
         querySet = np.arange(end, end + querySize) % D
         
         X_train, W_train = X[trainSet,:], W[trainSet,:]
         X_query, W_query = X[querySet,:], W[querySet,:]
         
         # Run a single training run and figure out what the held-out
         # likelihood is
         modelState = newVbModelState(K, Q, F, P, T)
         modelState, queryState = train(modelState, X_train, W_train, iterations=1, logInterval=1, plotInterval=TRAIN_ITERS)
         
        queryState = query(modelState, X_query, W_query, iterations=QUERY_ITERS, epsilon=0.001, logInterval=1, plotInterval=QUERY_ITERS)
         querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)
         beforeLikelies[fold] = querySetLikely
         
         # Now complete the training run and figure out what the held-out
         # likelihood is
         modelState, queryState = train(modelState, X_train, W_train, iterations=TRAIN_ITERS, logInterval=1, plotInterval=TRAIN_ITERS)
         trainSetLikely = log_likelihood(modelState, X_train, W_train, queryState)
         
        queryState = query(modelState, X_query, W_query, iterations=QUERY_ITERS, epsilon=0.001, logInterval=1, plotInterval=QUERY_ITERS)
         querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)
         afterLikelies[fold] = querySetLikely
         
         print("Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainSetLikely, querySetLikely))
     
     ind = np.arange(folds)
     fig, ax = plt.subplots()
     width = 0.35
     rects1 = ax.bar(ind, beforeLikelies, width, color='r')
     rects2 = ax.bar(ind + width, afterLikelies, width, color='g')
     
     ax.set_ylabel('Held-out Likelihood')
     ax.set_title('Held-out likelihood')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(["Fold-" + str(f) for f in ind])
     ax.set_xlabel("Fold")
     
    ax.legend((rects1[0], rects2[0]), ('Before Training', 'After Training'))
     
     plt.show()
     
     print("End of Test")