Example no. 1
    def _sampleFromModel(self, D=200, T=100, K=10, avgWordsPerDoc=500):
        '''
        Create a test dataset according to the model
        
        Params:
            D - Sample documents (each with associated features)
            T - Vocabulary size, the number of "terms". Must be a square number
            K - Observed topics
            avgWordsPerDoc - average number of words per document generated (Poisson)
        
        Returns:
            tpcs       - the DxK matrix of per-document topic distributions
            vocab      - the KxT matrix of per-topic word distributions
            docLens    - the D-dimensional vector of document lengths
            W          - the DxT sparse word-count matrix
        '''

        # Generate vocab
        beta = 0.1
        betaVec = np.ndarray((T, ))
        betaVec.fill(beta)
        vocab = rd.dirichlet(betaVec, size=K)

        # Generate the shared covariance matrix
        # ...no real structure in this.
        sigT = rd.random((K, K))
        sigT = sigT.dot(sigT.T)  # make it symmetric positive semi-definite

        # Generate topic mean
        alpha = 1
        alphaVec = np.ndarray((K, ))
        alphaVec.fill(alpha)
        topicMean = rd.dirichlet(alphaVec)

        # Generate the actual topics.
        tpcs = rd.multivariate_normal(topicMean, sigT, size=D)
        tpcs = rowwise_softmax(tpcs)

        # Generate the corpus
        docLens = rd.poisson(avgWordsPerDoc, (D, )).astype(np.float32)
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        # Return the true parameter values and the generated observations
        return tpcs, vocab, docLens, W
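
A minimal, self-contained sketch of the same generative recipe, useful for checking the shapes _sampleFromModel produces. The _softmax_rows helper is a stand-in for the module's rowwise_softmax (an assumption about its behaviour), and the sizes are illustrative only.

import numpy as np
import numpy.random as rd
import scipy.sparse as ssp

def _softmax_rows(m):
    # assumed equivalent of rowwise_softmax: exponentiate and normalise each row
    e = np.exp(m - m.max(axis=1)[:, np.newaxis])
    return e / e.sum(axis=1)[:, np.newaxis]

D, T, K, avgWordsPerDoc = 20, 100, 10, 500
vocab = rd.dirichlet(np.full(T, 0.1), size=K)                # KxT per-topic word distributions
cov = rd.random((K, K))
cov = cov.dot(cov.T)                                         # symmetric PSD topic covariance
topicMean = rd.dirichlet(np.full(K, 1.0))
tpcs = _softmax_rows(rd.multivariate_normal(topicMean, cov, size=D))   # DxK topic distributions
docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
W = ssp.csr_matrix((tpcs.dot(vocab) * docLens[:, np.newaxis]).astype(np.int32))
print(tpcs.shape, vocab.shape, W.shape)                      # (20, 10) (10, 100) (20, 100)
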
Example no. 2
def selfSoftDot(matrix):
    '''
    Considers the given matrix to be a collection of stacked row-vectors. 
    Returns the sum of the dot products of each row-vector and its 
    soft-max form.
    
    This works on DENSE matrices only, and it appears in this module simply
    for convenience.
    
    Uses fast, memory-efficient operations for matrices of single-
    and double-precision numbers, and falls back to reasonably fast numpy
    code otherwise, at the cost of creating a copy of the matrix.
    '''
    assert not np.isfortran(matrix), "Matrix is not stored in row-major form"
    if matrix.dtype == np.float64:
        return compiled.selfSoftDot_f8(matrix)
    elif matrix.dtype == np.float32:
        return compiled.selfSoftDot_f4(matrix)

    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (selfSoftDot)\n")
    return np.sum(matrix * rowwise_softmax(matrix))
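
A quick check of what the numpy fallback computes: the sum, over rows, of the dot product of each row with its own soft-max. The local softmax helper stands in for rowwise_softmax (an assumption about its behaviour).

import numpy as np

def _softmax_rows(m):
    e = np.exp(m - m.max(axis=1)[:, np.newaxis])
    return e / e.sum(axis=1)[:, np.newaxis]

rng = np.random.default_rng(0)
m = rng.standard_normal((5, 3))

fallback   = np.sum(m * _softmax_rows(m))                               # the slow path above
row_by_row = sum(m[i].dot(_softmax_rows(m)[i]) for i in range(m.shape[0]))
assert np.allclose(fallback, row_by_row)
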
Example no. 3
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data       - the DataSet, of which only the words (the DxT document-term
                 matrix) are used in this model; the features are IGNORED
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep them)
    A new query object with the updated query parameters
    '''
    W   = data.words
    D,_ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))
           
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT  = np.eye(K)
        isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
#        print("                sigT.det = " + str(la.det(sigT)))
        
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters that also handle the log-sum-exp approximation.
        
        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])
        
#         means -= (means[:,0])[:,np.newaxis]
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)
            
            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)
            
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
        
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
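
sparseScalarQuotientOfDot is not shown in these excerpts. From the way it is called, it appears to compute W / (expMeans.dot(vocab)) element-wise, but only at the non-zero positions of the sparse matrix W, so the dense product never needs to be fully materialised. A sketch under that assumption, with my own function name:

import numpy as np
import scipy.sparse as ssp

def sparse_scalar_quotient_of_dot(W, left, right):
    # Assumed behaviour: out[d, t] = W[d, t] / (left @ right)[d, t], evaluated
    # only at the non-zero entries of the sparse matrix W.
    C = W.tocoo()
    denom = np.einsum('nk,kn->n', left[C.row, :], right[:, C.col])  # (left @ right) at the nnz positions
    return ssp.csr_matrix((C.data / denom, (C.row, C.col)), shape=C.shape)

rng = np.random.default_rng(1)
W = ssp.random(6, 8, density=0.3, format='csr', random_state=1) * 10
left, right = rng.random((6, 4)), rng.random((4, 8))
mask  = W.toarray() != 0
dense = W.toarray() / left.dot(right)
out   = sparse_scalar_quotient_of_dot(W, left, right)
assert np.allclose(out.toarray()[mask], dense[mask])
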
Example no. 4
def recons_error (modelState, X, W, queryState):
    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf    = np.array(tpcs_inf.dot(modelState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
    return np.sum(np.square(W - W_inf)) / X.shape[0]
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, n = queryState.means, queryState.expMeans, queryState.docLens  # n = per-document lengths
    K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y,  modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    # Update the Variances
    varcs = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
    debugFn(0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype,
            means, varcs, A, n)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:, np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + n[d] * A).dot(rhs[d, :])

        debugFn(itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior,
                dtype, means, varcs, A, n)

        like = log_likelihood(data, modelState,
                              QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
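
The stopping test above compares successive perplexities. perplexity_from_like is not shown in these excerpts; the relationship assumed in this sketch is the standard one, perplexity = exp(-log-likelihood / total word count):

import numpy as np

def perplexity_from_log_likelihood(log_like, word_count):
    # Assumed definition: exponential of the negative per-word log-likelihood
    return np.exp(-log_like / word_count)

# e.g. a corpus of 10,000 words with total log-likelihood -61,000
print(perplexity_from_log_likelihood(-61000.0, 10000))   # exp(6.1), roughly 446
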
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data       - the DataSet, providing the DxT document-term matrix W and
                 the DxF document-feature matrix X
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep them)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4	, 8	, 2	, 0	, 0,
    #     0	, 6	, 0	, 17, 0,
    #     12	, 13	, 1	, 7	, 8,
    #     0	, 5	, 0	, 0	, 0,
    #     0	, 6	, 0	, 0	, 44,
    #     0	, 7	, 2	, 0	, 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y,  modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T, ), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) +
                 X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA,
                      overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv,
            ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y),
                              A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T),
                              A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters that also handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means

        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(
                lhs
            )  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK        = means.shape[0] * means.shape[1]
        # newTp     = np.sum(means)
        # newTp     = (-newTp * newTp)
        # rhs[:,:]  = means
        # rhs      *= means
        # newTp     = DK * np.sum(rhs) - newTp
        # newTp    /= DK * (DK - 1)
        # newTp     = min(max(newTp, 1E-36), 1E+36)
        # tp        = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (
            R.T.dot(expMeans)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv,
                                    vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" %
                                    (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \
        QueryState(means, expMeans, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
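
The Y update above appears to solve a two-sided linear system, (U^T U) Y (V^T V) + ltv*lfv*Y = U^T A V, by diagonalising both Gram matrices (that reading is inferred from the update, not stated in the source). A self-contained sketch of the same trick, with my own names rather than the module's unvec helper:

import numpy as np
import scipy.linalg as la

def solve_two_sided(P, Q, c, C):
    # Solve P @ X @ Q + c * X = C for X, with P and Q symmetric PSD.
    # Diagonalise P = Up diag(p) Up^T and Q = Uq diag(q) Uq^T; in that basis
    # the system is element-wise: Xt[i, j] = Ct[i, j] / (p[i] * q[j] + c).
    p, Up = la.eigh(P)
    q, Uq = la.eigh(Q)
    Ct = Up.T @ C @ Uq
    Xt = Ct / (np.outer(p, q) + c)
    return Up @ Xt @ Uq.T

rng = np.random.default_rng(0)
P = rng.random((4, 4)); P = P @ P.T
Q = rng.random((3, 3)); Q = Q @ Q.T
X_true = rng.random((4, 3))
C = P @ X_true @ Q + 0.01 * X_true
assert np.allclose(solve_two_sided(P, Q, 0.01, C), X_true)
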
Example no. 7
def varBound (modelState, queryState, X, W, Z = None, lnVocab = None, varA_U = None, XA = None, XTX = None):
    '''
    For a current state of the model, and the query, for given inputs, outputs the variational
    lower-bound.
    
    Params
    
    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    Z          - if this has already been calculated, it can be passed in. If not, we
                 recalculate it from the model and query states. Z is the DxKxT tensor which
                 for each document D and term T gives the proportion of those terms assigned
                 to topic K
    lnVocab    - the KxT matrix of the natural log of the vocabulary. Recalculated if
                 not provided
    varA_U     - the product of the column variance matrix and the matrix U. Recalculated if
                 not provided
    XA         - dot product of X and A, recalculated if not provided
    XTX        - dot product of X-transpose and X, recalculated if not provided.
    
    Returns
    The (positive) variational lower bound
    '''
    
    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
    (lmda, nu, lxi, s, docLen) = (queryState.lmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)
    
    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T: raise ValueError ("The shape of the document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))
    
    (Dcheck, Fcheck) = X.shape
    if Dcheck != D: raise ValueError ("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F: raise ValueError ("The shape of the feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck)) 

    # We'll need the original xi for this and also Z, the 3D tensor which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi (lmda, nu, s)
    
    if lnVocab is None:
        lnVocab = safe_log(vocab)
    if Z is None:
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
   
    
    # lnProb1 is the bound on E[p(W|Theta)]. This is a bound, not an equality as we're using
    # Bouchard's softmax bound (NIPS 2007) here. That said, most of the subsequent terms
    # will discard additive constants, so strictly speaking none of them are equalities
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    
    lnProb1 = 0.0
    lnProb1 -= np.sum(docLenLmdaLxi * lmda)
    lnProb1 -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnProb1 += 2 * np.sum (s[:, np.newaxis] * docLenLmdaLxi)
    lnProb1 -= 0.5 * np.sum (docLen[:, np.newaxis] * lmda)
    lnProb1 += np.sum (lmda * np.einsum ('dt,dkt->dk', W, Z))
    
    lnProb1 += np.sum(lnVocab * np.einsum('dt,dkt->kt', W, Z))
    lnProb1 -= np.sum(W * np.einsum('dkt->dt', safe_x_log_x(Z)))
    
    lnProb1 -= np.sum(docLen[:,np.newaxis] * lxi * ((s**2)[:,np.newaxis] - xi**2))
    lnProb1 += 0.5 * np.sum(docLen[:,np.newaxis] * (s[:,np.newaxis] + xi))
    lnProb1 -= np.sum(docLen[:,np.newaxis] * safe_log_one_plus_exp_of(xi))
        
    # lnProb2 is E[p(Theta|A)]
    if XA is None:
        XA = X.dot(A)
    if XTX is None:
        XTX = X.T.dot(X)
    sig2  = sigma * sigma
    tau2  = tau * tau
    
    lnProb2 = -0.5 * D * K * log (sig2) \
            -  0.5 / sig2 * (np.sum(nu) + D*K * tau2 * np.sum(XTX * varA) + np.sum((lmda - XA)**2))
    
    # lnProb3 is E[p(A|V)]
    if varA_U is None:
        varA_U = varA.dot(U)
        
    lnProb3 = -0.5 * K * F * log (2 * pi) \
          -0.5 * K * F * log(tau2) \
          -0.5 / tau2 * \
          ( \
          np.trace(varA)*K*tau2 \
          + np.sum(varA_U * U) * K * tau2  \
          + np.sum((A - U.dot(V)) ** 2) \
          )
          
    # lnProb4 is E[p(V)]
    lnProb4 = -0.5 * (np.trace(varV) * K * tau2 + np.sum(V*V))
    
    # ent1 is H[q(Theta)]
    ent1 = 0.5 * np.sum (np.log(nu * nu))
    
    # ent2 is H[q(A|V)]
    ent2 = 0.5 * F * K * log(2 * pi * e) + 0.5 * K * log (la.det(varA)) + 0.5 * F * K * log (tau2)
    
    # ent3 is H[q(V)]
    ent3 = 0.5 * P * K * log (2 * pi * e) + 0.5 * K * log (la.det(varV)) + 0.5 * P * K * log (tau2)
    
    result = lnProb1 + lnProb2 + lnProb3 + lnProb4 + ent1 + ent2 + ent3
#    if (lnProb1 > 0) or (lnProb2 > 0) or (lnProb3 > 0) or (lnProb4 > 0):
#        print ("Whoopsie - lnProb > 0")
    
#    if result > 100:
#        print ("Well this is just ridiculous")
    
    return result
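
The bound relies on safe_log and safe_x_log_x, which are not shown in these excerpts. The sketches below assume the usual conventions: a log clamped away from zero, and x*log(x) evaluated with the limit 0*log(0) = 0.

import numpy as np

def safe_log(x, eps=1e-300):
    # Assumed behaviour: element-wise log with zeros clamped, so -inf never appears
    return np.log(np.maximum(x, eps))

def safe_x_log_x(x):
    # Assumed behaviour: element-wise x*log(x), taking the limit 0*log(0) = 0
    out = np.zeros_like(x, dtype=np.float64)
    nz = x > 0
    out[nz] = x[nz] * np.log(x[nz])
    return out

z = np.array([0.0, 0.5, 1.0, 2.0])
print(safe_x_log_x(z))   # [ 0.  -0.3466  0.  1.3863] (approximately)
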
Example no. 8
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval = 0):
    '''
    Trains the topic model based on side-information, and creates a new query-state
    object. This contains all those estimated parameters that are specific to the
    actual data being queried - this must be used in conjunction with a model state.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X - the D x F matrix of side information vectors
    W - the D x V matrix of word **count** vectors.
    iterations - how long to iterate for
    epsilon - currently ignored; in future it will allow us to stop early.
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jakkola function applied to the 
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
       
    # Get ready to plot the evolution of the likelihood
    if logInterval > 0:
        elbos = np.zeros((iterations // logInterval,))
        iters = np.zeros((iterations // logInterval,))
    
    # We'll need the total word count per doc, and total count of docs
    docLen = W.sum(axis=1)
    D      = len(docLen)
    
    # No need to recompute this every time
    XTX = X.T.dot(X)
    
    # Assign initial values to the query parameters
    lmda = rd.random((D, K))
    nu   = np.ones((D,K), np.float64)
    s    = np.zeros((D,))
    lxi  = negJakkola (np.ones((D, K), np.float64))
    
    XA = X.dot(A)
    for iteration in range(iterations):
        
        # Save repeated computation
        tsq      = tau * tau;
        tsqIP    = tsq * np.eye(P)
        trTsqIK  = K * tsq # trace of the matrix tau * tau * np.eye(K)
        halfSig2 = 1./(sigma*sigma)
        tau2sig2 = (tau * tau) / (sigma * sigma)
        
        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V)
        #   Where lambda is the posterior mean of theta.
        # =============================================================
        
        #
        # V, varV
        varV = la.inv (tsqIP + U.T.dot(U))
        V    = varV.dot(U.T).dot(A)
        _quickPrintElbo ("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # A, varA
        # TODO, since only tau2sig2 changes at each step, would it be possible just to
        # amend the old inverse?
        # TODO Use sparse inverse
        varA = la.inv (tau2sig2 * XTX + np.eye(F))
        A    = varA.dot (U.dot(V) + X.T.dot(lmda))
        XA   = X.dot(A)
        _quickPrintElbo ("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
       
        #
        # lmda_dk
        lnVocab = safe_log (vocab)
        Z   = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
        rho = 2 * s[:,np.newaxis] * lxi - 0.5 \
            + np.einsum('dt,dkt->dk', W, Z) / docLen[:,np.newaxis]
        
        rhs  = docLen[:,np.newaxis] * rho + halfSig2 * X.dot(A)
        lmda = rhs / (docLen[:,np.newaxis] * 2 * lxi + halfSig2)
        
        _quickPrintElbo ("E-Step: q(Theta|A;lamda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
              
        
        #
        # nu_dk
        # TODO Double check this again...
        nu = 1./ np.sqrt(2. * docLen[:, np.newaxis] * lxi + halfSig2)

        _quickPrintElbo ("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================
        
        #
        # s_d
#         s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1)
#         _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        

        #
        # xi_dk
        lxi = negJakkolaOfDerivedXi(lmda, nu, s)
        _quickPrintElbo ("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # vocab
        #
        # TODO, since vocab is in the RHS, is there any way to optimize this?
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxV
        vocab = normalizerows_ip (np.einsum('dt,dkt->kt', W, Z))
        _quickPrintElbo ("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # U
        U = A.dot(V.T).dot (la.inv(trTsqIK * varV + V.dot(V.T)))
        _quickPrintElbo ("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # sigma
        #    Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right)
        #
#        sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2))
        
        #
        # tau
        #    Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr ((M_A - U M_V)^{T} (M_A - U M_V)) \right)
        #
        varA_U = varA.dot(U)
#        tau_term1 = np.trace(varA)*K*tsq
#        tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in xrange(P)) * K * tsq
#        tau_term3 = np.sum((A - U.dot(V)) ** 2)
#        
#        tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3)
        
        if (logInterval > 0) and (iteration % logInterval == 0):
            elbo = varBound ( \
                VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
                VbSideTopicQueryState(lmda, nu, lxi, s, docLen),
                X, W, Z, lnVocab, varA_U, XA, XTX)
                
            elbos[iteration // logInterval] = elbo
            iters[iteration // logInterval] = iteration
            print ("Iteration %5d  ELBO %f" % (iteration, elbo))
        
    if logInterval > 0:
        plot_bound(iters, elbos)
    
    return (VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
            VbSideTopicQueryState (lmda, nu, lxi, s, docLen))
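
negJakkola and negJakkolaOfDerivedXi are not shown in these excerpts. In variational treatments of logistic/softmax terms the usual ingredient is the Jaakkola-Jordan coefficient lambda(xi) = tanh(xi/2) / (4*xi); the sketch below is that standard function, offered only as a guess at what the helpers resemble (possibly up to a sign or parameterisation).

import numpy as np

def jaakkola_lambda(xi):
    # Standard Jaakkola-Jordan bound coefficient, lambda(xi) = tanh(xi/2) / (4*xi),
    # with the limiting value lambda(0) = 1/8. Only an assumption about what the
    # module's negJakkola computes.
    xi = np.asarray(xi, dtype=np.float64)
    out = np.full_like(xi, 0.125)
    nz = np.abs(xi) > 1e-12
    out[nz] = np.tanh(xi[nz] / 2.0) / (4.0 * xi[nz])
    return out

print(jaakkola_lambda(np.array([0.0, 1.0, 5.0])))   # [0.125, ~0.1155, ~0.0493]
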
Example no. 9
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # TODO Get rid of this via a command-line param
    iterations = max(iterations, 100)

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1. / ((n *
                          (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y,
                R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:, np.newaxis] * means.dot(Ab)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)

        # Long version
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d, :] = lhs.dot(rhs[d, :])
        debugFn(itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y,
                R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                n)

        like = log_likelihood(data, modelState,
                              QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState  # query vars altered in-place
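
The mean update above inverts (isigT + n_d * Ab) once per distinct document length and caches the result, which is what makes the per-document loop affordable. A stripped-down sketch of that caching idea, with made-up matrices standing in for the model quantities:

import numpy as np
import scipy.linalg as la

rng = np.random.default_rng(0)
K, D = 5, 8
prec = np.eye(K)                                 # stand-in for isigT
Ab = rng.random((K, K))
Ab = Ab @ Ab.T                                   # stand-in for the model's Ab
docLens = rng.integers(50, 54, size=D)           # few distinct lengths => few inverses
rhs = rng.random((D, K))

inverses = dict()
means = np.empty((D, K))
for d in range(D):
    n_d = int(docLens[d])
    if n_d not in inverses:                      # invert once per distinct length
        inverses[n_d] = la.inv(prec + n_d * Ab)
    means[d, :] = inverses[n_d].dot(rhs[d, :])
print(len(inverses), "inverses computed for", D, "documents")
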
Example no. 10
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep them)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, boundLikes = [], [], []
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending order of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG)  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]

    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    leastSquares = lambda feats, targets: la.lstsq(feats, targets, lapack_driver="gelsy")[0].T
    if ssp.issparse(R_A):  # dense inverse typically as fast or faster than sparse
        R_A = to_dense_array(R_A)  # inverse and the result is usually dense in any case
        leastSquares = lambda feats, targets: np.array(
            [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)])
    R_A.flat[::F + 1] += 1. / fv
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    priorSigt_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigt_diag.fill(0.001)

    # Iterate over parameters
    for itr in range(iterations):
        A = leastSquares(X, means)
        diff_a_yv = (A - Y.dot(V))

        for _ in range(10):  #(50 if itr == 0 else 1):
            # Update the covariance of the prior
            diff_m_xa = (means - X.dot(A.T))

            sigT = 1. / lfv * (Y.dot(Y.T))
            sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
            sigT += diff_m_xa.T.dot(diff_m_xa)
            sigT.flat[::K + 1] += varcs.sum(axis=0)

            # As small numbers lead to unstable inverse estimates, we use the
            # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
            # scales whenever we use the inverse of the unscaled covariance
            sigScale = 1. / (P + D + F)
            isigScale = 1. / sigScale

            isigT = la.inv(sigT)
            debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y,
                    lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                    docLens)

            # Update the vocabulary
            vocab *= (
                R.T.dot(expMeans)
            ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
            vocab += vocabPrior
            vocab = normalizerows_ip(vocab)

            # Reset the means to their original form, and log effect of vocab update
            R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
            S = expMeans * R.dot(vocab.T)
            debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Variances
            varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] +
                          isigScale * isigT.flat[::K + 1])
            debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Means
            rhs = X.dot(A.T).dot(isigT) * isigScale
            rhs += S
            rhs += docLens[:, np.newaxis] * means.dot(Ab)
            rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)

            # Faster version?
            for lenIdx in range(len(lens)):
                nd = lens[lenIdx]
                start, end = inds[lenIdx], inds[lenIdx + 1]
                lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

                means[start:end, :] = rhs[start:end, :].dot(
                    lhs
                )  # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

    #       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
            debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                              out=expMeans)

        # for _ in range(150):
        #     # Finally update the parameter V
        #     V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        #     debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the distribution on the latent space
        #     R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        #     R_Y = la.inv(R_Y_base)
        #     debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype,
        #             means, varcs, Ab, docLens)
        #
        #     Y = 1. / fv * A.dot(V.T).dot(R_Y)
        #     debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the mapping from the features to topics
        #     A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        #     debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V,
                                    sigT * sigScale, vocab, vocabPrior, Ab,
                                    dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(
                var_bound(DataSet(W, feats=X), modelState, queryState, XTX))
            boundLikes.append(
                log_likelihood(DataSet(W, feats=X), modelState, queryState))
            boundIters.append(itr)
            perp = perplexity_from_like(boundLikes[-1], docLens.sum())
            print(
                time.strftime('%X') +
                " : Iteration %d: Perplexity %4.0f bound %f" %
                (itr, perp, boundValues[-1]))
            if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]:
                printStderr("ERROR: bound degradation: %f > %f" %
                            (boundValues[-2], boundValues[-1]))


            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if len(boundIters) > 2 and boundIters[-1] > 20:
                lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum())
                if lastPerp - perp < 1:
                    break

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
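
The sort-by-length bookkeeping above relies on the fact that the inverse of the permutation produced by argsort is the argsort of that permutation, and that a stable sort keeps ties reproducible. A tiny sketch (mergesort is used here in place of whatever STABLE_SORT_ALG names):

import numpy as np

docLens = np.array([70, 30, 50, 30, 90])
sortIdx = np.argsort(docLens, kind="mergesort")      # stable sort, so ties keep their order
sorted_lens = docLens[sortIdx]

revert_sort = np.argsort(sortIdx, kind="mergesort")  # inverse permutation
assert np.array_equal(sorted_lens[revert_sort], docLens)
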
    def _sampleFromModel(self,
                         D=200,
                         T=100,
                         K=10,
                         F=12,
                         P=8,
                         avgWordsPerDoc=500):
        '''
        Create a test dataset according to the model
        
        Params:
            T - Vocabulary size, the number of "terms". Must be a square number
            K - Observed topics
            P - Latent features
            F - Observed features
            D - Sample documents (each with associated features)
            avgWordsPerDoc - average number of words per document generated (Poisson)
        
        Returns:
            tpcs       - the DxK matrix of per-document topic distributions
            vocab      - the KxT matrix of per-topic word distributions
            docLens    - the D-dimensional vector of document lengths
            X          - the DxF side information matrix
            W          - the DxT sparse word-count matrix
        '''

        # Generate vocab
        beta = 0.1
        betaVec = np.ndarray((T, ))
        betaVec.fill(beta)
        vocab = np.zeros((K, T))
        for k in range(K):
            vocab[k, :] = rd.dirichlet(betaVec)

        # Generate the shared covariance matrix
        sigT = rd.random((K, K))
        sigT = sigT.dot(sigT.T)  # make it symmetric positive semi-definite
        sigT.flat[::K + 1] += rd.random((K, )) * 4

        # Just link two topics
        sigT[K // 2, K // 3] = 3
        sigT[K // 3, K // 2] = 3

        sigT[4 * K // 5, K // 5] = 4
        sigT[K // 5, 4 * K // 5] = 4

        # Generate Y, then V, then A
        lfv = 0.1  # latent feature variance (for Y)
        fv = 0.1  # feature variance (for A)

        Y = matrix_normal(np.zeros((K, P)), lfv * np.eye(P), sigT)
        V = matrix_normal(np.zeros((P, F)), fv * np.eye(F), lfv * np.eye(P))
        A = matrix_normal(Y.dot(V), fv * np.eye(F), sigT)

        # Generate the input features. Assume the features are multinomial and sparse
        # (not quite a perfect match for the twitter example: twitter is binary, this
        # may not be)
        featuresDist = [1. / F] * F
        maxNonZeroFeatures = 3

        X = np.zeros((D, F), dtype=np.float32)
        for d in range(D):
            X[d, :] = rd.multinomial(maxNonZeroFeatures, featuresDist)
        X = ssp.csr_matrix(X)

        # Use the features and the matrix A to generate the topics and documents
        tpcs = rowwise_softmax(X.dot(A.T))

        docLens = rd.poisson(avgWordsPerDoc, (D, )).astype(np.float32)
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        # Return the true parameter values and the generated observations
        return tpcs, vocab, docLens, X, W
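
matrix_normal is not defined in these excerpts and its argument order here is not something I can confirm. The standard construction, sketched below under that caveat, draws X = M + A Z B^T where A A^T is the among-row covariance and B B^T the among-column covariance.

import numpy as np

def sample_matrix_normal(M, row_cov, col_cov, rng=None):
    # Standard matrix-normal sampler: X = M + A @ Z @ B.T, where A A^T = row_cov,
    # B B^T = col_cov and Z is a matrix of iid standard normals. This is only a
    # sketch; the module's matrix_normal may order its arguments differently.
    rng = rng or np.random.default_rng()
    A = np.linalg.cholesky(row_cov + 1e-10 * np.eye(row_cov.shape[0]))
    B = np.linalg.cholesky(col_cov + 1e-10 * np.eye(col_cov.shape[0]))
    Z = rng.standard_normal(M.shape)
    return M + A @ Z @ B.T

Y = sample_matrix_normal(np.zeros((4, 3)), np.eye(4), 0.1 * np.eye(3))
print(Y.shape)   # (4, 3)
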
Example no. 12
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data       - the DataSet, providing the DxT document-term matrix and the
                 document-link matrix; the features are IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep them)
    A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(
        data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov,
                vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(
                means.T) if topicCov.dtype == np.float64 else np.cov(
                    means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)
        #        print("                topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2*D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)

        # Update the vocabularies

        vocab *= (
            W_weight.T.dot(expMeansRow)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol /
                    lse_at_k[np.newaxis, :]).T  # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (Second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT,
                                              expMeansRow,
                                              docVocab,
                                              out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L,
                                             expMeansRow,
                                             docVocab,
                                             out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters that also handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) +
                              np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) -
                                             rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. /
                                    (4 * D + 4)) * (meanSum -
                                                    in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A +
                                     np.diag(D * in_counts /
                                             (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any(
                        np.isinf(means[d, :])):
                    pass

                if np.any(np.isnan(
                        np.exp(means[d, :] - means[d, :].max()))) or np.any(
                            np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype,
                                    MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" %
                                (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum()) -
                        perplexity_from_like(likelyValues[-2], docLens.sum())
                ) < 1.0:
                    break


    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
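
With USE_NIW_PRIOR enabled, the mean/covariance M-step above is a pseudo-observation-smoothed (Normal-Inverse-Wishart style) estimate. A standalone restatement of that update, with illustrative constants in place of the module's NIW_* values:

import numpy as np

def niw_smoothed_moments(means, varcs, pseudo_obs_mean, pseudo_obs_var, psi_diag):
    # Mirrors the USE_NIW_PRIOR branch above: the data mean shrunk by pseudo-observations,
    # and a covariance combining the scatter matrix, the mean outer-product, the average
    # per-topic variances and a diagonal prior, normalised by (D + pseudoObsVar - K).
    D, K = means.shape
    topicMean = means.sum(axis=0) / (D + pseudo_obs_mean)
    diff = means - topicMean[np.newaxis, :]
    topicCov = diff.T.dot(diff) + pseudo_obs_var * np.outer(topicMean, topicMean)
    topicCov += np.diag(varcs.mean(axis=0) + psi_diag)
    topicCov /= (D + pseudo_obs_var - K)
    return topicMean, topicCov

rng = np.random.default_rng(0)
D, K = 50, 4
mean, cov = niw_smoothed_moments(rng.standard_normal((D, K)), rng.random((D, K)),
                                 pseudo_obs_mean=K + 2, pseudo_obs_var=K + 2,
                                 psi_diag=np.full(K, 0.01))
print(mean.shape, cov.shape)   # (4,) (4, 4)
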