Example no. 1
0
def var_bound(data, model, query):
    '''
    Total nonsense in this case, retained just so that all the other functions
    continue to work.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    #  ln p(x) >= sum_k p(z=k|x) * (ln p(x|z=k, phi) + ln p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T) # D*K
    like *= safe_log(topicMeans)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + bound
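Both var_bound variants here lean on a safe_log helper. A minimal sketch of it, assuming it simply clips its argument away from zero so the log never returns -inf (the project's actual helper may differ, e.g. in dtype handling or how it treats the out= parameter):

import numpy as np

def safe_log_sketch(x, out=None):
    # Clip to the smallest positive normal float before taking the log, so
    # zero entries become a large negative number rather than -inf.
    x = np.asarray(x, dtype=np.float64)
    return np.log(np.maximum(x, np.finfo(np.float64).tiny), out=out)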
Example no. 2
0
def var_bound(data, model, query):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    #  ln p(x) >= sum_k p(z=k|x) * (ln p(x|z=k, phi) + ln p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T)                 # D*K, E[ln p(x|z=k, phi)]
    like += safe_log(corpusTopicDist)[np.newaxis,:]     # + ln p(z=k)
    like *= topicMeans                                  # weight by q(z=k|x)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + ent + bound
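For reference, the Jensen step behind the inequality quoted in the comment, written out (q denotes the posterior approximation p(z=k|x) held in topicMeans):

\ln p(x) = \ln \sum_k q(z{=}k \mid x)\,\frac{p(x \mid z{=}k, \phi)\,p(z{=}k)}{q(z{=}k \mid x)}
         \;\ge\; \sum_k q(z{=}k \mid x)\,\bigl(\ln p(x \mid z{=}k, \phi) + \ln p(z{=}k)\bigr) + \mathrm{H}[q],
\qquad \mathrm{H}[q] = -\sum_k q(z{=}k \mid x)\,\ln q(z{=}k \mid x)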
Example no. 3
0
def query(data, model, queryState, _):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, without altering the model.

    Params:
    data  - the dataset, notably the DxT document-term matrix
    model - the model configuration. Unlike training, a query does NOT
            alter the model
    queryState - the query results - essentially all the "local" variables
            matched to the given observations. MUTATED IN-PLACE
    _     - unused; in train() this final slot carries the training plan
            (e.g. iterations, log-interval etc.)

    Return:
    The model object, unaltered
    The query object with the updated query parameters
    '''
    K, wordDists, corpusTopicDist, topicPrior, vocabPrior = \
        model.K, model.wordDists, model.corpusTopicDist, model.topicPrior, model.vocabPrior
    topicDists = queryState.topicDists

    W = data.words

    wordDists = safe_log(wordDists)
    corpusTopicDist = safe_log(corpusTopicDist)

    topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
    norms = fns.logsumexp(topicDists, axis=1)
    topicDists -= norms[:, np.newaxis]

    return model, QueryState(queryState.docLens, np.exp(topicDists), True)
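A small, self-contained check of the log-space normalisation used in query() above: subtracting the row-wise logsumexp and exponentiating yields rows that sum to one (the matrix values here are purely illustrative):

import numpy as np
from scipy.special import logsumexp

log_scores = np.array([[0.3, 1.2, -0.5],
                       [2.0, 2.0,  2.0]])
log_scores -= logsumexp(log_scores, axis=1)[:, np.newaxis]  # normalise in log space
probs = np.exp(log_scores)
print(probs.sum(axis=1))   # -> [1. 1.]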
 def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
     print ("Fully handcrafted example, K=Q")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     Q = 6   # Topics: This cannot be changed without changing the code that generates the vocabulary
     K = 6   # Observed topics
     P = 8   # Features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # The vocabulary. Presented graphically there are two with horizontal bands
     # (upper lower); two with vertical bands (left, right);  and two with 
     # horizontal bands (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     lmda = np.zeros((D,K))
     X    = np.zeros((D,F))
     for d in range(D):
         for _ in range(3):
             lmda[d,rd.randint(K)] += 1./3
         for _ in range(int(F/3)):
             X[d,rd.randint(F)] += 1
     
     A = rd.random((K,F))
     X = lmda.dot(la.pinv(A).T)
     X = ssp.csr_matrix(X)
     
     tpcs = lmda
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 100, iterations=130)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print("End of Test")
def _sparseScalarProductOfSafeLnDot_py(A,B,C, out=None):
    '''
    Calculates A * safe_log(B.dot(C)) where A is a sparse matrix
    
    Retains sparsity in the result, unlike the built-in operator
    
    Note the type of the return-value is the same as the type of
    the sparse matrix A. If this has an integral type, this will
    only provide integer-based multiplication.
    '''
    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (_sparseScalarProductOfSafeLnDot_py)\n")
        
    if not (A.dtype == B.dtype and B.dtype == C.dtype and (out is None or C.dtype == out.dtype)):
        raise ValueError ("Inconsistent dtypes in the three matrices and possibly the out-param")
        
    if out is None:
        out = A.copy()
    else:
        out.data[:] = A.data
    
    rhs = B.dot(C)
    rhs[rhs < sys.float_info.min] = sys.float_info.min
    out.data *= safe_log(rhs)[csr_indices(out.indptr, out.indices)]
    
    return out
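As a reference when testing the routine above, this is the dense equivalent of what it computes: A multiplied element-wise by the clipped log of B.dot(C), kept only where A is non-zero. The clipping mirrors the sys.float_info.min guard; csr_indices is assumed to expand the CSR structure into per-entry (row, column) indices.

import numpy as np
import scipy.sparse as ssp

def dense_reference(A, B, C):
    # A is sparse; keep only its non-zero positions in the result.
    rhs = np.maximum(B.dot(C), np.finfo(float).tiny)
    mask = (A.toarray() != 0)
    return np.where(mask, A.toarray() * np.log(rhs), 0.0)

A = ssp.csr_matrix(np.array([[1., 0.], [0., 2.]]))
B = np.array([[0.2, 0.8], [0.6, 0.4]])
C = np.array([[0.5, 0.5], [0.9, 0.1]])
print(dense_reference(A, B, C))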
Example no. 6
0
def sample_memberships(W, alpha, wordDists, memberships):
    _, K = memberships.shape

    priorNum = memberships.sum(axis=0) + alpha - 1
    prior = priorNum.copy()
    sample_dists = W.dot(safe_log(wordDists).T)  # d x k

    for d in range(W.shape[0]):
        priorNum -= memberships[d, :]
        prior[:] = priorNum
        prior /= priorNum.sum()

        sample_dists[d, :] += safe_log(prior)
        sample_dists[d, :] -= sample_dists[d, :].max()
        sample_dists[d, :] -= fns.logsumexp(sample_dists[d, :])

        np.exp(sample_dists[d, :], out=sample_dists[d, :])
        memberships[d, :] = rd.multinomial(1, sample_dists[d, :], size=1)

        priorNum += memberships[d, :]

    return memberships
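The sampler above treats each row of memberships as a one-hot topic indicator (rd.multinomial(1, ...) returns exactly one 1 per row), so the matrix it is given should presumably start out the same way. A minimal sketch of such an initialisation, with illustrative sizes:

import numpy as np
import numpy.random as rd

D, K = 5, 4
memberships_init = rd.multinomial(1, np.full(K, 1.0 / K), size=D).astype(np.float64)
print(memberships_init.sum(axis=1))   # every row sums to 1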
Example no. 7
0
def iterate (iterations, D, K, T, \
             W_list, docLens, \
             topicPrior, vocabPrior, \
             z_dnk, topicDists, wordDists):
    
    raise ValueError("This implementation is no longer supported")
    totalItrs = 0
    epsilon = 0.01 / K
    oldWordDists = np.empty(wordDists.shape, wordDists.dtype)
    newWordDists = wordDists
    
    
    for _ in range(iterations):
        oldWordDists, newWordDists = newWordDists, oldWordDists
        lnWordDists = safe_log(oldWordDists, out=oldWordDists)
        newWordDists.fill (vocabPrior)
        
        for d in range(D):
            oldTopics = topicDists[d,:].copy()
            topicDists[d,:]= 1./ K
            lnWordProbs = lnWordDists[:,W_list[d,0:docLens[d]]]
            
            innerItrs = 0
            while ((innerItrs < MaxInnerItrs) or (np.sum(np.abs(oldTopics - topicDists[d,:])) > epsilon)) \
            and (innerItrs < MaxInnerItrs):
                diTopic     = fns.digamma(topicDists[d,:])
                z_dnk[:docLens[d],:] = lnWordProbs.T + diTopic[np.newaxis,:]
                
                # We've been working in log-space till now, before we go to true
                # probability space rescale so we don't underflow everywhere
                maxes  = z_dnk.max(axis=1)
                z_dnk -= maxes[:,np.newaxis]
                np.exp(z_dnk, out=z_dnk)
                
                # Now normalize so probabilities sum to one
                sums   = z_dnk.sum(axis=1)
                z_dnk /= sums[:,np.newaxis]            # Update vocabulary: hard to do with a list representation

                # Now use it to infer the topic distribution
                topicDists[d,:] = topicPrior + np.sum(z_dnk[:docLens[d],:], axis=0)
                topicDists[d,:] /= np.sum(topicDists[d,:])
                
                innerItrs += 1
            
            totalItrs += innerItrs
            for k in range(K):
                for n in range(docLens[d]):
                    newWordDists[k,W_list[d,n]] += z_dnk[n,k]
            newWordDists /= newWordDists.sum(axis=1)[:,np.newaxis]
        
    return totalItrs
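The rescaling comment inside the loop above is the standard max-subtraction trick. In isolation: shifting log-scores by their row maximum before exponentiating avoids underflowing to an all-zero row, while leaving the normalised probabilities unchanged.

import numpy as np

log_scores = np.array([[-1000.0, -1001.0, -1002.0]])
naive   = np.exp(log_scores)                                        # underflows to zeros
shifted = np.exp(log_scores - log_scores.max(axis=1)[:, np.newaxis])
print(naive.sum())                                   # 0.0
print(shifted / shifted.sum(axis=1)[:, np.newaxis])  # well-defined probabilities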
Example no. 8
0
def varBound(modelState, queryState, X, W, lnVocab=None, XAT=None, XTX=None, scaledWordCounts=None, VTV=None, UTU=None):
    #
    # TODO Standardise hyperparameter handling so we can eliminate this copy and paste
    #

    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, _, alphaSq, kappaSq, tauSq = (
        modelState.K,
        modelState.Q,
        modelState.F,
        modelState.P,
        modelState.T,
        modelState.A,
        modelState.varA,
        modelState.Y,
        modelState.omY,
        modelState.sigY,
        modelState.sigT,
        modelState.U,
        modelState.V,
        modelState.vocab,
        modelState.topicVar,
        modelState.featVar,
        modelState.lowTopicVar,
        modelState.lowFeatVar,
    )
    (expLmda, nu, lxi, s, docLen) = (queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    lmda = np.log(expLmda)
    isigT = la.inv(sigT)
    lnDetSigT = log(la.det(sigT))
    sigmaSq = 1  # A bit of a hack till hyperparameter handling is standardised

    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError(
            "The shape of the DxT document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)"
            % (T, D, Tcheck)
        )

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError(
            "The shape of the DxF feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)"
            % (F, Dcheck, Fcheck)
        )

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi(lmda, nu, s)

    # If not already provided, we'll also need the following products
    #
    if XAT is None:
        XAT = X.dot(A.T)
    if XTX is None:
        XTX = X.T.dot(X)
    if V is not None and VTV is None:
        VTV = V.T.dot(V)
    if U is not None and UTU is None:
        UTU = U.T.dot(U)

    # also need one over the usual variances
    overSsq, overAsq, overKsq, overTsq = 1.0 / sigmaSq, 1.0 / alphaSq, 1.0 / kappaSq, 1.0 / tauSq
    overTkSq = overTsq * overKsq
    overAsSq = overAsq * overSsq

    # <ln p(Y)>
    #
    trSigY = 1 if sigY is None else np.trace(sigY)
    trOmY = K  # Basically it's the trace of the identity matrix as the posterior and prior cancel out
    lnP_Y = -0.5 * (
        Q * P * LOG_2PI + P * lnDetSigT + overTkSq * trSigY * trOmY + overTkSq * np.trace(isigT.dot(Y).dot(Y.T))
    )

    # <ln P(A|Y)>
    # TODO it looks like I should take the trace of omA \otimes I_K here.
    # TODO Need to check re-arranging sigY and omY is sensible.
    halfKF = 0.5 * K * F

    # Horrible, but varBound can be called by two implementations: one with Y as a matrix-variate,
    # where sigY is QxQ, and one with Y as a multi-variate, where sigY is QPxQP.
    A_from_Y = Y.dot(U.T) if V is None else U.dot(Y).dot(V.T)
    A_diff = A - A_from_Y
    varFactorU = np.trace(sigY.dot(np.kron(VTV, UTU))) if sigY.shape[0] == Q * P else np.sum(sigY * UTU)
    varFactorV = 1 if V is None else np.sum(omY * V.T.dot(V))
    lnP_A = (
        -halfKF * LOG_2PI
        - halfKF * log(alphaSq)
        - F / 2.0 * lnDetSigT
        - 0.5 * (overAsSq * varFactorV * varFactorU + np.trace(XTX.dot(varA)) * K + np.sum(isigT.dot(A_diff) * A_diff))
    )

    # <ln p(Theta|A,X)
    #
    lmdaDiff = lmda - XAT
    lnP_Theta = (
        -0.5 * D * LOG_2PI
        - 0.5 * D * lnDetSigT
        - 0.5 / sigmaSq * (np.sum(nu) + D * K * np.sum(XTX * varA) + np.sum(lmdaDiff.dot(isigT) * lmdaDiff))
    )
    # Why is order of sigT reversed? It's 'cause we've not been consistent. A is KxF but lmda is DxK, and
    # note that the distribution of lmda transpose has the same covariances, just in different positions
    #  (i.e. row is col and vice-versa)

    # <ln p(Z|Theta)
    #
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts)

    lnP_Z = 0.0
    lnP_Z -= np.sum(docLenLmdaLxi * lmda)
    lnP_Z -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnP_Z += 2 * np.sum(s[:, np.newaxis] * docLenLmdaLxi)
    lnP_Z -= 0.5 * np.sum(docLen[:, np.newaxis] * lmda)
    lnP_Z += np.sum(
        lmda * expLmda * (scaledWordCounts.dot(vocab.T))
    )  # n(d,k) = expLmda * (scaledWordCounts.dot(vocab.T))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * lxi * ((s ** 2)[:, np.newaxis] - xi ** 2))
    lnP_Z += 0.5 * np.sum(docLen[:, np.newaxis] * (s[:, np.newaxis] + xi))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * safe_log_one_plus_exp_of(xi))
    lnP_Z -= np.sum(docLen * s)

    # <ln p(W|Z, vocab)>
    #
    lnP_w_dt = sparseScalarProductOfDot(scaledWordCounts, expLmda, vocab * safe_log(vocab))
    lnP_W = np.sum(lnP_w_dt.data)

    # H[q(Y)]
    lnDetOmY = 0 if omY is None else log(la.det(omY))
    lnDetSigY = 0 if sigY is None else log(max(la.det(sigY), sys.float_info.min))  # TODO FIX THIS
    ent_Y = 0.5 * (P * K * LOG_2PI_E + Q * lnDetOmY + P * lnDetSigY)

    # H[q(A|Y)]
    #
    # A few things - omA is fixed so long as tau and sigma are, so there's no benefit in
    # recalculating this every time.
    #
    # However in a recent test, la.det(omA) = 0
    # this is very strange as omA is the inverse of (s*I + t*XTX)
    #
    #    ent_A = 0.5 * (F * K * LOG_2PI_E + K * log (la.det(omA)) + F * K * log (tau2))\
    ent_A = 0

    # H[q(Theta|A)]
    ent_Theta = 0.5 * (K * LOG_2PI_E + np.sum(np.log(nu * nu)))

    # H[q(Z|\Theta)
    #
    # So Z_dtk \propto expLmda_dt * vocab_tk. We let N here be the normalizer (which is
    # \sum_j expLmda_dt * vocab_tj, which implies N is DxT. We need to evaluate
    # Z_dtk * log Z_dtk. We can pull out the normalizer of the first term, but it has
    # to stay in the log Z_dtk expression, hence the third term in the sum. We can however
    # take advantage of the ability to mix dot and element-wise products for the different
    # components of Z_dtk in that three-term sum, which we denote as S
    #   Finally we use np.sum to sum over d and t
    #
    ent_Z = 0  # entropyOfDot(expLmda, vocab)

    result = lnP_Y + lnP_A + lnP_Theta + lnP_Z + lnP_W + ent_Y + ent_A + ent_Theta + ent_Z

    return result
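For readability, the <ln p(Y)> term exactly as the code above assembles it, in LaTeX (Σ_T, Σ_Y, Ω_Y stand for sigT, sigY and omY, and τ², κ² for tauSq and kappaSq; tr(Ω_Y) is taken to be K, as per the comment):

\langle \ln p(Y) \rangle = -\tfrac{1}{2}\Bigl( QP\,\ln 2\pi \;+\; P\,\ln\lvert\Sigma_T\rvert
    \;+\; \tfrac{1}{\tau^2\kappa^2}\,\operatorname{tr}(\Sigma_Y)\,\operatorname{tr}(\Omega_Y)
    \;+\; \tfrac{1}{\tau^2\kappa^2}\,\operatorname{tr}\bigl(\Sigma_T^{-1} Y Y^{\top}\bigr) \Bigr)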
Example no. 9
0
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but are
    reset afterwards to their initial values, so it is safe to call
    serially.
    '''
    
    # Unpack the structs, for ease of access and efficiency
    W, L, X  = data.words, data.links, data.feats
    D,_ = W.shape
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    # Calculate some implicit  variables
    itopicCov = la.inv(topicCov)
    
    bound = 0

    expMeansOut = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
    expMeansIn  = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :])
    lse_at_k    = expMeansIn.sum(axis=0)

    # Distribution over document topics
    bound -= (D*K)/2. * LN_OF_2_PI
    bound -= D/2. * safe_log_det(outDocCov * topicCov)
    diff   = outMeans - topicMean[np.newaxis,:]
    bound -= 0.5 * np.sum (diff.dot(itopicCov) * diff * 1./outDocCov)
    bound -= (0.5 / outDocCov) * np.sum(outVarcs * np.diag(itopicCov)[np.newaxis,:]) # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.log(outVarcs).sum()

    # Distribution over document in-links
    inDocPre = np.reciprocal(inDocCov)
    bound -= (D*K)/2. * LN_OF_2_PI
    bound -= D/2. * safe_log_det(topicCov)
    bound -= K/2 * safe_log(inDocCov).sum()
    diff   = inMeans - outMeans
    bound -= 0.5 * np.sum (diff.dot(itopicCov) * diff * inDocPre[:,np.newaxis])
    bound -= 0.5 * np.sum((inVarcs * inDocPre[:,np.newaxis]) * np.diag(itopicCov)[np.newaxis,:]) # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.log(inVarcs).sum()

    # Distribution over topic assignments E[p(Z)] and E[p(Y)]
    W_weights  = sparseScalarQuotientOfDot(W, expMeansOut, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    top_sums   = expMeansOut * (W_weights.dot(vocab.T)) # D x K

    L_weights  = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    top_sums  += expMeansOut * (L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :])

    # E[p(Z,Y)]
    linkLens = np.squeeze(np.array(L.sum(axis=1)))
    bound += np.sum(outMeans * top_sums)
    bound -= np.sum((docLens + linkLens) * np.log(np.sum(expMeansOut, axis=1)))

    # H[Z]
    bound += ((W_weights.dot(vocab.T)) * expMeansOut * outMeans).sum() \
           + ((W_weights.dot((np.log(vocab) * vocab).T)) * expMeansOut).sum() \
           - np.trace(sparseScalarProductOfSafeLnDot(W_weights, expMeansOut, vocab).dot(vocab.T).dot(expMeansOut.T))

    # H[Y]
    docVocab = (expMeansIn / lse_at_k[np.newaxis,:]).T.copy()
    bound += ((L_weights.dot(docVocab.T)) * expMeansOut * outMeans).sum() \
           + ((L_weights.dot((np.log(docVocab) * docVocab).T)) * expMeansOut).sum() \
           - np.trace(sparseScalarProductOfSafeLnDot(L_weights, expMeansOut, docVocab).dot(docVocab.T).dot(expMeansOut.T))

    # E[p(W)]
    vlv = np.log(vocab) * vocab
    bound += np.trace(expMeansOut.T.dot(W_weights.dot(vlv.T)))

    # E[p(L)
    dld = np.log(docVocab) * docVocab
    bound += np.trace(expMeansOut.T.dot(L_weights.dot(dld.T)))

    return bound
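A quick numerical check of the identity noted in the two comments above: when the per-document covariance V_d is diagonal, sum_d tr(V_d Sigma^-1) reduces to an element-wise product with the diagonal of Sigma^-1 (the names and sizes here are illustrative):

import numpy as np

rng = np.random.default_rng(0)
D, K = 4, 3
varcs  = rng.random((D, K))          # diagonal entries of each V_d
iSigma = rng.random((K, K))          # stands in for the inverse topic covariance
lhs = sum(np.trace(np.diag(varcs[d]).dot(iSigma)) for d in range(D))
rhs = np.sum(varcs * np.diag(iSigma)[np.newaxis, :])
print(np.allclose(lhs, rhs))         # True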
Example no. 10
0
def var_bound(data, modelState, queryState, z_dnk = None):
    '''
    Determines the variational bounds.
    '''
    # Unpack the structs, for ease of access and efficiency
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, _, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W   = data.words
    D,T = W.shape
    maxN = docLens.max()
    if z_dnk is None:
        z_dnk = np.empty(shape=(maxN, K), dtype=dtype)
    
    wordDistsMatrix = wordDists(modelState)
        
    diWordDists = fns.digamma(wordDistsMatrix.copy()) - fns.digamma(wordDistsMatrix.sum(axis=1))[:,np.newaxis]
    lnWordDists = np.log(wordDistsMatrix)
   
    bound = 0
    
    # Expected Probability
    #
    
    # P(topics|topicPrior)
    diTopicDists = fns.digamma(topicDists) - fns.digamma(topicDists.sum(axis=1))[:,np.newaxis]
    ln_b_topic = fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()
    bound += D * ln_b_topic \
           + np.sum((topicPrior - 1) * diTopicDists)
    
    # and its entropy
    ent = fns.gammaln(topicDists.sum(axis=1)).sum() - fns.gammaln(topicDists).sum() \
        + np.sum ((topicDists - 1) * diTopicDists)
    
    bound -= ent
    
    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    # NOTE COPY AND PASTED FROM iterate_f32  / iterate_f64 (-ish)
    for d in range(D):
        lnWordProbs = lnWordDists[:,W_list[d,0:docLens[d]]]
        diTopic     = fns.digamma(topicDists[d,:])
        z_dnk[0:docLens[d],:] = lnWordProbs.T + diTopic[np.newaxis,:]
        
        # We've been working in log-space till now, before we go to true
        # probability space rescale so we don't underflow everywhere
        maxes  = z_dnk.max(axis=1)
        z_dnk -= maxes[:,np.newaxis]
        np.exp(z_dnk, out=z_dnk)
        
        # Now normalize so probabilities sum to one
        sums   = z_dnk.sum(axis=1)
        z_dnk /= sums[:,np.newaxis]
#        z_dnk[docLens[d]:maxN,:] = 0 # zero probablities for words that don't exist
        
        # Now use to calculate  E[ln p(Z|topics), E[ln p(W|Z) and H[Z] in that order
        diTopic -= fns.digamma(np.sum(topicDists[d,:]))
        bound += np.sum(z_dnk * diTopic[np.newaxis,:])
        bound += np.sum(z_dnk[0:docLens[d],:].T * diWordDists[:,W_list[d,0:docLens[d]]])
        bound -= np.sum(z_dnk[0:docLens[d],:] * safe_log(z_dnk[0:docLens[d],:]))
        
    # p(vocabDists|vocabPrior)
    
    ln_b_vocab = fns.gammaln(T * vocabPrior) - T * fns.gammaln(vocabPrior)
    bound += K * ln_b_vocab \
           + (vocabPrior - 1) * np.sum(diWordDists)
    
    # and its entropy
    ent = fns.gammaln(wordDistsMatrix.sum(axis=1)).sum() - fns.gammaln(wordDistsMatrix).sum() \
        + np.sum ((wordDistsMatrix - 1) * diWordDists)
    
    bound -= ent   
    
    return bound
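The P(topics|topicPrior) block above is the standard expected log-density of a Dirichlet variable under a Dirichlet variational posterior. Written out for a single document d, with γ_d = topicDists[d, :] and α = topicPrior:

\mathbb{E}_q[\ln p(\theta_d \mid \alpha)] = \ln\Gamma\Bigl(\sum_k \alpha_k\Bigr) - \sum_k \ln\Gamma(\alpha_k)
    + \sum_k (\alpha_k - 1)\,\mathbb{E}_q[\ln\theta_{dk}],
\qquad \mathbb{E}_q[\ln\theta_{dk}] = \psi(\gamma_{dk}) - \psi\Bigl(\sum_j \gamma_{dj}\Bigr)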
    def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
        print("Fully handcrafted example, K=Q")
        rd.seed(0xC0FFEE)  # Global init for repeatable test

        T = 100  # Vocabulary size, the number of "terms". Must be a square number
        Q = 6  # Topics: This cannot be changed without changing the code that generates the vocabulary
        K = 6  # Observed topics
        P = 8  # Features
        F = 12  # Observed features
        D = 200  # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # The vocabulary. Presented graphically there are two with horizontal bands
        # (upper lower); two with vertical bands (left, right);  and two with
        # horizontal bands (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X, then our topic proportions ("tpcs")
        # then our word counts W
        lmda = np.zeros((D, K))
        X = np.zeros((D, F))
        for d in range(D):
            for _ in range(3):
                lmda[d, rd.randint(K)] += 1. / 3
            for _ in range(int(F / 3)):
                X[d, rd.randint(F)] += 1

        A = rd.random((K, F))
        X = lmda.dot(la.pinv(A).T)
        X = ssp.csr_matrix(X)

        tpcs = lmda

        docLens = rd.poisson(avgWordsPerDoc, (D, ))
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, Q, F, P, T)

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           iterations=1)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)
        priorReconsError = np.sum(np.square(W - W_inf)) / D

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=100,
                                           iterations=130)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Model Driven: Prior Reconstruction Error: %f" %
              (priorReconsError, ))
        print("Model Driven: Final Reconstruction Error: %f" %
              (np.sum(np.square(W - W_inf)) / D, ))

        print("End of Test")
Example no. 12
0
def var_bound(data, model, query, z_dnk = None):
    '''
    Determines the variational bounds.
    '''
    bound = 0
    
    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.pseudoNegCount, model.regularizer, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    W,X = data.words, data.links
    D,T = W.shape
    minNonZero = 1E-300 if dtype is np.float64 else 1E-30
        
    # Perform the digamma transform for E[ln \theta] etc.
    topicDists      = topicDists.copy()
    diTopicDists    = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists     = fns.digamma(model.wordDists)
    diSumWordDists  = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
        + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics
        
    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs  = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
               + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:,np.newaxis] ))
    else:
        prob_vocabs  = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
               + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:,np.newaxis] ))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z     = 0
    ent_z      = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, weights, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))
        
        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    # Next, the distribution over links - we just focus on the positives in this case
    for d in range(D):
        links   = _links_up_to(d, X)
        if len(links) == 0:
            continue

        scores  = topicMeans[links, :].dot(weights * topicMeans[d])
        probs   = _probit_inplace(scores) + minNonZero
        lnProbs = np.log(probs, out=probs)

        # expected probability of all links from d to p < d such that y_dp = 1
        bound += lnProbs.sum()

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
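_probit_inplace above is assumed to push the link scores through the standard normal CDF (the usual probit link), overwriting its argument. A minimal sketch of that assumption:

import numpy as np
from scipy.stats import norm

def probit_inplace_sketch(scores):
    # Map raw scores to probabilities in [0, 1] via the standard normal CDF.
    scores[:] = norm.cdf(scores)
    return scores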
Example no. 13
0
def var_bound(data, model, query, z_dnk = None):
    '''
    Determines the variational bounds.
    '''
    bound = 0
    
    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    # Initialize z matrix if necessary
    W,X = data.words, data.links
    D,T = W.shape
        
    # Perform the digamma transform for E[ln \theta] etc.
    topicDists      = topicDists.copy()
    diTopicDists    = fns.digamma(topicDists)
    diSumTopicDists = fns.digamma(topicDists.sum(axis=1))
    diWordDists     = fns.digamma(model.wordDists)
    diSumWordDists  = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()) \
        + np.sum((topicPrior - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists)
    bound += ent_topics
        
    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
               + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis] ))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
               + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:, np.newaxis] ))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z     = 0
    ent_z      = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))
        
        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
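The _dirichletEntropy helper used in the two bounds above is presumably the closed-form entropy of a Dirichlet distribution; for reference, with γ_0 = Σ_k γ_k:

\mathrm{H}[\mathrm{Dir}(\gamma)] = \sum_{k=1}^{K} \ln\Gamma(\gamma_k) - \ln\Gamma(\gamma_0)
    + (\gamma_0 - K)\,\psi(\gamma_0) - \sum_{k=1}^{K} (\gamma_k - 1)\,\psi(\gamma_k)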
Example no. 14
0
def var_bound(data, model, query, z_dnk = None):
    '''
    Determines the variational bounds.
    '''
    bound = 0
    
    # Unpack the structs, for ease of access and efficiency
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
        model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    W, L = data.words, data.links
    D, T = W.shape
    bound = 0

    # Pre-calculate some repeated expressions
    logVagueness = log(Vagueness)
    halfDQ, halfQK, halfDK = 0.5*D*Q, 0.5*Q*K, 0.5*D*K
    logTwoPi  = log(2 * pi)
    logTwoPiE = log(2 * pi * e)

    # # E[ln p(U)]
    # bound += -halfDQ * logTwoPi - D * Q * logVagueness - 0.5 * np.sum(U * U) # trace of U U'
    #
    # # H[q(U)]
    # bound += -halfDQ * logTwoPiE - D * Q * logVagueness
    #
    # # E[ln p(V)]
    # bound += -halfQK * logTwoPi - Q * K * logVagueness - 0.5 * np.sum(V * V) # trace of U U'
    #
    # # H[q(V)]
    # bound += -halfQK * logTwoPiE - D * Q * logVagueness

    # ln p(Topics|U, V)
    logDetCov = log(la.det(topicCov))
    kernel = topics.copy()
    kernel -= U.dot(V)
    kernel **= 2
    kernel[:] = kernel.dot(topicCov)
    kernel /= (2 * Vagueness)
    bound += -halfDK * logTwoPi - halfDK * logVagueness \
             -D * 0.5 * logDetCov \
             -np.sum(kernel) \
             -np.sum(postTopicCov)
    # FIXME bound here is squiffy

    # H[q(topics)]
    bound += -halfDK * logTwoPiE - halfDK * logVagueness - D * 0.5 * logDetCov

    # We'll need these for the next steps
    diWordDists     = fns.digamma(wordDists)
    diWordDistSums  = fns.digamma(wordDists.sum(axis=1))

    # P(z|topic) and P(y|topic) are not stored explicitly, so we need to
    # recalculate here to calculate their expected log-probs and entropies.
    prob_words, prob_links = 0, 0
    prob_z, ent_z = 0, 0
    prob_y, ent_y   = 0, 0
    for d in range(D):
        # First the word-topic assignments, note this is a KxV matrix
        wordIdx, z = _infer_word_topics_at_d(d, W, topics, diWordDists, diWordDistSums)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        prob_z += topics[d, :].dot(z * W[d, :].data[np.newaxis, :]).sum()
        prob_z -= docLens[d] * lse(topics[d, :])

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diWordDistSums[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

        # Next the link-topic assignments, note this is a PxK matrix
        linkIdx, y = _infer_link_topics_at_d(d, L, topics, lse_at_k)

        # Here we _start_ with the entropy of y
        ent_y -= np.dot(L[d, :].data, y * safe_log(y)).sum()

        # E[ln p(Y|topics) = sum_d sum_m sum_k E[y_dmk] E[ln topicDist_dk]
        y *= L[d, :].data[:, np.newaxis]
        prob_y += y.dot(topics[d, :].T).sum()
        prob_y -= out_counts[d] * lse(topics[d, :])

        # E[ln p(L|Y)] = sum_d sum_m sum_k sum_t E[y_dmk] l_dmp E[ln topics_pk]
        prob_links += y.dot(topics[linkIdx, :].T).sum()
        prob_links -= y.dot(lse_at_k).sum()


    bound += (prob_z + ent_z + prob_words)
    bound += (prob_y + ent_y + prob_links)

    return bound
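In this model topics[d, :] is kept as a vector of unnormalised log-scores, so the plug-in estimate of the log topic probability that prob_z and prob_y accumulate per assignment is simply:

\ln p(z_{dn}{=}k \mid \theta_d) \approx \theta_{dk} - \ln\sum_j \exp(\theta_{dj})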
Example no. 15
0
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data; we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    W = data.words

    iters, bnds, likes = [], [], []

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    for itr in range(iterations):
        # E-Step
        safe_log(wordDists, out=wordDists)
        safe_log(corpusTopicDist, out=corpusTopicDist)

        topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
        #topicDists -= topicDists.max(axis=1)[:, np.newaxis] # TODO Ensure this is okay
        norms = fns.logsumexp(topicDists, axis=1)
        topicDists -= norms[:, np.newaxis]

        np.exp(topicDists, out=topicDists)

        # M-Step
        wordDists = (W.T.dot(topicDists)).T
        wordDists += vocabPrior
        wordDists /= wordDists.sum(axis=1)[:, np.newaxis]

        corpusTopicDist     = topicDists.sum(axis=0)
        corpusTopicDist[:] += topicPrior
        corpusTopicDist    /= corpusTopicDist.sum()

        if itr % logFrequency == 0 or debug:
            m = ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

            if len(iters) > 2 and iters[-1] > 50:
                lastPerp = perplexity_from_like(likes[-2], W.sum())
                if lastPerp - perp < 1:
                    break

    return ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
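perplexity_from_like in the logging step above is assumed to be the usual corpus perplexity, i.e. the exponentiated negative per-token log-likelihood. A sketch of that assumption:

import numpy as np

def perplexity_from_like_sketch(log_likelihood, token_count):
    # exp(-L/N): lower is better; a uniform model scores the vocabulary size.
    return np.exp(-log_likelihood / token_count)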