def var_bound(data, modelState, queryState):
    """
    Determines the variational bound. Values are mutated in place, but are
    reset afterwards to their initial values, so it is safe to call repeatedly.
    """

    # Unpack the structs for ease of access and efficiency
    W, X = data.words, data.feats
    D, T, F = W.shape[0], W.shape[1], X.shape[1]
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = (
        modelState.K,
        modelState.A,
        modelState.U,
        modelState.Y,
        modelState.V,
        modelState.covA,
        modelState.tv,
        modelState.ltv,
        modelState.fv,
        modelState.lfv,
        modelState.vocab,
        modelState.vocabPrior,
        modelState.dtype,
    )

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)
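    # H is half the K-dimensional centering matrix, 0.5 * (I - 11'/K); it plays the
    # same role here as the Ab / A matrices in the later var_bound variants, appearing
    # in the quadratic terms of the word-topic assignment bound below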
    Log2Pi = log(2 * pi)

    bound = 0

    # U and V are parameters with no distribution

    #
    # Y has a normal distribution; its covariance is unfortunately an expensive computation
    #
    P, Q = U.shape[1], V.shape[1]
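    # covY = (lfv*ltv * I  +  V'V (x) U'U)^{-1} is a (P*Q) x (P*Q) matrix, which is
    # what makes this computation expensive when P and Q are large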
    covY = np.eye(P * Q) * (lfv * ltv)
    covY += np.kron(V.T.dot(V), U.T.dot(U))
    covY = la.inv(covY, overwrite_a=True)

    # The expected likelihood of Y
    bound -= 0.5 * P * Q * Log2Pi
    bound -= 0.5 * P * Q * log(ltv * lfv)
    bound -= 0.5 / (lfv * ltv) * np.sum(Y * Y)  # 5x faster than np.trace(Y.dot(Y.T))
    bound -= 0.5 * np.trace(covY) * (lfv * ltv)
    # the traces of the posterior+prior covariance products cancel out across likelihoods

    # The entropy of Y
    bound += 0.5 * P * Q * (Log2Pi + 1) + 0.5 * safe_log_det(covY)

    #
    # A has a normal distribution
    #
    F, K = A.shape[0], A.shape[1]
    diff = A - U.dot(Y).dot(V.T)
    diff *= diff

    # The expected likelihood of A
    bound -= 0.5 * K * F * Log2Pi
    bound -= 0.5 * K * F * log(tv * fv)
    bound -= 0.5 / (fv * tv) * np.sum(diff)

    # The entropy of A
    bound += 0.5 * F * K * (Log2Pi + 1) + 0.5 * K * safe_log_det(covA)

    #
    # Theta, the matrix of means, has a normal distribution. Its row-covariance is
    # diagonal (i.e. it's several independent multivariate normals). The posterior is
    # made up of D K-dimensional normals with diagonal covariances
    #
    # We iterate through the documents in batches, to control memory use
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
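    # feats and tops are scratch buffers re-used across batches; trace accumulates
    # sum_d || X[d].dot(A) - means[d] ||^2 over all documents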
    feats = np.ndarray(shape=(batchSize, F), dtype=dtype)
    tops = np.ndarray(shape=(batchSize, K), dtype=dtype)
    trace = 0
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        feats[:batchSize, :] = X[start:end, :].toarray()
        np.dot(feats[:batchSize, :], A, out=tops[:batchSize, :])
        tops[:batchSize, :] -= means[start:end, :]
        tops[:batchSize, :] *= tops[:batchSize, :]
        trace += np.sum(tops[:batchSize, :])
    feats = None

    # The expected likelihood of the topic-assignments
    bound -= 0.5 * D * K * Log2Pi
    bound -= 0.5 * D * K * log(tv)
    bound -= 0.5 / tv * trace

    # this trace doesn't cancel as we don't have a posterior on tv
    bound -= 0.5 * tv * np.sum(covA)
    # The entropy of the topic-assignments
    bound += 0.5 * D * K * (Log2Pi + 1) + 0.5 * np.sum(covA)

    # Distribution over word-topic assignments and words, and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means.
    # Again we batch this to keep memory use in check
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
    V = np.ndarray(shape=(batchSize, K), dtype=dtype)
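    # note that V is re-used here as a dense scratch buffer; the model parameter V
    # unpacked above is not needed beyond this point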
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        meansBatch = means[start:end, :]
        docLensBatch = docLens[start:end]

        np.exp(meansBatch - meansBatch.max(axis=1)[:, np.newaxis], out=tops[:batchSize, :])
        expMeansBatch = tops[:batchSize, :]
        R = sparseScalarQuotientOfDot(
            W, expMeansBatch, vocab, start=start, end=end
        )  # BatchSize x V:   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
        V[:batchSize, :] = expMeansBatch * (R[:batchSize, :].dot(vocab.T))  # BatchSize x K
        VBatch = V[:batchSize, :]

        bound += np.sum(docLensBatch * np.log(np.sum(expMeansBatch, axis=1)))
        bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansBatch, vocab, start=start, end=end).data)

        bound += np.sum(meansBatch * VBatch)
        bound += np.sum(2 * ssp.diags(docLensBatch, 0) * meansBatch.dot(H) * meansBatch)
        bound -= 2.0 * scaledSelfSoftDot(meansBatch, docLensBatch)
        bound -= 0.5 * np.sum(docLensBatch[:, np.newaxis] * VBatch * (np.diag(H))[np.newaxis, :])

        bound -= np.sum(meansBatch * VBatch)

    return bound
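
# A minimal usage sketch, assuming a hypothetical `train_step` callable (not part of
# this module) that performs one variational update and returns the updated states.
# Because var_bound() resets anything it mutates, it is safe to call it after every
# update to check that the bound is (approximately) non-decreasing.
def _monitor_bound(data, modelState, queryState, train_step, maxIters=100, tol=1e-6):
    curr = prev = -np.inf
    for _ in range(maxIters):
        modelState, queryState = train_step(data, modelState, queryState)
        curr = var_bound(data, modelState, queryState)
        if curr < prev - 1e-3:
            # the variational bound should never fall; a drop usually signals a bug
            raise RuntimeError("bound decreased: %g -> %g" % (prev, curr))
        if abs(curr - prev) < tol:
            break
        prev = curr
    return modelState, queryState, curr
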
def var_bound(data, modelState, queryState, XTX=None):
    '''
    Determines the variational bound. Values are mutated in place, but are
    reset afterwards to their initial values, so it is safe to call repeatedly
    in a serial manner.
    '''
    
    # Unpack the structs for ease of access and efficiency
    W, X = data.words, data.feats
    D, _ = W.shape
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, Ab, dtype = (
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A,
        modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V,
        modelState.sigT, modelState.vocab, modelState.Ab, modelState.dtype)
    
    # Calculate some implicit variables
    isigT = la.inv(sigT)
    lnDetSigT = lnDetOfDiagMat(sigT)
    verifyProper(lnDetSigT, "lnDetSigT")
    
    if XTX is None:
        XTX = X.T.dot(X)
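        # X.T.dot(X) is an F x F matrix; callers that already have it can pass a
        # precomputed XTX to skip this step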
    
    bound = 0
    
    # Distribution over latent space
    bound -= (P*K)/2. * LN_OF_2_PI
    bound -= P * lnDetSigT
    bound -= K * P * log(lfv)
    bound -= 0.5 * np.sum(1./lfv * isigT.dot(Y) * Y)
    bound -= 0.5 * K * np.trace(R_Y)
    
    # And its entropy
    detR_Y = safeDet(R_Y, "R_Y")
    bound += 0.5 * LN_OF_2_PI_E + P/2. * lnDetSigT + K/2. * log(detR_Y)
    
    # Distribution over mapping from features to topics
    diff   = (A - Y.dot(V))
    bound -= (F*K)/2. * LN_OF_2_PI
    bound -= F * lnDetSigT
    bound -= K * P * log(fv)
    bound -= 0.5 * np.sum (1./lfv * isigT.dot(diff) * diff)
    bound -= 0.5 * K * np.trace(R_A)
    
    # And its entropy
    detR_A = safeDet(R_A, "R_A")
    bound += 0.5 * LN_OF_2_PI_E + F/2. * lnDetSigT + K/2. * log(detR_A)
    
    # Distribution over document topics
    bound -= (D*K)/2. * LN_OF_2_PI
    bound -= D/2. * lnDetSigT
    diff   = means - X.dot(A.T)
    bound -= 0.5 * np.sum (diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis,:]) # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * K * np.trace(XTX.dot(R_A))
       
    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs)) 
        
    # Distribution over word-topic assignments, and their entropy
    # and distribution over words. This is re-arranged as we need 
    # means for some parts, and exp(means) for other parts
    expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    S = expMeans * (R.dot(vocab.T)) # D x K
    
    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound += np.sum(means * S)
    bound += np.sum(2 * ssp.diags(docLens,0) * means.dot(Ab) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:,np.newaxis] * S * (np.diag(Ab))[np.newaxis,:])
    
    bound -= np.sum(means * S) 
    
    return bound
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bound. Values are mutated in place, but are
    reset afterwards to their initial values, so it is safe to call repeatedly
    in a serial manner.
    '''
    
    # Unpack the structs for ease of access and efficiency
    W   = data.words
    D,_ = W.shape
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A
    
    # Calculate some implicit variables
    isigT = la.inv(sigT)
    
    bound = 0
    
    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(sigT)
        bound += 0.5 * NIW_PSI * np.trace(isigT)

        # and its entropy
        # is a constant which we skip
        
        # distribution over means
        bound -= 0.5 * K * log(1./pseudoObsMeans) * safe_log_det(sigT)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(isigT).dot(topicMean)
        
        # and its entropy
        bound += 0.5 * safe_log_det(sigT) # +  a constant
        
    
    # Distribution over document topics
    bound -= (D*K)/2. * LN_OF_2_PI
    bound -= D/2. * safe_log_det(sigT)
    diff   = means - topicMean[np.newaxis,:]
    bound -= 0.5 * np.sum (diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis,:]) # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
       
    # And its entropy
#     bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs)) 
    
    # Distribution over word-topic assignments and words, and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    V = expMeans * (R.dot(vocab.T)) # D x K
    
    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)
    
    bound += np.sum(means * V)
    bound += np.sum(2 * ssp.diags(docLens,0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:,np.newaxis] * V * (np.diag(A))[np.newaxis,:])
    
    bound -= np.sum(means * V) 
    
    
    return bound
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bound. Values are mutated in place, but are
    reset afterwards to their initial values, so it is safe to call repeatedly
    in a serial manner.
    '''
    
    # Unpack the structs for ease of access and efficiency
    W, L, X  = data.words, data.links, data.feats
    D,_ = W.shape
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A
    
    # Calculate some implicit variables
    itopicCov = la.inv(topicCov)
    
    bound = 0

    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn  = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k    = expMeansIn.sum(axis=0)
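    # expMeansOut: exp(means) with a row-wise (per-document) max shift, used for the
    #              word and out-link terms
    # expMeansIn:  exp(means) with a column-wise (per-topic) max shift, used for the
    #              in-link terms
    # lse_at_k:    the per-topic normaliser, i.e. the column sums of expMeansIn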
    
    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(topicCov)
        bound += 0.5 * NIW_PSI * np.trace(itopicCov)

        # and its entropy
        # is a constant which we skip
        
        # distribution over means
        bound -= 0.5 * K * log(1./pseudoObsMeans) * safe_log_det(topicCov)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(itopicCov).dot(topicMean)
        
        # and its entropy
        bound += 0.5 * safe_log_det(topicCov) # +  a constant
        
    
    # Distribution over document topics
    bound -= (D*K)/2. * LN_OF_2_PI
    bound -= D/2. * safe_log_det(topicCov)
    diff   = means - topicMean[np.newaxis,:]
    bound -= 0.5 * np.sum (diff.dot(itopicCov) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(itopicCov)[np.newaxis,:]) # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
       
    # And its entropy
#     bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs)) 


    # Distribution over word-topic assignments and words, and the former's
    # entropy, and similarly for out-links. This is somewhat jumbled to
    # avoid repeatedly taking the exp and log of the means
    W_weights  = sparseScalarQuotientOfDot(W, expMeansOut, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    w_top_sums = expMeansOut * (W_weights.dot(vocab.T)) # D x K

    L_weights  = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    l_top_sums = L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :] * expMeansOut
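    # l_top_sums mirrors w_top_sums for the link matrix L, with expMeansIn / lse_at_k
    # standing in for the vocabulary (it is computed here but not used in the terms below)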
    
    bound += np.sum(docLens * np.log(np.sum(expMeansOut, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansOut, vocab).data)
    # means = np.log(expMeans, out=expMeans)
    #means = safe_log(expMeansOut, out=means)
    
    bound += np.sum(means * w_top_sums)
    bound += np.sum(2 * ssp.diags(docLens,0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:,np.newaxis] * w_top_sums * (np.diag(A))[np.newaxis,:])
    
    bound -= np.sum(means * w_top_sums)
    
    
    return bound