Example #1
def _debug_with_bound (itr, var_value, var_name, W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n):
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    if var_value.dtype != dtype:
        printStderr ("WARNING: dtype(" + var_name + ") = " + str(var_value.dtype))
    
    old_bound = _debug_with_bound.old_bound
    bound     = var_bound(DataSet(W), ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), QueryState(means, means.copy(), varcs, n))
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound
    
    addendum = ""
    if var_name == "sigT":
        try:
            addendum = "det(sigT) = %g" % (la.det(sigT))
        except:
            addendum = "det(sigT) = <undefined>"
    
    if isnan(bound):
        printStderr ("Bound is NaN")
    elif int(bound - old_bound) < 0:
        printStderr ("Iter %3d Update %-15s Bound %22f (%15s)     %s" % (itr, var_name, bound, diff, addendum)) 
    else:
        print ("Iter %3d Update %-15s Bound %22f (%15s)     %s" % (itr, var_name, bound, diff, addendum)) 
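The running bound comparison above keeps state on the function object itself (`_debug_with_bound.old_bound`), so that attribute must be initialised before the first call (Example #8 does this with `_debug_with_bound.old_bound = 0`). A minimal, self-contained sketch of that pattern, using a hypothetical `running_delta` helper:

import numpy as np

def running_delta(value):
    # The previous value lives on the function object, mirroring how
    # _debug_with_bound keeps old_bound between calls.
    prev = running_delta.prev
    running_delta.prev = value
    return np.nan if np.isnan(prev) else value - prev

running_delta.prev = np.nan       # initialise before the first call
print(running_delta(3.0))         # nan  (no previous value yet)
print(running_delta(4.5))         # 1.5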
Example #2
def _debug_with_bound (itr, var_value, var_name, data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, n):
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    if "dtype" in dir(var_value) and var_value.dtype != dtype:
        printStderr ("WARNING: dtype(" + var_name + ") = " + str(var_value.dtype))

    model = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, False, dtype, MODEL_NAME)
    query = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, n)

    old_bound = _debug_with_bound.old_bound
    bound     = var_bound(data, model, query)
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound
    
    addendum = ""
    if var_name == "topicCov":
        try:
            addendum = "log det(topicCov) = %g" % (np.log(la.det(topicCov)))
        except:
            addendum = "log det(topicCov) = <undefined>"
    
    if isnan(bound):
        printStderr ("Bound is NaN")
    else:
        perp = perplexity_from_like(log_likelihood(data, model, query), data.word_count)
        if int(bound - old_bound) < 0:
            printStderr ("Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)     %s" % (itr, var_name, bound, diff, perp, addendum))
        else:
            print ("Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)  %s" % (itr, var_name, bound, diff, perp, addendum))
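Example #2 logs `np.log(la.det(topicCov))` inside a try/except because the determinant of a covariance matrix easily under- or overflows in floating point. A possible alternative, assuming only NumPy, is `np.linalg.slogdet`, which returns the sign and the log-determinant directly:

import numpy as np

cov = np.diag(np.full(50, 1e-8))          # det underflows to 0.0 in float64
sign, logdet = np.linalg.slogdet(cov)
print(np.log(np.linalg.det(cov)))         # -inf: det(cov) is numerically 0.0
print(sign, logdet)                       # 1.0, 50 * log(1e-8) ~ -921.0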
Example #3
def _debug_with_bound(
    itr, var_value, var_name, W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior
):
    if np.isnan(var_value).any():
        printStderr("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr("WARNING: " + var_name + " contains INFs")
    dtype = A.dtype

    old_bound = _debug_with_bound.old_bound
    data = DataSet(W, X)
    model = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, MODEL_NAME)
    query = QueryState(means, docLens)
    bound = var_bound(data, model, query)
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound

    addendum = ""

    perp = np.exp(-log_likelihood(data, model, query) / data.word_count)

    if isnan(bound):
        printStderr("Bound is NaN")
    elif int(bound - old_bound) < 0:
        printStderr(
            "Iter %3d Update %-15s Bound %22f (%15s) Perplexity %5.1f     %s"
            % (itr, var_name, bound, diff, perp, addendum)
        )
    else:
        print(
            "Iter %3d Update %-15s Bound %22f (%15s) Perplexity %5.1f     %s"
            % (itr, var_name, bound, diff, perp, addendum)
        )
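Example #3 computes perplexity inline as `np.exp(-log_likelihood(...) / data.word_count)`, whereas the other examples call `perplexity_from_like`. Assuming that helper implements the same formula (an assumption, since it is not shown in this listing), the two are interchangeable:

import numpy as np

# Hypothetical equivalent of perplexity_from_like, assumed to match the
# inline formula used in Example #3.
def perplexity_from_like(log_like, word_count):
    return np.exp(-log_like / word_count)

print(perplexity_from_like(-6907.755, 1000.0))   # ~1000: e^6.907755 == 1000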
Example #4
def _debug_with_bound (itr, var_value, var_name, W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n):
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    if var_value.dtype != dtype:
        printStderr ("WARNING: dtype(" + var_name + ") = " + str(var_value.dtype))
    
    modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
    queryState = QueryState(means, means.copy(), varcs, n)
    
    old_bound = _debug_with_bound.old_bound
    bound     = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
    likely    = log_likelihood(DataSet(W, feats=X), modelState, queryState)
    diff = "" if old_bound == 0 else "%11.2f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound
    
    if isnan(bound) or int(bound - old_bound) < 0:
        printStderr ("Iter %3d Update %-10s Bound %15.2f (%11s    ) Perplexity %4.2f" % (itr, var_name, bound, diff, np.exp(-likely/W.sum())))
    else:
        print ("Iter %3d Update %-10s Bound %15.2f (%11s) Perplexity %4.2f" % (itr, var_name, bound, diff, np.exp(-likely/W.sum())))
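All of these snippets assume helpers and aliases from the surrounding package, such as `printStderr` and an unqualified `isnan`. Minimal stand-ins for running them in isolation might look like the following (hypothetical, for illustration only):

import sys
from math import isnan        # the examples call isnan() unqualified

import numpy as np            # referenced as np
import scipy.linalg as la     # referenced as la (overwrite_a= suggests scipy.linalg)
import scipy.sparse as ssp    # referenced as ssp

def printStderr(msg):
    # Stand-in for the package's stderr logger.
    print(msg, file=sys.stderr)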
Example #5
def train(data, modelState, queryState, trainPlan):
    """
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset, supplying the DxT document-term matrix (words) and
           the DxF document-feature matrix (feats)
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep the original)
    A new query object with the updated query parameters
    """
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4	, 8	, 2	, 0	, 0,
    #     0	, 6	, 0	, 17, 0,
    #     12	, 13	, 1	, 7	, 8,
    #     0	, 5	, 0	, 0	, 0,
    #     0	, 6	, 0	, 0	, 44,
    #     0	, 7	, 2	, 0	, 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = (
        trainPlan.iterations,
        trainPlan.epsilon,
        trainPlan.logFrequency,
        trainPlan.fastButInaccurate,
        trainPlan.debug,
    )
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = (
        modelState.K,
        modelState.A,
        modelState.U,
        modelState.Y,
        modelState.V,
        modelState.covA,
        modelState.tv,
        modelState.ltv,
        modelState.fv,
        modelState.lfv,
        modelState.vocab,
        modelState.vocabPrior,
        modelState.dtype,
    )

    tp, fp, ltp, lfp = 1.0 / tv, 1.0 / fv, 1.0 / ltv, 1.0 / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) + X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA, overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
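        # This block solves (U.T U) Y (V.T V) + ltv*lfv * Y = U.T A V: in the
        # eigenbases of the two Gram matrices the equation becomes elementwise,
        # so each entry of M is the transformed right-hand side divided by
        # (eigenvalue of U.T U  *  eigenvalue of V.T V  +  ltv*lfv).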
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means

        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
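        # One K x K matrix inverse per unique document length: every document of
        # the same length shares the same lhs, so it is built and inverted once.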
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(lhs)  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK        = means.shape[0] * means.shape[1]
        # newTp     = np.sum(means)
        # newTp     = (-newTp * newTp)
        # rhs[:,:]  = means
        # rhs      *= means
        # newTp     = DK * np.sum(rhs) - newTp
        # newTp    /= DK * (DK - 1)
        # newTp     = min(max(newTp, 1E-36), 1E+36)
        # tp        = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime("%X")
                + " : Iteration %d: bound %f \t Perplexity: %.2f"
                % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))
            )
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if (
                    itr > 100
                    and len(likelyValues) > 3
                    and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum())
                        - perplexity_from_like(likelyValues[-2], docLens.sum())
                    )
                    < 1.0
                ):
                    break

    return (
        ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name),
        QueryState(means, expMeans, docLens),
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues)),
    )
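`sparseScalarQuotientOfDot` (exercised by the commented-out test near the top of this example) is a helper from the surrounding package whose implementation is not shown here. Judging by how it is used, it appears to divide the sparse matrix W elementwise by the dense product of the two factors, only at W's non-zero positions. A hypothetical reference version, for illustration only:

import numpy as np
import scipy.sparse as ssp

def sparse_scalar_quotient_of_dot(W, A, B):
    # Compute W_ij / (A @ B)_ij, but only where the sparse W is non-zero.
    coo = W.tocoo()
    denom = A.dot(B)
    data = coo.data / denom[coo.row, coo.col]
    return ssp.csr_matrix((data, (coo.row, coo.col)), shape=W.shape)

W = ssp.csr_matrix(np.array([[4., 0., 2.], [0., 6., 0.]]))
A = np.array([[1., 2.], [3., 1.]])
B = np.array([[1., 1., 2.], [2., 1., 1.]])
print(sparse_scalar_quotient_of_dot(W, A, B).toarray())
# [[0.8 0.  0.5]
#  [0.  1.5 0. ]]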
Example #6
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, links and features; this model uses the
           words and the links
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep the original)
    A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp   = perplexity_from_like(initLikely, data.word_count)
        print ("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight  = W.copy()
    L_weight  = L.copy()
    LT_weight = LT.copy()

    inDocCov,  inDocPre  = np.ones((D,)), np.ones((D,))

    # Interestingly, outDocCov trades off good perplexity fits
    # with good ranking fits. > 10 gives better perplexity and
    # worse ranking. At 10 both are good. Below 10 both get
    # worse. Below 0.5, convergence stalls after the first iter.
    outDocCov, outDocPre = 10, 1./10

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior over out-topics
        topicMean = outMeans.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        outDiff = outMeans - topicMean[np.newaxis, :]
        inDiff =  inMeans - outMeans

        for _ in range(5): # It typically takes three iterations for the three dependent covariances -
                           # outDocCov, inDocCov and topicCov - to become consistent w.r.t. each other
            topicCov  = (outDocPre * outDiff).T.dot(outDiff)
            topicCov += (inDocPre[:,np.newaxis] * inDiff).T.dot(inDiff)

            topicCov += np.diag(outVarcs.sum(axis=0))
            topicCov += np.diag(inVarcs.sum(axis=0))

            topicCov += IWISH_S_SCALE * np.eye(K)
            topicCov /= (2 * D + IWISH_DENOM)
            itopicCov = la.inv(topicCov)

            debugFn (itr, topicMean, "topicCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            diffSig   = inDiff.dot(itopicCov)
            diffSig  *= inDiff

            inDocCov  = diffSig.sum(axis=1)
            inDocCov += (outVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += (inVarcs  * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += IGAMMA_B
            inDocCov /= (IGAMMA_A - 1 + K)
            inDocPre  = np.reciprocal(inDocCov)

            debugFn (itr, inDocCov, "inDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            diffSig   = outDiff.dot(itopicCov)
            diffSig  *= outDiff
            # outDocCov = (IGAMMA_B + diffSig.sum() + (np.diagonal(itopicCov) * outVarcs).sum()) / (IGAMMA_A - 1 + (D * K))
            # outDocPre = 1./outDocCov

            debugFn (itr, outDocCov, "outDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)


        # Apply the exp function to get the (unnormalised) softmaxes in both directions.
        expMeansCol = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * inMeans \
          - (0.5/ D) * inMeans.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight   = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies

        vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T.copy() # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Now do likewise for the links, doing it twice to model in-counts (first)
        # and out-counts (second); the difference is the transpose.
        LT_weight    = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts    = l_intop_sums.sum(axis=0)

        L_weight     = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow


        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, outVarcs, "outVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        inVarcs = np.reciprocal(in_counts[np.newaxis,:] * (D-1)/(2*D) + inDocPre[:,np.newaxis] * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, inVarcs, "inVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Update the out-means and in-means
        out_rhs  = w_top_sums.copy()
        out_rhs += l_outtop_sums
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += inMeans.dot(itopicCov) / inDocCov[:,np.newaxis]
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        scaled_n_in = ((D-1.)/(2*D)) * ssp.diags(in_counts, 0)
        in_rhs = (inDocPre[:, np.newaxis] * outMeans).dot(itopicCov)
        in_rhs += ((-inMeans.sum(axis=0) * in_counts) / (4*D))[np.newaxis,:]
        in_rhs += l_intop_sums
        in_rhs += in_counts[np.newaxis, :] * F
        for d in range(D):
            in_rhs[d, :]  += in_counts * inMeans[d, :] / (4*D)
            inMeans[d, :]  = la.inv(inDocPre[d] * itopicCov + scaled_n_in).dot(in_rhs[d, :])
            in_rhs[d,:]   -= in_counts * inMeans[d, :] / (4*D)

            try:
                outCov          = la.inv((outDocPre + inDocPre[d]) * itopicCov + emit_counts[d] * A)
                outMeans[d, :]  = outCov.dot(out_rhs[d,:])
            except la.LinAlgError as err:
                print ("ABORTING: " + str(err))
                return \
                    ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
                    QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
                    (np.array(boundIters), np.array(boundValues), np.array(likelyValues))


        debugFn (itr, outMeans, "inMeans/outMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)
        # debugFn (itr, inMeans,  "inMeans",  data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

        # if True or debug or itr % logFrequency == 0:
        #     print("   Sigma     %6.1f  \t %9.3g, %9.3g, %9.3g" % (np.log(la.det(topicCov)), topicCov.min(), topicCov.mean(), topicCov.max()), end="  |")
        #     print("   rho       %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(log(inDocCov[d]) for d in range(D)), inDocCov.min(), inDocCov.mean(), inDocCov.max()), end="  |")
        #     print("   alpha     %6.1f  \t %9.3g" % (np.log(la.det(np.eye(K,) * outDocCov)), outDocCov), end="  |")
        #     print("   inMeans   %9.3g, %9.3g, %9.3g" % (inMeans.min(),  inMeans.mean(),  inMeans.max()), end="  |")
        #     print("   outMeans  %9.3g, %9.3g, %9.3g" % (outMeans.min(), outMeans.mean(), outMeans.max()), end="  |")
        #     print("   inVarcs   %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(inVarcs[d]))  for d in range(D)) / D, inVarcs.min(),  inVarcs.mean(),  inVarcs.max()), end="  |")
        #     print("   outVarcs  %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(outVarcs[d])) for d in range(D)) / D, outVarcs.min(), outVarcs.mean(), outVarcs.max()))

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
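Several updates above build `expMeansRow` and `expMeansCol` by subtracting the row- or column-wise maximum before exponentiating. This is the standard log-sum-exp stabilisation: softmax is invariant to a constant shift, and shifting by the maximum keeps the exponentials finite. A small sketch of the row-wise case:

import numpy as np

def rowwise_softmax_stable(m):
    shifted = m - m.max(axis=1, keepdims=True)   # largest exponent becomes 0
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)

m = np.array([[1000.0, 1001.0], [3.0, 1.0]])
print(rowwise_softmax_stable(m))                 # finite, rows sum to 1
# np.exp(m) alone would overflow for the first row.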
Example #7
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset, of which only the DxT document-term matrix of words
           is used by this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep the original)
    A new query object with the updated query parameters
    '''
    W   = data.words
    D,_ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))
           
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT  = np.eye(K)
        isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
#        print("                sigT.det = " + str(la.det(sigT)))
        
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters that handle the log-sum-exp approximation.
        
        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])
        
#         means -= (means[:,0])[:,np.newaxis]
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)
            
            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)
            
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
        
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
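The `diagonalPriorCov` branch above replaces the per-document solve `la.inv(isigT + docLens[d] * A).dot(rhs[d, :])` with the elementwise product `means = varcs * rhs`. The shortcut rests on the fact that inverting a diagonal matrix is just an elementwise reciprocal, so the matrix solve collapses to a division; a small check of that identity:

import numpy as np

d = np.array([2.0, 4.0, 5.0])                 # diagonal of the system matrix
rhs = np.array([4.0, 8.0, 10.0])
via_inverse = np.linalg.inv(np.diag(d)).dot(rhs)
via_division = rhs / d                        # what the diagonal branch exploits
assert np.allclose(via_inverse, via_division)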
Example #8
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep the original)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    boundLikes = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0
    
    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be sorted in ascending order of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]

    docLens = originalDocLens[sortIdx]
    
    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])
    
    # Initialize some working variables
    R = W.copy()
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()      # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")


    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)
    
    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # As small numbers lead to unstable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the vocabulary
        # vocab *= vocabScale
        # vocab += vocabPrior
        # vocab = normalizerows_ip(vocab)
        # debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        
        #
        # And now this is the E-Step
        # 
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Faster version?
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd         = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs        = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span  = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab)
                S[:span,:] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]

                mu   = X[d:end_d,:].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                means[d:end_d,:] = rhs.dot(lhs) # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale = (R.T.dot(expMeans[:span,:])).T
                vocabScale *= vocab
                normalizerows_ip(vocabScale)
                # vocabScale += vocabPrior
                vocabScale *= stepSize
                vocab *= (1 - stepSize)
                vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

#       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1
        
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means       = means[revert_sort,:]
    varcs       = varcs[revert_sort,:]
    docLens     = docLens[revert_sort]
    
    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
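Example #8 sorts the documents by length with a stable `argsort` and undoes the sort at the end via `np.argsort(sortIdx)`. The round trip works because arg-sorting a permutation yields its inverse; a short check:

import numpy as np

x = np.array([30, 10, 20, 10])
order = np.argsort(x, kind="stable")       # permutation used for the deep sorted copy
revert = np.argsort(order, kind="stable")  # inverse permutation
assert np.array_equal(x[order][revert], x)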
Example #9
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, links and features; this model uses the
           words and the links
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want to keep the original)
    A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp   = perplexity_from_like(initLikely, data.word_count)
        print ("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight  = W.copy()
    L_weight  = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            topicCov = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(means.T) if topicCov.dtype == np.float64 else np.cov(means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1./ diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn (itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)
#        print("                topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2*D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight   = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies

        vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Now do likewise for the links, doing it twice to model in-counts (first)
        # and out-counts (second); the difference is the transpose.
        LT_weight    = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts    = l_intop_sums.sum(axis=0)

        L_weight     = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1./K) + np.diagonal(topicCov))
        debugFn (itr, varcs, "varcs", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs  = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) - rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_         = rhs[d, :] + (1. / (4 * D + 4)) * (meanSum - in_counts * means[d, :])
                means[d, :]  = la.inv(itopicCov + emit_counts[d] * A + np.diag(D * in_counts / (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any (np.isinf(means[d, :])):
                    pass

                if np.any(np.isnan(np.exp(means[d, :] - means[d, :].max()))) or np.any (np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn (itr, means, "means", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break


    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
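When `USE_NIW_PRIOR` is off, the topic covariance above is estimated with `np.cov(means.T)`. With NumPy's default `ddof=1` that is the same centred outer-product sum the NIW branch builds explicitly, just normalised by D-1; a quick check:

import numpy as np

means = np.random.default_rng(1).standard_normal((6, 3))   # D=6 documents, K=3 topics
diff = means - means.mean(axis=0)
manual = diff.T.dot(diff) / (means.shape[0] - 1)
assert np.allclose(np.cov(means.T), manual)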