Example #1
def newQueryState(data, modelState):
    '''
    Creates a new CTM Query state object. This contains all
    parameters and random variables tied to individual
    datapoints.
    
    Param:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the model state object
    
    Return:
    A CtmQueryState object
    '''
    K, vocab, dtype =  modelState.K, modelState.vocab, modelState.dtype
    
    D,T = data.words.shape
    assert T == vocab.shape[1], "The number of terms in the document-term matrix (" + str(T) + ") differs from that in the model-states vocabulary parameter " + str(vocab.shape[1])
    docLens = np.squeeze(np.asarray(data.words.sum(axis=1)))

    base     = normalizerows_ip(rd.random((D,K*2)).astype(dtype))
    means    = base[:,:K]
    expMeans = base[:,K:]
    varcs    = np.ones((D,K), dtype=dtype)
    
    return QueryState(means, expMeans, varcs, docLens)
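A minimal numpy-only sketch of the initialisation pattern above, assuming normalizerows_ip scales each row in place so that it sums to one; rng, D and K below are illustrative stand-ins:

import numpy as np

rng = np.random.default_rng(0)
D, K = 4, 3

# Row-normalise a random D x 2K matrix, then split it into means / expMeans,
# mirroring the initialisation in newQueryState above.
base = rng.random((D, 2 * K))
base /= base.sum(axis=1)[:, np.newaxis]   # assumed behaviour of normalizerows_ip

means, expMeans = base[:, :K], base[:, K:]
varcs = np.ones((D, K))
print(means.shape, expMeans.shape, varcs.shape)   # (4, 3) (4, 3) (4, 3)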
Example #2
def newVbModelState(K, Q, F, P, T, featVar = 0.01, topicVar = 0.01, latFeatVar = 1, latTopicVar = 1):
    '''
    Creates a new model state object for a topic model based on side-information. This state
    contains all parameters that once trained can be kept fixed for querying.
    
    The parameters are
    
    K - the number of topics
    Q - the number of latent topics, Q << K
    F - the number of features
    P - the number of latent features in the projected space, P << F
    T - the number of terms in the vocabulary
    topicVar - a scalar providing the isotropic covariance of the topic-space
    featVar - a scalar providing the isotropic covariance of the feature-space
    latFeatVar - a scalar providing the isotropic covariance of the latent feature-space
    latTopicVar - a scalar providing the isotropic covariance of the latent topic-space
    
    
    The returned object will contain K, Q, F, P and T and also
    
    A      - the mean of the KxF matrix mapping F features to K topics. 
    varA   - a vector containing the variance over the F features of the distribution over A
    Y      - the latent space which is mixed by U and V into the observed space
    omY    - the row variance of the distribution over Y
    sigY   - the column variance of the distribution over Y
    U      - the KxQ transformation from the K dimensional observed topic space to the
             Q-dimensional topic space
    V      - the FxP transformation from the F-dimensional observed feature space to the
             latent P-dimensional feature-space
    vocab  - The K x T matrix of vocabulary distributions.
    tau    - the row variance of A is tau^2 I_K
    sigma  - the variance in the estimation of the topic memberships. lambda ~ N(A'x, sigma^2I)
    '''
    
    Y     = rd.random((Q,P)).astype(DTYPE)
    omY   = latFeatVar * np.identity(P, DTYPE)
    sigY  = latTopicVar * np.identity(Q, DTYPE)
    
    sigT  = ssp.eye(K, dtype=DTYPE)
    
    U     = rd.random((K,Q)).astype(DTYPE)
    V     = rd.random((F,P)).astype(DTYPE)
    
    A     = U.dot(Y).dot(V.T)
    varA  = featVar * np.identity(F, DTYPE)
    
    varRatio = (featVar * topicVar) / (latFeatVar * latTopicVar)
    if varRatio > 1:
        raise ValueError ("Model will not converge as (featVar * topicVar) / (latFeatVar * latTopicVar)) = " + str(varRatio) + "  when it needs to be no more than one.")
    
    # Vocab is K word distributions so normalize
    vocab = normalizerows_ip (rd.random((K, T)).astype(DTYPE)) + sys.float_info.epsilon
    
    return VbSideTopicModelState(K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, topicVar, featVar, latTopicVar, latFeatVar)
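A short shape-checking sketch of the low-rank factorisation A = U Y V' and of the variance-ratio guard used above; all sizes below are illustrative assumptions:

import numpy as np

rng = np.random.default_rng(0)
K, Q, F, P = 10, 4, 50, 5            # illustrative sizes, with Q << K and P << F

U = rng.random((K, Q))               # K x Q: observed topics -> latent topics
Y = rng.random((Q, P))               # Q x P: latent space mixed by U and V
V = rng.random((F, P))               # F x P: observed features -> latent features
A = U.dot(Y).dot(V.T)                # (K x Q)(Q x P)(P x F) -> K x F
assert A.shape == (K, F)

# The guard above rejects models where this ratio exceeds one
featVar, topicVar, latFeatVar, latTopicVar = 0.01, 0.01, 1.0, 1.0
assert (featVar * topicVar) / (latFeatVar * latTopicVar) <= 1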
Example #3
def newVbModelState(K, F, T, P):
    '''
    Creates a new model state object for a topic model based on side-information. This state
    contains all parameters that once trained can be kept fixed for querying.
    
    The parameters are
    
    K - the number of topics
    F - the number of features
    P - the number of features in the projected space, P << F
    T - the number of terms in the vocabulary
    
    The returned object will contain K, F, T and P and also
    
    A      - the mean of the F x K matrix mapping F features to K topics
    varA   - the column variance of the distribution over A
    tau    - the row variance of A is tau^2 I_K
    V      - the mean of the P x K matrix mapping P projected features to K topics
    varV   - the column variance of the distribution over V (the row variance is again
             tau^2 I_K)
    U      - the F x P projection matrix, such that A = UV
    sigma  - the variance in the estimation of the topic memberships lambda ~ N(A'x, sigma^2I)
    vocab  - The K x T matrix of vocabulary distributions.
    '''
    
    V     = rd.random((P, K))
    varV  = np.identity(P, np.float64)
    U     = rd.random((F, P))
    A     = U.dot(V)
    varA  = np.identity(F, np.float64)
    tau   = 0.1
    sigma = 0.1
    
    # Vocab is K word distributions so normalize
    vocab = normalizerows_ip (rd.random((K, T)))
    
    return VbSideTopicModelState(K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab)
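A quick sketch of the dimensions implied by the docstring above, where A = UV maps F features to K topics; the explicit row normalisation mimics the assumed behaviour of normalizerows_ip:

import numpy as np

rng = np.random.default_rng(0)
K, F, T, P = 8, 40, 100, 6           # illustrative sizes, with P << F

V = rng.random((P, K))               # projected features -> topics
U = rng.random((F, P))               # features -> projected features
A = U.dot(V)                         # F x K, as documented above
assert A.shape == (F, K)

# vocab holds K word distributions over T terms, so each row should sum to one
vocab = rng.random((K, T))
vocab /= vocab.sum(axis=1)[:, np.newaxis]
assert np.allclose(vocab.sum(axis=1), 1.0)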
Example #4
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W   = data.words
    D,_ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))
           
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT  = np.eye(K)
        isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
#        print("                sigT.det = " + str(la.det(sigT)))
        
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])
        
#         means -= (means[:,0])[:,np.newaxis]
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)
            
            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)
            
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
        
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
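The exp(means - rowmax) shift used in the updates above is the standard log-sum-exp guard; a small sketch, assuming rowwise_softmax is an ordinary row-wise softmax, showing that the shift leaves the softmax unchanged while avoiding overflow:

import numpy as np

rng = np.random.default_rng(0)
means = rng.normal(scale=10.0, size=(3, 5))

# Shifted version, as in the "Building Blocks" step above
shifted = np.exp(means - means.max(axis=1)[:, np.newaxis])
softmax = shifted / shifted.sum(axis=1)[:, np.newaxis]

# Naive version, safe here only because the values are small
naive = np.exp(means)
naive /= naive.sum(axis=1)[:, np.newaxis]
assert np.allclose(softmax, naive)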
Example #5
def train(modelState, X, W, plan):
    '''
    Creates a new query state object for a topic model based on side-information. 
    This contains all those estimated parameters that are specific to the actual
    data being queried - this must be used in conjunction with a model state.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.
    
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jakkola function applied to the 
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, _, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, plan.plotIncremental, plan.fastButInaccurate
    queryPlan = newInferencePlan(1, epsilon, logFrequency = 0, plot=False)
    
    if W.dtype.kind == 'i':      # for the sparseScalarQuotientOfDot() method to work
        W = W.astype(DTYPE)
    
    # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    if logCount > 0:
        multiStepSize = np.power (iterations, 1. / logCount)
        logIter = 1
        elbos = []
        likes = []
        iters = []
    else:
        logIter = iterations + 1
    lastVarBoundValue = -sys.float_info.max
    
    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze(np.asarray (W.sum(axis=1))) # Force to a one-dimensional array for np.newaxis trick to work
    D      = len(docLen)
    print ("Training %d topic model with %d x %d word-matrix W, %d x %d feature matrix X, and latent feature and topics spaces of size %d and %d respectively" % (K, D, T, D, F, P, Q))
    
    # No need to recompute this every time
    XTX = X.T.dot(X)
    
    # Identity matrices that occur
    I_P  = ssp.eye(P, dtype=DTYPE)
    I_Q  = ssp.eye(Q, dtype=DTYPE)
    I_QP = ssp.eye(Q*P,Q*P, dtype=DTYPE)
    
    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu   = np.ones((D, K), DTYPE)
    s    = np.zeros((D,), DTYPE)
    lxi  = negJakkola (np.ones((D,K), DTYPE))
    
    # If we don't bother optimising either tau or sigma we can just do all this here once only 
    overSsq = 1. / sigmaSq
    overAsq = 1. / alphaSq
    overKsq = 1. / kappaSq
    overTsq = 1. / tauSq
    
    varRatio = (alphaSq * sigmaSq) / (tauSq * kappaSq)
    
    # TODO the inverse being almost always dense means that it might
    # be faster to convert to dense and use the normal solver, despite
    # the size constraints.
#    varA = 1./K * sla.inv (overTsq * I_F + overSsq * XTX)
    print ("Inverting gram matrix")
    aI_XTX = (overAsq * ssp.eye(F, dtype=DTYPE) + XTX).todense()
    omA = la.inv (aI_XTX)
    scaledWordCounts = W.copy()
    
    # Set up a method to check at every update if we're going in the right
    # direction
    verify_and_log = _quickPrintElbo if DEBUG else _doNothing 
   
    print ("Launching inference")
    for iteration in range(iterations):
        
        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================
              
      
        # Y, sigY, omY
        #
        # If U'U is invertible, use inverse to convert Y to a Sylvester eqn
        # which has a much, much faster solver. Recall update for Y is of the form
        #   Y + AYB = C where A = U'U, B = V'V and C=U'AV
        # 
        VTV = V.T.dot(V)
        UTU = U.T.dot(U)
        
        sigY = la.inv(overTsq * overKsq * I_Q + overAsq * overSsq * UTU)
        verify_and_log ("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)
        
        omY  = la.inv(overTsq * overKsq * I_P + overAsq * overSsq * VTV) 
        verify_and_log ("E-Step: q(Y) [omY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)
        
        try:
            invUTU = la.inv(UTU)
            Y = la.solve_sylvester (varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V))   
        except np.linalg.linalg.LinAlgError as e: # U'U seems to rapidly become singular (before 5 iters)
            if fastButInaccurate:                 
                invUTU = la.pinvh(UTU) # Obviously unstable, inference stalls much earlier than the correct form
                Y = la.solve_sylvester (varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V))  
            else:
                Y = np.reshape (la.solve(varRatio * I_QP + np.kron(VTV, UTU), vec(U.T.dot(A).dot(V))), (Q,P), 'F')
                
        verify_and_log ("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)
        
        
        # A 
        #
        # So it's normally A = (UYV' + L'X) omA with omA = inv(t*I_F + s*XTX)
        #   so A inv(omA) = UYV' + L'X
        #   so inv(omA)' A' = VY'U' + X'L
        # at which point we can use a built-in solve
        #
        lmda = np.log(expLmda, out=expLmda)
        A = omA.dot(X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T
#         A = la.solve(aI_XTX, X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T
        np.exp(expLmda, out=expLmda)
        verify_and_log ("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)
       
        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
#         query (VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
#                X, W, \
#                queryPlan, \
#                VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \
#                scaledWordCounts=scaledWordCounts, \
#                XAT = XAT)
       
       
        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U and V
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================
        
        # vocab
        #
        sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts)
        factor = (scaledWordCounts.T.dot(expLmda)).T # Gets materialized as a dense matrix...
        vocab *= factor
        normalizerows_ip(vocab)      
          
        # A hack to work around the fact that we've got no prior, and thus no
        # pseudo counts, so some values will collapse to zero
#         vocab[vocab < sys.float_info.min] = sys.float_info.min
        
        verify_and_log ("M-Step: vocab", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

    
        # U
        # 
        U = A.dot(V).dot(Y.T).dot (la.inv(Y.dot(V.T).dot(V).dot(Y.T) + np.trace(omY.dot(V.T).dot(V)) * sigY))
        verify_and_log ("M-Step: U", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # V
        # 
        V = A.T.dot(U).dot(Y).dot (la.inv(Y.T.dot(U.T).dot(U).dot(Y) + np.trace(sigY.dot(U.T).dot(U)) * omY))
        verify_and_log ("M-Step: V", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # =============================================================
        # Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            modelState = VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq)
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)
            
            elbo   = varBound (modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(modelState, X, W, queryState) #recons_error(modelState, X, W, queryState)
                
            elbos.append (elbo)
            iters.append (iteration)
            likes.append (likely)
            print ("\nIteration %5d  ELBO %15f   Log-Likelihood %15f" % (iteration, elbo, likely))
            
            logIter = min (np.ceil(logIter * multiStepSize), iterations - 1)
            
            if elbo < lastVarBoundValue:
                sys.stderr.write('ELBO going in the wrong direction\n')
            elif abs(elbo - lastVarBoundValue) < epsilon:
                break
            
            lastVarBoundValue = elbo
            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes))
        else:
            print('.', end='')
            sys.stdout.flush()
            
    
    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, iters, elbos, likes)
    
    return VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
           VbSideTopicQueryState (expLmda, nu, lxi, s, docLen)
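A small scipy sketch of the equivalence the q(Y) update above relies on: the Sylvester form varRatio * inv(U'U) Y + Y V'V = inv(U'U) U'AV yields the same Y as the Kronecker-vectorised solve used in the fallback branch; C below stands in for U'AV and the matrices are made well conditioned on purpose:

import numpy as np
import scipy.linalg as la

rng = np.random.default_rng(0)
Q, P = 4, 3
UTU = rng.random((Q, Q)); UTU = UTU.dot(UTU.T) + Q * np.eye(Q)   # SPD stand-in for U'U
VTV = rng.random((P, P)); VTV = VTV.dot(VTV.T) + P * np.eye(P)   # SPD stand-in for V'V
C   = rng.random((Q, P))                                         # stand-in for U'AV
varRatio = 0.5

# Sylvester route: varRatio * inv(U'U) Y + Y V'V = inv(U'U) C
invUTU = la.inv(UTU)
Y_syl = la.solve_sylvester(varRatio * invUTU, VTV, invUTU.dot(C))

# Kronecker fallback: (varRatio I + V'V (x) U'U) vec(Y) = vec(C), column-major vec
Y_kron = la.solve(varRatio * np.eye(Q * P) + np.kron(VTV, UTU),
                  C.ravel(order='F')).reshape((Q, P), order='F')
assert np.allclose(Y_syl, Y_kron)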
Example #6
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4	, 8	, 2	, 0	, 0,
    #     0	, 6	, 0	, 17, 0,
    #     12	, 13	, 1	, 7	, 8,
    #     0	, 5	, 0	, 0	, 0,
    #     0	, 6	, 0	, 0	, 44,
    #     0	, 7	, 2	, 0	, 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y,  modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T, ), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) +
                 X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA,
                      overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv,
            ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y),
                              A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T),
                              A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means

        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(
                lhs
            )  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK        = means.shape[0] * means.shape[1]
        # newTp     = np.sum(means)
        # newTp     = (-newTp * newTp)
        # rhs[:,:]  = means
        # rhs      *= means
        # newTp     = DK * np.sum(rhs) - newTp
        # newTp    /= DK * (DK - 1)
        # newTp     = min(max(newTp, 1E-36), 1E+36)
        # tp        = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (
            R.T.dot(expMeans)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv,
                                    vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" %
                                    (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \
        QueryState(means, expMeans, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
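A compact sketch of the grouping-by-document-length trick used in the means update above, where every document of the same length shares one K x K inverse; H and tp below are illustrative stand-ins for the Hessian-like term and the topic precision:

import numpy as np

rng = np.random.default_rng(0)
D, K = 6, 3
docLens = np.array([10, 25, 10, 40, 25, 10])
rhs = rng.random((D, K))
H = 0.5 * (np.eye(K) - np.ones((K, K)) / K)   # same form as H in the code above
tp = 2.0                                      # stand-in for the topic precision

# Invert once per unique document length instead of once per document
means = np.empty((D, K))
for l in np.unique(docLens):
    inds = np.where(docLens == l)[0]
    lhs = np.linalg.inv(l * H + tp * np.eye(K))
    means[inds, :] = rhs[inds, :].dot(lhs)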
Example #7
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats

    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype
    
    D,_ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    
    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    sigT_regularizer = 0.001
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    tI_F = 1./fv * ssp.eye(F, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()  # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")
    
    s.fill(0)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        diff_m_xa = (means-X.dot(A.T))
        
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K+1] += varcs.sum(axis=0)
        sigT /= (P+F+D)
        sigT.flat[::K+1] += sigT_regularizer
        
        # Diagonalize it
        sigT = np.diag(sigT.flat[::K+1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K+1]))
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the mapping from the features to topics
        A = (1./fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] *  lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent
                                # to doing a Hadamard product for all d
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
#         s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
#         debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)
            
            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx]  = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f  bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#             print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1


    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
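A short sketch of the diagonalisation idiom used above: sigT.flat[::K+1] walks the main diagonal of a K x K matrix, and inverting a diagonal covariance reduces to an element-wise reciprocal:

import numpy as np

K = 4
sigT = np.arange(1., K * K + 1.).reshape((K, K))

# Extract the main diagonal, diagonalise, and invert element-wise
diagonal = sigT.flat[::K + 1].copy()
sigT_diag = np.diag(diagonal)
isigT = np.diag(np.reciprocal(diagonal))

assert np.allclose(sigT_diag.dot(isigT), np.eye(K))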
Example #8
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval = 0):
    '''
    Creates a new query state object for a topic model based on side-information. 
    This contains all those estimated parameters that are specific to the actual
    data being queried - this must be used in conjunction with a model state.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X - the D x F matrix of side information vectors
    W - the D x V matrix of word **count** vectors.
    iterations - how long to iterate for
    epsilon - currently ignored; in future it will allow early stopping.
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jakkola function applied to the 
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
       
    # Get ready to plot the evolution of the likelihood
    if logInterval > 0:
        elbos = np.zeros((iterations // logInterval,))
        iters = np.zeros((iterations // logInterval,))
    
    # We'll need the total word count per doc, and total count of docs
    docLen = W.sum(axis=1)
    D      = len(docLen)
    
    # No need to recompute this every time
    XTX = X.T.dot(X)
    
    # Assign initial values to the query parameters
    lmda = rd.random((D, K))
    nu   = np.ones((D,K), np.float64)
    s    = np.zeros((D,))
    lxi  = negJakkola (np.ones((D, K), np.float64))
    
    XA = X.dot(A)
    for iteration in range(iterations):
        
        # Save repeated computation
        tsq      = tau * tau;
        tsqIP    = tsq * np.eye(P)
        trTsqIK  = K * tsq # trace of the matrix tau * tau * np.eye(K)
        halfSig2 = 1./(sigma*sigma)
        tau2sig2 = (tau * tau) / (sigma * sigma)
        
        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V)
        #   Where lambda is the posterior mean of theta.
        # =============================================================
        
        #
        # V, varV
        varV = la.inv (tsqIP + U.T.dot(U))
        V    = varV.dot(U.T).dot(A)
        _quickPrintElbo ("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # A, varA
        # TODO, since only tau2sig2 changes at each step, would it be possible just to
        # amend the old inverse?
        # TODO Use sparse inverse
        varA = la.inv (tau2sig2 * XTX + np.eye(F))
        A    = varA.dot (U.dot(V) + X.T.dot(lmda))
        XA   = X.dot(A)
        _quickPrintElbo ("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
       
        #
        # lmda_dk
        lnVocab = safe_log (vocab)
        Z   = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
        rho = 2 * s[:,np.newaxis] * lxi - 0.5 \
            + np.einsum('dt,dkt->dk', W, Z) / docLen[:,np.newaxis]
        
        rhs  = docLen[:,np.newaxis] * rho + halfSig2 * X.dot(A)
        lmda = rhs / (docLen[:,np.newaxis] * 2 * lxi + halfSig2)
        
        _quickPrintElbo ("E-Step: q(Theta|A;lamda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
              
        
        #
        # nu_dk
        # TODO Double check this again...
        nu = 1./ np.sqrt(2. * docLen[:, np.newaxis] * lxi + halfSig2)

        _quickPrintElbo ("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================
        
        #
        # s_d
#         s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1)
#         _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        

        #
        # xi_dk
        lxi = negJakkolaOfDerivedXi(lmda, nu, s)
        _quickPrintElbo ("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # vocab
        #
        # TODO, since vocab is in the RHS, is there any way to optimize this?
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxV
        vocab = normalizerows_ip (np.einsum('dt,dkt->kt', W, Z))
        _quickPrintElbo ("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # U
        U = A.dot(V.T).dot (la.inv(trTsqIK * varV + V.dot(V.T)))
        _quickPrintElbo ("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # sigma
        #    Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right)
        #
#        sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2))
        
        #
        # tau
        #    Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr ((M_A - U M_V)^{T} (M_A - U M_V)) \right)
        #
        varA_U = varA.dot(U)
#        tau_term1 = np.trace(varA)*K*tsq
#        tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in xrange(P)) * K * tsq
#        tau_term3 = np.sum((A - U.dot(V)) ** 2)
#        
#        tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3)
        
        if (logInterval > 0) and (iteration % logInterval == 0):
            elbo = varBound ( \
                VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
                VbSideTopicQueryState(lmda, nu, lxi, s, docLen),
                X, W, Z, lnVocab, varA_U, XA, XTX)
                
            elbos[iteration // logInterval] = elbo
            iters[iteration // logInterval] = iteration
            print ("Iteration %5d  ELBO %f" % (iteration, elbo))
        
    if logInterval > 0:
        plot_bound(iters, elbos)
    
    return (VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
            VbSideTopicQueryState (lmda, nu, lxi, s, docLen))
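A small sketch of the einsum('dt,dkt->dk', W, Z) expression used in the lambda and vocab updates above; it is simply a count-weighted sum of the responsibilities over the vocabulary axis:

import numpy as np

rng = np.random.default_rng(0)
D, K, T = 3, 2, 5
W = rng.integers(0, 4, size=(D, T)).astype(float)   # word counts
Z = rng.random((D, K, T))                            # per-document responsibilities
Z /= Z.sum(axis=1)[:, np.newaxis, :]                 # normalise over topics

fast = np.einsum('dt,dkt->dk', W, Z)
slow = np.array([[(W[d] * Z[d, k]).sum() for k in range(K)] for d in range(D)])
assert np.allclose(fast, slow)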
Ejemplo n.º 9
0
    def _testOnModelHandcraftedData(self):
        #
        # Create the vocab
        #
        T = 3 * 3
        K = 5

        # Horizontal bars
        vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])),
                                shape=(3, 3)).todense()
        #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3,3)).todense()
        vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])),
                                shape=(3, 3)).todense()

        # Vertical bars
        vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])),
                                shape=(3, 3)).todense()
        #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3,3)).todense()
        vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])),
                                shape=(3, 3)).todense()

        # Diagonals
        vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])),
                                shape=(3, 3)).todense()
        #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3,3)).todense()

        # Put together
        T = vocab1.shape[0] * vocab1.shape[1]
        vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

        # Create a single matrix with the flattened vocabularies
        vocabVectors = []
        for vocab in vocabs:
            vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T)))))

        vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

        # Plot the vocab
        ones = np.ones(vocabs[0].shape)
        for k in range(K):
            plt.subplot(2, 3, k + 1)
            plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r)
        plt.show()

        #
        # Create the corpus
        #
        rd.seed(0xC0FFEE)
        D = 1000

        # Make sense (of a sort) of this by assuming that these correspond to
        # Kittens    Omelettes    Puppies    Oranges    Tomatoes    Dutch People    Basketball    Football
        #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
        #        topicCovar = np.array(\
        #            [[ 100,    5,     55,      20,     5,     15,      4,      0], \
        #             [ 5,    100,      5,      10,    70,      5,      0,      0], \
        #             [ 55,     5,    100,       5,     5,     10,      0,      5], \
        #             [ 20,    10,      5,     100,    30,     30,     20,     10], \
        #             [ 5,     70,      5,     30,    100,      0,      0,      0], \
        #             [ 15,     5,     10,     30,      0,    100,     10,     40], \
        #             [ 4,      0,      0,     20,      0,     10,    100,     20], \
        #             [ 0,      0,      5,     10,      0,     40,     20,    100]], dtype=DTYPE) / 100.0

        topicMean = np.array([25, 15, 40, 5, 15])
        self.assertEqual(100, topicMean.sum())
        topicCovar = np.array(\
            [[ 100,    5,     55,      20,     5     ], \
             [ 5,    100,      5,      10,    70     ], \
             [ 55,     5,    100,       5,     5     ], \
             [ 20,    10,      5,     100,    30     ], \
             [ 5,     70,      5,     30,    100     ], \
             ], dtype=DTYPE) / 100.0

        meanWordCount = 80
        wordCounts = rd.poisson(meanWordCount, size=D)
        topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
        W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
        W = ssp.csr_matrix(W.astype(DTYPE))

        #
        # Train the model
        #
        model = ctm.newModelAtRandom(W, K, dtype=DTYPE)
        queryState = ctm.newQueryState(W, model)
        trainPlan = ctm.newTrainPlan(iterations=65, logFrequency=1)

        self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01)

        return self._doTest(W, model, queryState, trainPlan)
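A minimal sketch of how the handcrafted test above turns a 3 x 3 "bar" into one row of the K x T vocabulary: build it sparse, flatten it, and normalise it into a distribution:

import numpy as np
import scipy.sparse as ssp

# One horizontal bar on a 3 x 3 grid, as in vocab1 above
bar = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
row = np.squeeze(np.asarray(bar.reshape((1, 9))))   # flatten to a length-9 vector
row = row / row.sum()                               # normalise to a distribution
assert abs(row.sum() - 1.0) < 1e-12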
Example #10
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, boundLikes = [], [], []
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG
                         )  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]

    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    leastSquares = lambda feats, targets: la.lstsq(
        feats, targets, lapack_driver="gelsy")[0].T
    if ssp.issparse(
            R_A):  # dense inverse typically as fast or faster than sparse
        R_A = to_dense_array(
            R_A)  # inverse and the result is usually dense in any case
        leastSquares = lambda feats, targets: np.array(
            [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)])
    R_A.flat[::F + 1] += 1. / fv
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    priorSigt_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigt_diag.fill(0.001)

    # Iterate over parameters
    for itr in range(iterations):
        A = leastSquares(X, means)
        diff_a_yv = (A - Y.dot(V))

        for _ in range(10):  #(50 if itr == 0 else 1):
            # Update the covariance of the prior
            diff_m_xa = (means - X.dot(A.T))

            sigT = 1. / lfv * (Y.dot(Y.T))
            sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
            sigT += diff_m_xa.T.dot(diff_m_xa)
            sigT.flat[::K + 1] += varcs.sum(axis=0)

            # As small numbers lead to unstable inverse estimates, we use the
            # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
            # scales whenever we use the inverse of the unscaled covariance
            sigScale = 1. / (P + D + F)
            isigScale = 1. / sigScale

            isigT = la.inv(sigT)
            debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y,
                    lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                    docLens)

            # Update the vocabulary
            vocab *= (
                R.T.dot(expMeans)
            ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
            vocab += vocabPrior
            vocab = normalizerows_ip(vocab)

            # Reset the means to their original form, and log effect of vocab update
            R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
            S = expMeans * R.dot(vocab.T)
            debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Variances
            varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] +
                          isigScale * isigT.flat[::K + 1])
            debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Means
            rhs = X.dot(A.T).dot(isigT) * isigScale
            rhs += S
            rhs += docLens[:, np.newaxis] * means.dot(Ab)
            rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)

            # Faster version?
            for lenIdx in range(len(lens)):
                nd = lens[lenIdx]
                start, end = inds[lenIdx], inds[lenIdx + 1]
                lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

                means[start:end, :] = rhs[start:end, :].dot(
                    lhs
                )  # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

    #       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
            debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                              out=expMeans)

        # for _ in range(150):
        #     # Finally update the parameter V
        #     V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        #     debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the distribution on the latent space
        #     R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        #     R_Y = la.inv(R_Y_base)
        #     debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype,
        #             means, varcs, Ab, docLens)
        #
        #     Y = 1. / fv * A.dot(V.T).dot(R_Y)
        #     debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the mapping from the features to topics
        #     A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        #     debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V,
                                    sigT * sigScale, vocab, vocabPrior, Ab,
                                    dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(
                var_bound(DataSet(W, feats=X), modelState, queryState, XTX))
            boundLikes.append(
                log_likelihood(DataSet(W, feats=X), modelState, queryState))
            boundIters.append(itr)
            perp = perplexity_from_like(boundLikes[-1], docLens.sum())
            print(
                time.strftime('%X') +
                " : Iteration %d: Perplexity %4.0f bound %f" %
                (itr, perp, boundValues[-1]))
            if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]:
                printStderr("ERROR: bound degradation: %f > %f" %
                            (boundValues[-2], boundValues[-1]))


#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if len(boundIters) > 2 and boundIters[-1] > 20:
                lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum())
                if lastPerp - perp < 1:
                    break

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape
    
    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype
    
    # Book-keeping for logs
    numLogPoints = iterations // logFrequency if logFrequency > 0 else 0
    boundIters  = np.zeros(shape=(numLogPoints,))
    boundValues = np.zeros(shape=(numLogPoints,))
    boundLikes  = np.zeros(shape=(numLogPoints,))
    bvIdx = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0
    
    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]

    docLens = originalDocLens[sortIdx]
    
    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])
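    # lens holds the unique document lengths; inds holds the first (sorted) row
    # at which each length starts, with a final sentinel of D, so documents of
    # length lens[i] occupy rows inds[i]:inds[i+1]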
    
    # Initialize some working variables
    R = W.copy()
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()      # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
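    # R_A is now inv(X'X + (1/fv) * I_F), the F x F ridge-regression-style factor
    # used below in the update A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)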
    print("Covariance matrix calculated, launching inference")


    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)
    
    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # Small values lead to unstable inverse estimates, so we use the
        # fact that for a scalar a, inv(a * X) = (1/a) * inv(X), and apply
        # the scale whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the vocabulary
        vocab *= vocabScale
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        
        #
        # And now this is the E-Step
        # 
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the means in mini-batches, grouping documents by length so each length's LHS inverse is computed only once
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd         = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs        = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span  = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab)  # expMeans is a BatchSize-row buffer, so index it by span, not by document
                S[:span,:] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]
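                # (Subtracting the per-row max before exponentiating, as above, keeps the
                # exponentials from overflowing; dividing by the row sum then gives exactly
                # softmax(means), since the subtracted max cancels in the ratio.)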

                mu   = X[d:end_d,:].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                means[d:end_d,:] = rhs.dot(lhs) # For a single (column-vector) mean the update is lhs.dot(rhs); with means stacked as rows the order is swapped

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1
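                # stepSize follows a Robbins-Monro style decay (Tau and Kappa are assumed to
                # be module-level constants); it is currently unused, since the stochastic
                # vocab update below is commented out in favour of accumulating the full
                # batch statistics into vocabScale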

                # Do a gradient update of the vocab
                vocabScale += (R.T.dot(expMeans[:span,:])).T
                # vocabScale *= vocab
                # normalizerows_ip(vocabScale)
                # # vocabScale += vocabPrior
                # vocabScale *= stepSize
                # vocab *= (1 - stepSize)
                # vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

#       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, boundLikes = clamp(boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1
        
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means       = means[revert_sort,:]
    varcs       = varcs[revert_sort,:]
    docLens     = docLens[revert_sort]
    
    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
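
The per-length grouping that the update loop above relies on can be illustrated in
isolation. The following is a minimal, self-contained sketch in plain NumPy (it uses
none of the module's own helpers, and the toy docLens values are made up) showing how
a stable sort, np.unique and a sentinel index yield contiguous blocks of equal-length
documents, each of which can share a single matrix inverse.

import numpy as np

# Toy document lengths; in train() these come from the per-row word counts of W.
docLens = np.array([5, 2, 3, 5, 2, 5])

# Stable sort so the permutation can be undone afterwards, as train() does.
sortIdx = np.argsort(docLens, kind="stable")
sortedLens = docLens[sortIdx]                    # [2, 2, 3, 5, 5, 5]

lens, inds = np.unique(sortedLens, return_index=True)
inds = np.append(inds, [len(sortedLens)])        # lens = [2 3 5], inds = [0 2 3 6]

for lenIdx in range(len(lens)):
    start, end = inds[lenIdx], inds[lenIdx + 1]
    # One matrix inverse would be computed here and reused for every document of
    # length lens[lenIdx], i.e. rows start:end of the sorted matrices.
    print("length %d -> rows %d:%d" % (lens[lenIdx], start, end))

# Undo the sort to restore the original document order.
revert = np.argsort(sortIdx, kind="stable")
assert (sortedLens[revert] == docLens).all()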
Ejemplo n.º 12
0
def train(modelState, X, W, plan):
    '''
    Trains a topic model based on side-information, estimating both the global
    model parameters and those query parameters that are specific to the actual
    data being queried - the latter must be used in conjunction with a model state.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.
    plan       - the training plan, specifying:
        iterations - how long to iterate for
        epsilon    - currently ignored; in future, allows us to stop early.
        logInterval  - the interval between iterations where we calculate and display
                       the log-likelihood bound
        plotInterval - the interval between iterations where we display the log-likelihood
                       bound values calculated at each log-interval
        fastButInaccurate - if true, we may use a pseudo-inverse instead of an inverse
                            when solving for Y when the true inverse is unavailable.
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jaakkola function applied to the
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, plan.plotIncremental, plan.fastButInaccurate

    mu0 = 0.0001

    if W.dtype.kind == 'i':  # for the sparseScalarQuotientOfDot() method to work
        W = W.astype(DTYPE)

    # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    if logCount > 0:
        multiStepSize = np.power(iterations, 1. / logCount)
        logIter = 1
        elbos = []
        likes = []
        iters = []
    else:
        logIter = iterations + 1
    lastVarBoundValue = -sys.float_info.max
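    # For example, iterations=128 and logCount=7 give multiStepSize = 128**(1/7) = 2,
    # so the bound is evaluated at iterations 1, 2, 4, 8, 16, 32, 64 and finally 127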

    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze(
        np.asarray(W.sum(axis=1))
    )  # Force to a one-dimensional array for np.newaxis trick to work
    D = len(docLen)

    # No need to recompute this every time
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)
    XTX = X.T.dot(X)

    # Identity matrices that occur
    I_P = ssp.eye(P, P, 0, DTYPE)
    I_Q = ssp.eye(Q, Q, 0, DTYPE)
    I_QP = ssp.eye(Q * P, Q * P, 0, DTYPE)
    I_F = ssp.eye(
        F, F, 0, DTYPE, "csc"
    )  # X is CSR, XTX is consequently CSC, sparse inverse requires CSC
    T_QP = sp_vec_trans_matrix(Y.shape)

    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu = np.ones((D, K), DTYPE)
    s = np.zeros((D, ), DTYPE)
    lxi = negJakkola(np.ones((D, K), DTYPE))

    # If we don't bother optimising either tau or sigma we can just do all this here once only
    tsq = tauSq
    ssq = sigmaSq
    overTsq = 1. / tsq
    overSsq = 1. / ssq
    overTsqSsq = 1. / (tsq * ssq)

    # TODO the inverse being almost always dense means that it might
    # be faster to convert to dense and use the normal solver, despite
    # the size constraints.
    #    varA = 1./K * sla.inv (overTsq * I_F + overSsq * XTX)
    tI_sXTX = (overTsq * I_F + overSsq * XTX).todense()
    omA = la.inv(tI_sXTX)
    scaledWordCounts = W.copy()

    for iteration in range(iterations):

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # Y, sigY, omY
        #
        # If U'U is invertible, use inverse to convert Y to a Sylvester eqn
        # which has a much, much faster solver. Recall update for Y is of the form
        #   Y + AYB = C where A = U'U, B = V'V and C=U'AV
        #
        VTV = V.T.dot(V)
        UTU = U.T.dot(U)
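        # By the identity vec(U Y V') = kron(V, U) vec(Y), the posterior precision of
        # vec(Y) involves kron(V, U)'kron(V, U) = kron(V'V, U'U), which is the term
        # inverted on the next line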

        sigy = la.inv(I_QP + overTsqSsq * np.kron(VTV, UTU))
        _quickPrintElbo("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T,
                        A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma,
                        expLmda, nu, lxi, s, docLen)

        Y = mu0 + np.reshape(overTsqSsq * sigy.dot(vec(U.T.dot(A).dot(V))),
                             (Q, P),
                             order='F')
        _quickPrintElbo("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T,
                        A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma,
                        expLmda, nu, lxi, s, docLen)

        # A
        #
        # So it's normally A = (UYV' + L'X) omA with omA = inv(t*I_F + s*XTX)
        #   so A inv(omA) = UYV' + L'X
        #   so inv(omA)' A' = VY'U' + X'L
        # at which point we can use a built-in solve
        #
        #       A = (overTsq * U.dot(Y).dot(V.T) + X.T.dot(expLmda).T).dot(omA)
        lmda = np.log(expLmda, out=expLmda)
        A = la.solve(tI_sXTX, X.T.dot(lmda) + V.dot(Y.T).dot(U.T)).T
        np.exp(expLmda, out=expLmda)
        _quickPrintElbo("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, omA,
                        Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda,
                        nu, lxi, s, docLen)

        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
        query (VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma), \
               X, W, \
               VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \
               scaledWordCounts=scaledWordCounts, \
               XAT = XAT, \
               iterations=10, \
               logInterval = 0, plotInterval = 0)

        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U and V
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================

        # U
        #
        try:
            U = A.dot(V).dot(Y.T).dot (la.inv( \
                    Y.dot(V.T).dot(V).dot(Y.T) \
                    + (vec_transpose_csr(T_QP, P).T.dot(np.kron(I_QP, VTV)).dot(vec_transpose(T_QP.dot(sigy), P))).T
            ))
        except np.linalg.LinAlgError as e:
            print(str(e))
            print("Warning: U update failed, keeping the previous value of U")

        # order of last line above reversed to handle numpy bug preventing dot product from dense to sparse
        _quickPrintElbo("M-Step: U", iteration, X, W, K, Q, F, P, T, A, omA, Y,
                        omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu,
                        lxi, s, docLen)

        # V
        #
        # Temporarily this requires that we re-order sigY until I've implemented a fortran order
        # vec transpose in Cython
        sigY = sigY.T.copy()
        V = A.T.dot(U).dot(Y).dot (la.inv ( \
            Y.T.dot(U.T).dot(U).dot(Y) \
            + vec_transpose (sigY, Q).T.dot(np.kron(I_QP, UTU).dot(vec_transpose(I_QP, Q))) \
        ))
        _quickPrintElbo("M-Step: V", iteration, X, W, K, Q, F, P, T, A, omA, Y,
                        omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu,
                        lxi, s, docLen)

        # vocab
        #
        factor = (scaledWordCounts.T.dot(expLmda)
                  ).T  # Gets materialized as a dense matrix...
        vocab *= factor
        normalizerows_ip(vocab)
        _quickPrintElbo("M-Step: \u03A6", iteration, X, W, K, Q, F, P, T, A,
                        omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma,
                        expLmda, nu, lxi, s, docLen)

        # =============================================================
        # Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            modelState = VbSideTopicModelState(K, Q, F, P, T, A, omA, Y, omY,
                                               sigY, sigT, U, V, vocab,
                                               sigmaSq, alphaSq, kappaSq,
                                               tauSq)
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)

            elbo = varBound(modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(
                modelState, X, W,
                queryState)  #recons_error(modelState, X, W, queryState)

            elbos.append(elbo)
            iters.append(iteration)
            likes.append(likely)
            print("Iteration %5d  ELBO %15f   Log-Likelihood %15f" %
                  (iteration, elbo, likely))

            logIter = min(np.ceil(logIter * multiStepSize), iterations - 1)

            if elbo - lastVarBoundValue < epsilon:
                break
            else:
                lastVarBoundValue = elbo

            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration),
                           np.array(iters), np.array(elbos), np.array(likes))

    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, iters, elbos, likes)


    return VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
           VbSideTopicQueryState (expLmda, nu, lxi, s, docLen)
Ejemplo n.º 13
0
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, links and features, of which the features
           are IGNORED in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(
        data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)
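    # These appear to be the pseudo-observation counts and the diagonal scale of a
    # Normal-Inverse-Wishart prior over (topicMean, topicCov); NIW_PSEUDO_OBS_MEAN,
    # NIW_PSEUDO_OBS_VAR and NIW_PSI are assumed to be module-level constants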

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov,
                vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(
                means.T) if topicCov.dtype == np.float64 else np.cov(
                    means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)
        #        print("                topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2*D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)

        # Update the vocabularies

        vocab *= (
            W_weight.T.dot(expMeansRow)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (
            expMeansCol /
            lse_at_k[np.newaxis, :]).T  # FIXME duplicates the expression in the definition of F above

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (Second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT,
                                              expMeansRow,
                                              docVocab,
                                              out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L,
                                             expMeansRow,
                                             docVocab,
                                             out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Topic-wise sum of the means, scaled by the per-topic in-link counts
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) +
                              np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) -
                                             rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. /
                                    (4 * D + 4)) * (meanSum -
                                                    in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A +
                                     np.diag(D * in_counts /
                                             (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any(
                        np.isinf(means[d, :])):
                    pass

                if np.any(np.isnan(
                        np.exp(means[d, :] - means[d, :].max()))) or np.any(
                            np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype,
                                    MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" %
                                (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum()) -
                        perplexity_from_like(likelyValues[-2], docLens.sum())
                ) < 1.0:
                    break


    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))