Code example #1
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. This will be mutated in place, and then
                 returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a 
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug           
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W   = data.words
    D,T = W.shape
    
    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    
    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0
    
    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate them from scratch on each pass. I.e. for the memberships z, which
    # are D x N x K in dimension, we only store a single N x K slice, re-used across documents.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')
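    # As a rough, hypothetical illustration of the saving: with D = 10,000 documents,
    # N = 1,000 words per document and K = 100 topics in float64, a full D x N x K
    # array would need roughly 8 GB, whereas the single re-used N x K buffer above
    # needs only about 0.8 MB.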
 
    # Select the training iterations function appropriate for the dtype
    current_micro_time = lambda: int(time.time() * 1E6)  # time.time() is in seconds; scale to microseconds
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
#    do_iterations = iterate # pure Python
    
    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_micro_time()
        totalItrs += do_iterations (segIters, D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)
        
        duration = current_micro_time() - start
    
        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1
        
        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                QueryState(W_list, docLens, topicDists), \
                (boundIters, boundValues, likelyValues)
        
        print ("Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f" % (segment, logPoints, totalItrs, duration, perp, boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))
    
    # Final batch of iterations.
    do_iterations (remainder, D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)
    
    boundIters[bvIdx]   = iterations - 1
    boundValues[bvIdx]  = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
   
            
    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
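
The three-part return value (updated model, updated query state, and the per-segment diagnostics) lends itself to a simple post-training check. The sketch below is only illustrative: the constructor names newModelAtRandom, newQueryState and newTrainPlan are assumptions, not necessarily this project's API, and data stands for an already-loaded corpus object exposing .words.

import numpy as np

model = newModelAtRandom(data, K=25, dtype=np.float32)   # hypothetical constructor
query = newQueryState(data, model)                       # hypothetical constructor
plan  = newTrainPlan(iterations=500, logFrequency=10)    # hypothetical constructor

model, query, (iters, bounds, likes) = train(data, model, query, plan)

# Parameters are updated in place, so take a defensive copy of anything you
# want to keep from this run before training further.
snapshot = model.wordDists.copy()
print("Final bound %.2f, final log-likelihood %.2f" % (bounds[-1], likes[-1]))
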
Code example #2
def train (data, modelState, queryState, trainPlan, query=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      - if True, the model counts are left unchanged and only the
                 query-side counts are updated; if False (the default) the model
                 counts are updated (or created) from the query counts

    Return:
    The updated model object (note parameters are updated in place, so make a 
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug           
    W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk = \
        queryState.W_list, queryState.docLens, queryState.n_dk, queryState.n_kt, queryState.n_k, queryState.z_dnk
    K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.n_dk, modelState.n_kt, modelState.n_k
    
    D_train = 0 if m_n_dk is None else m_n_dk.shape[0]
    D_query = q_n_dk.shape[0]
    W = data.words
    T = W.shape[1]
    
    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    
    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0
    
    # Early stopping check
    finishedTraining = False
    
    # Add the model counts (essentially the learnt model parameters) to those for
    # the query, assuming the model has been trained previously
    if m_n_dk is not None:
        np.add (q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt += m_n_kt
        np.add (q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  += m_n_k
    
#     print ("Topic prior : " + str(topicPrior))
    
    # Select the training iterations function appropriate for the dtype
    if debug: print ("Starting Training")
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    
    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    for segment in range(logPoints - 1):
        do_iterations (segIters, D_query, D_train, K, T, \
                       W_list, docLens, \
                       q_n_dk, q_n_kt, q_n_k, z_dnk,\
                       topicPrior, vocabPrior)

        
        # Measure and record the improvement to the bound and log-likelihood
        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        likelyValues[bvIdx] = log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        bvIdx += 1
        
        # Check to see if the improvement in the bound has fallen below the threshold
        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            finishedTraining = True
            break
        
        if debug: print ("Segment %d/%d Total Iterations %d Duration %d" % (segment, logPoints, -1, -1))
    
    # Final scheduled batch of iterations if we haven't already converged.
    if not finishedTraining:
        do_iterations (remainder, D_query, D_train, K, T, \
                   W_list, docLens, \
                   q_n_dk, q_n_kt, q_n_k, z_dnk,\
                   topicPrior, vocabPrior)
    
        boundIters[bvIdx]   = iterations - 1
        boundValues[bvIdx]  = var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        likelyValues[bvIdx] = log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        
    # Now return the results
    if query: # Model is unchanged, query is changed
        if m_n_dk is not None:
            np.subtract(q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt -= m_n_kt
            np.subtract(q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  -= m_n_k
    else: # train # Model is changed (or flat-out created). Query is changed
        if m_n_dk is not None: # Amend existing
            m_n_dk = np.vstack((m_n_dk, q_n_dk))
            m_n_kt[:,:] = q_n_kt
            m_n_k[:]    = q_n_k
        else:                  # Create from scratch
            m_n_dk = q_n_dk.copy()
            m_n_kt = q_n_kt.copy()
            m_n_k  = q_n_k.copy()
            
    return ModelState(K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk), \
           (boundIters, boundValues, likelyValues)
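
When query=True the learnt model counts are folded into the query counts before the compiled iterations run (np.add with out=) and folded back out afterwards (np.subtract with out=), leaving the model tensors untouched. A small stand-alone numpy sketch of that pattern, not project code:

import numpy as np

K, T = 3, 5
m_n_kt = np.random.randint(0, 10, size=(K, T)).astype(np.float64)  # "model" counts
q_n_kt = np.random.randint(0, 10, size=(K, T)).astype(np.float64)  # "query" counts
before = m_n_kt.copy()

np.add(q_n_kt, m_n_kt, out=q_n_kt)       # fold the model counts into the query counts
# ... the per-document updates would read and write q_n_kt here ...
np.subtract(q_n_kt, m_n_kt, out=q_n_kt)  # fold them back out again

assert np.array_equal(m_n_kt, before)    # the model counts are untouched
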
Code example #3
File: rtm.py  Project: budgefeeney/sidetopics
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, supplying the DxT document-term matrix (data.words)
            and the DxD document-link matrix (data.links)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)
    updateVocab - if True (the default), the word distributions and link weights
            are re-estimated as well as the per-document topics

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.pseudoNegCount, model.regularizer, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W   = data.words
    D,T = W.shape
    X   = data.links

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, the
    # per-document responsibilities are re-estimated from scratch inside
    # _update_topics_at_d; only K-sized and K x T work buffers are kept here.
    z = np.empty((K,), dtype=dtype, order='F')
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    for itr in range(iterations):
        if debug: printAndFlushNoNewLine("\n %4d: " % itr)

        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists,      out=diWordDists)
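        # In the usual variational treatment wordDists[k, :] are Dirichlet parameters,
        # so E[log beta_kt] = psi(wordDists[k, t]) - psi(sum_t wordDists[k, t]);
        # the two digamma buffers above hold exactly those two terms.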

        if updateVocab:
            # Perform inference, updating the vocab
            wordDists[:, :] = vocabPrior
            for d in range(D):
                if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, weights, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            _infer_weights(data, weights, topicMeans, topicPrior, negCount, reg)

            # Log bound and the determine if we can stop early
            if itr % logFrequency == 0:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                if debug: print("%.3f < %.3f" % (bnds[-1], likes[-1]))
                if converged(iters, bnds, len(bnds) - 1, minIters=5):
                    break

            # Update hyperparameters (do this after bound, to make sure bound
            # calculation is internally consistent)
            if itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug: print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug: print("Topic Prior is now " + str(topicPrior))
        else:
            for d in range(D):
                _ = _update_topics_at_d(d, data, weights, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype, model.name), \
           QueryState(docLens, topicMeans), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
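
The per-document vocabulary update wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z relies on the CSR layout of W: wordIdx is presumably the set of column indices with non-zero counts in row d, and W[d, :].data holds the matching counts, so the scatter-add only touches the observed words. A stand-alone sketch of the same pattern, for illustration only:

import numpy as np
import scipy.sparse as sp

K, T = 4, 10
vocabPrior = 0.1
wordDists  = np.full((K, T), vocabPrior)             # running vocabulary parameters
W = sp.random(3, T, density=0.3, format='csr')       # toy document-term counts
d = 0

row     = W[d, :]                                    # one CSR row
wordIdx = row.indices                                # columns with non-zero counts
z = np.random.dirichlet(np.ones(K), size=len(wordIdx)).T   # K x nnz responsibilities

wordDists[:, wordIdx] += row.data[np.newaxis, :] * z
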
Code example #4
File: mtm.py  Project: budgefeeney/sidetopics
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, supplying the DxT document-term matrix (data.words),
            the DxD document-link matrix (data.links) and any features (data.feats)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)
    updateVocab - unused in this variant; the vocabulary is re-estimated on
            every iteration regardless

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
        model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    W, L, X_fixme = data.words, data.links, data.feats
    D,T = W.shape

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, they
    # are re-estimated from scratch each iteration; only the K-vector of digamma
    # sums and the K x T digamma table are kept between documents.
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    new_in_counts = in_counts.copy()

    newCov = np.ndarray(shape=(K, K), dtype=model.dtype)
    newCov.fill(0)
    invCov = la.inv(topicCov)
    S      = topicCov.copy()
    rhs    = np.ndarray(shape=(K,), dtype=model.dtype)
    b, f   = rhs.copy(), rhs.copy() # linear coefficients of the Bohing quadratic bounds
    new_maxes_bytop = np.ndarray(shape=(K,), dtype=model.dtype)
    maxes_bytop     = topics.max(axis=0)

    for itr in range(iterations):
        if itr % logFrequency == 0:
            iters.append(itr)
            bnds.append(_var_bound_internal(data, model, query))
            likes.append(_log_likelihood_internal(data, model, query))

            if debug: print ("Bound : %f \t Likelihood %f  \t Perplexity %.2f" % (bnds[-1], likes[-1], np.exp(-likes[-1]/docLens.sum())))
            if converged(iters, bnds, len(bnds) - 1, minIters=5):
                break
        if debug: printAndFlushNoNewLine("\n %4d: " % itr)

        newCov[:, :] = 0
        new_in_counts[:] = 0

        # U and V FIXME DEBUG
        # U[:, :] = la.lstsq(V.T, topics.T)[0].T
        # V[:, :] = la.lstsq(U, topics)[0]

        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists,      out=diWordDists)

        wordDists[:, :] = vocabPrior
        new_maxes_bytop.fill(1E-300)
        for d in range(D):
            if d % 100 == 0:
                printAndFlushNoNewLine(".")
            wordIdx, z, linkIdx, y = _update_topics_at_d(d, data, docLens, topics, topicPrior, lse_at_k, diWordDists, diWordDistSums)

            # Update the word distributions
            wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            # Determine the topic distribution
            # Step 1, the covariance
            S[:, :] = invCov
            S[np.diag_indices_from(S)] += docLens[d] + out_counts[d]
            S[:, :] -= 1. / (K+1)
            S[np.diag_indices_from(S)] += (K - 1.) / K * in_counts
            S = la.inv(S)

            # Topics Step 2, the actual right-hand side
            rhs[:]  = invCov.dot(U[d, :].dot(V))
            rhs    += (z * W[d, :].data[np.newaxis, :]).sum(axis=1)
            ysum    = (y * L[d, :].data[:, np.newaxis]).sum(axis=0)
            #rhs    += ysum

            b[:] = topics[d, :] - 1./(K+1) * topics[d, :].sum() - softmax(topics[d, :])
            b *= docLens[d]
            rhs += b

            f[:] = topics[d, :] - 1./(D + 1) * tsums_bytop - np.exp(topics[d, :] - maxes_bytop) / exp_tsums_bytop
            f *= in_counts
            rhs += f

            rhs[:] += (D - 1)/(2 * D + 2) * (in_counts * (tsums_bytop - topics[d, :]))

            # Topics Step 3: solve
            new_topics = S.dot(rhs)

            # Topics Step 4: update the running counts and covariance, then assign the new topics to "topics"
            tsums_bytop -= topics[d, :]
            tsums_bytop += new_topics

            new_maxes_bytop = np.maximum(new_maxes_bytop, new_topics)
            new_in_counts += ysum

            vec = new_topics - U[d, :].dot(V)
            newCov += np.outer (vec, vec)
            newCov += S          # accumulate document d's full posterior covariance as well

            topics[d, :] = new_topics

            # Next step is the posterior covariance
            postTopicCov[d, :] = np.diag(S)

        # The covariance hyper-parameter
        topicCov[:, :] = newCov
        invCov[:, :]   = la.inv(topicCov)

        # The remaining running counts, and the column-wise softmax adjustment
        maxes_bytop[:] = new_maxes_bytop
        in_counts[:]   = new_in_counts
        exp_tsums_bytop[:] = np.sum(np.exp(topics - maxes_bytop[np.newaxis, :]), axis=0)


    return ModelState(K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, model.name), \
           QueryState(docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
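
The maxes_bytop / new_maxes_bytop bookkeeping above is the usual max-shift trick for evaluating column-wise sums of exponentials without overflow. A stand-alone sketch of the idea:

import numpy as np

topics = np.random.randn(6, 4) * 50            # wide dynamic range to make the point
maxes  = topics.max(axis=0)                    # per-column maxima

naive  = np.exp(topics).sum(axis=0)            # may overflow to inf for large entries
stable = np.exp(topics - maxes[np.newaxis, :]).sum(axis=0)

# The log of the true column sums can be recovered without overflow:
log_colsums = maxes + np.log(stable)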