Ejemplo n.º 1
0
    def testCrossValPerplexityOnRealDataWithLdaGibbsInc(self):
        ActiveFolds = 3
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(np.int32) # Gibbs expects integers as input, regardless of model dtype
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Training setup
        TrainSamplesPerTopic = 10
        QuerySamplesPerTopic = 2
        Thin = 2
        Debug = False

        # Start running experiments
        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPlan = lda_gibbs.newTrainPlan(K * TrainSamplesPerTopic, thin=Thin, debug=Debug)
            queryPlan = lda_gibbs.newTrainPlan(K * QuerySamplesPerTopic, thin=Thin, debug=Debug)

            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)
                estData, evalData = queryData.doc_completion_split()

                model = lda_gibbs.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda_gibbs.newQueryState(trainData, model)

                # Train the model, and the immediately save the result to a file for subsequent inspection
                model, trainResult, (_, _, _) = lda_gibbs.train (trainData, model, query, trainPlan)

                like = lda_gibbs.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                query = lda_gibbs.newQueryState(estData, model)
                _, queryResult = lda_gibbs.query(estData, model, query, queryPlan)

                like = lda_gibbs.log_likelihood(evalData, model, queryResult)
                perp = perplexity_from_like(like, evalData.word_count)
                queryPerps.append(perp)

            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
Ejemplo n.º 2
0
def _debug_with_bound (itr, var_value, var_name, data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, n):
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    if "dtype" in dir(var_value) and var_value.dtype != dtype:
        printStderr ("WARNING: dtype(" + var_name + ") = " + str(var_value.dtype))

    model = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, False, dtype, MODEL_NAME)
    query = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, n)

    old_bound = _debug_with_bound.old_bound
    bound     = var_bound(data, model, query)
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound
    
    addendum = ""
    if var_name == "topicCov":
        try:
            addendum = "log det(topicCov) = %g" % (np.log(la.det(topicCov)))
        except:
            addendum = "log det(topicCov) = <undefined>"
    
    if isnan(bound):
        printStderr ("Bound is NaN")
    else:
        perp = perplexity_from_like(log_likelihood(data, model, query), data.word_count)
        if int(bound - old_bound) < 0:
            printStderr ("Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)     %s" % (itr, var_name, bound, diff, perp, addendum))
        else:
            print ("Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)  %s" % (itr, var_name, bound, diff, perp, addendum))
Ejemplo n.º 3
0
def _debug_with_bound (itr, var_value, var_name, W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n):
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    global last
    
    addendum = ""
    if var_name == "sigT":
        try:
            addendum = "det(sigT) = %g" % (la.det(sigT))
        except:
            addendum = "det(sigT) = <undefined>"

    model, query =  ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME), QueryState(means, means.copy(), varcs, lxi, s, n)
    perp  = perplexity_from_like(log_likelihood(DataSet(W), model, query), W.sum())
    bound = var_bound(DataSet(W), model, query)
    dif = 0 if last == 0 else last - bound
    if dif > 0:
        sys.stdout.flush()
        sys.stderr.flush()
        sys.stderr.write("Iter %3d Update %10s Perp %4.2f   Bound %.3f (%+.3f)     %s\n" % (itr, var_name, perp, bound, dif, addendum))
        sys.stderr.flush()
    else:
        print ("Iter %3d Update %10s Perp %4.2f   Bound %.3f (%+.3f)     %s" % (itr, var_name, perp, bound, dif, addendum))
    last = bound
Ejemplo n.º 4
0
    def testPerplexityOnRealData(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 50
        model      = mtm.newModelAtRandom(data, K, K - 1, dtype=dtype)
        queryState = mtm.newQueryState(data, model)
        trainPlan  = mtm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=True)

        # Train the model, and the immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = mtm.train (data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        fig, ax1 = plt.subplots()
        ax1.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = mtm.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

        like = mtm.log_likelihood(data, model, query)
        perp = perplexity_from_like(like, data.word_count)

        print ("Prior %s" % (str(model.topicPrior)))
        print ("Perplexity: %f\n\n" % perp)

        for k in range(model.K):
            print("\nTopic %d\n=============================" % k)
            print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
Ejemplo n.º 5
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]
    
    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)
    
    # Update the Variances
    varcs = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
    debugFn (0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)
    
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)
        
        # Update the Means
        rhs = V.copy()
        rhs += n[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d,:] = la.inv(isigT + n[d] * A).dot(rhs[d,:])
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)
        
        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
Ejemplo n.º 6
0
def train (data, model, query, plan):
    iterations, burnIn, thin, weightUpdateInterval, _, debug = \
        plan.iterations, plan.burnIn, plan.thin, plan.weightUpdateInterval, plan.logFrequency, plan.debug
    w_list, z_list, docLens = \
        query.w_list, query.z_list, query.docLens
    K, T, weights, topicPrior, vocabPrior, _, _, _, dtype, name = \
        model.K, model.T, model.weights, model.topicPrior, model.vocabPrior, model.topicSum, model.vocabSum, model.numSamples, model.dtype, model.name
    
    assert model.dtype == np.float64, "This is only implemented for 64-bit floats"
    D = docLens.shape[0]
    X = data.feats
    assert docLens.max() < 65536, "This only works for documents with fewer than 65,536 words"

    ndk = np.zeros((D,K), dtype=np.uint16)
    nkv = np.zeros((K,T), dtype=np.int32)
    nk  = np.zeros((K,),  dtype=np.int32)

    num_samples = (iterations - burnIn) // thin
    n_dk_samples = np.zeros((D,K,num_samples), dtype=np.uint16)
    topicSum = np.zeros((D,K), dtype=dtype)
    vocabSum = np.zeros((K,T), dtype=dtype)
    
    compiled.initGlobalRng(0xC0FFEE)
    compiled.sumSuffStats(w_list, z_list, docLens, ndk, nkv, nk)
    
    # Burn in
    alphas = X.dot(weights.T)
    if debug: print ("Burning")
    compiled.sample (burnIn, burnIn + 1, w_list, z_list, docLens, \
            alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
            vocabPrior, False, debug)
    
    # True samples
    if debug: print ("Training")
    sample_count = 0
    for _ in range(0, iterations - burnIn, weightUpdateInterval):
        alphas[:,:] = X.dot(weights.T)
        sample_count += compiled.sample (weightUpdateInterval, thin, w_list, z_list, docLens, \
                alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
                vocabPrior, False, debug)

        if debug: # Print out the perplexity so far
            likely = log_likelihood(data, \
                ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
                QueryState (w_list, z_list, docLens, topicSum, sample_count))
            perp = perplexity_from_like(likely, data)
            print ("Sample-Count = %3d  Perplexity = %7.2f" % (sample_count, perp))

        updateWeights(n_dk_samples, sample_count, X, weights, debug)
    
#     compiled.freeGlobalRng()
    
    return \
        ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
        QueryState (w_list, z_list, docLens, topicSum, sample_count), \
        (np.zeros(1), np.zeros(1), np.zeros(1))
Ejemplo n.º 7
0
    def testCrossValPerplexityOnRealDataWithLdaInc(self):
        ActiveFolds = 3
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Initialise the model
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

        topicCounts = [30, 35, 40, 45, 50] # [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                model = lda.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda.newQueryState(trainData, model)

                # Train the model, and the immediately save the result to a file for subsequent inspection
                model, trainResult, (_, _, _) = lda.train (trainData, model, query, trainPlan)

                like = lda.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                estData, evalData = queryData.doc_completion_split()
                query = lda.newQueryState(estData, model)
                model, queryResult = lda.query(estData, model, query, queryPlan)

                like = lda.log_likelihood(evalData, model, queryResult)
                perp = perplexity_from_like(like, evalData.word_count)
                queryPerps.append(perp)

            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
Ejemplo n.º 8
0
    def testPerplexityOnRealDataWithLdaInc(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        perps = []
        for K in topicCounts:
            model      = lda.newModelAtRandom(data, K, dtype=dtype)
            queryState = lda.newQueryState(data, model)
            trainPlan  = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

            # Train the model, and the immediately save the result to a file for subsequent inspection
            model, query, (bndItrs, bndVals, bndLikes) = lda.train (data, model, queryState, trainPlan)
    #        with open(newModelFileFromModel(model), "wb") as f:
    #            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            # vocab = lda.wordDists(model)
            # topWordCount = 10
            # kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

            like = lda.log_likelihood(data, model, query)
            perp = perplexity_from_like(like, data.word_count)

            perps.append(perp)

            print ("K = %2d : Perplexity = %f\n\n" % (K, perp))
            #
            # for k in range(model.K):
            #     print("\nTopic %d\n=============================" % k)
            #     print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(topicCounts, perps, 'b-')
        ax1.set_xlabel('Topic Count')
        ax1.set_ylabel('Perplexity', color='b')

        fig.show()
        plt.show()
Ejemplo n.º 9
0
def link_split_map (data, mdl, sample_model, train_plan, folds, model_dir = None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using mean average-precision.

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :return: the list of model files stored
    '''
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    for fold in range(folds):
        model = mdl.newModelFromExisting(sample_model)
        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)

        map = mean_average_prec (query_data.links, predicted_link_probs)
        print ("Fold %2d: Mean-Average-Precision %6.3f" % (fold, map))

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    return model_files
Ejemplo n.º 10
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W   = data.words
    D,_ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))
           
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT  = np.eye(K)
        isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
#        print("                sigT.det = " + str(la.det(sigT)))
        
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])
        
#         means -= (means[:,0])[:,np.newaxis]
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)
            
            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)
            
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
        
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
Ejemplo n.º 11
0
def train (dataset, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W   = dataset.words
    D,_ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype
    
    # Book-keeping for logs
    boundIters   = np.zeros(shape=(iterations // logFrequency,))
    boundValues  = np.zeros(shape=(iterations // logFrequency,))
    likelyValues = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    s.fill(0)
    priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigt_diag.fill (0.1)
    kappa = K + 2

    expMeans = means.copy()
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
#        topicMean = means.mean(axis = 0)
        topicMean = means.sum(axis=0) / (D + kappa) \
                    if USE_NIW_PRIOR \
                    else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # diff = means - topicMean
        # sigT = diff.T.dot(diff) / D

        sigT, _ = oas(means, assume_centered=False)
        if dtype is not np.float64:
            sigT = sigT.astype(dtype)

        sigT += np.diag(varcs.mean(axis=0))

        if USE_NIW_PRIOR:
            sigT.flat[::K+1] += priorSigt_diag
            sigT += (kappa * D)/(kappa + D) * np.outer(topicMean, topicMean)

        # Building blocks...
        # 1/4 Create the precision matrix from the covariance
        if True or diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1. / diag)
        else:
            isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
#        print ("         Det sigT = " + str(la.det(sigT)))
        
        # 2/4 temporarily replace means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        # S = expMeans * R.dot(vocab.T)
        
        # 3/4 Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)
        
        # 4/4 Reset the means to their original form, and log effect of vocab update
        #means = np.log(expMeans, out=expMeans)
        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the Variances
        varcs = np.reciprocal(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + isigT.dot(topicMean)
        # for d in range(D):
        #     means[d,:] = la.inv(isigT + ssp.diags(n[d] * lxi[d,:], 0)).dot(rhsMat[d,:])
        means = varcs * rhsMat

        means -= (means[:,0])[:,np.newaxis]
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = 2 * negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # If so Bouchard's suggested approach of fixing it at zero
        #
        #s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        debugFn (itr, s, "s", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)
            
            boundValues[bvIdx]  = var_bound(dataset, modelState, queryState)
            likelyValues[bvIdx] = log_likelihood(dataset, modelState, queryState)
            boundIters[bvIdx]   = itr
            perp = perplexity_from_like(likelyValues[bvIdx], n.sum())
            
            print (time.strftime('%X') + " : Iteration %5d: Perplexity %4.2f  Bound %10.2f " % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#             print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvment in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] >= 30:
                lastPerp = perplexity_from_like(likelyValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likelyValues)
            bvIdx += 1
            
    
    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likelyValues)
Ejemplo n.º 12
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    boundLikes = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0
    
    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]

    docLens = originalDocLens[sortIdx]
    
    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])
    
    # Initialize some working variables
    R = W.copy()
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()      # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")


    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)
    
    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # As small numbers lead to instable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the vocabulary
        # vocab *= vocabScale
        # vocab += vocabPrior
        # vocab = normalizerows_ip(vocab)
        # debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        
        #
        # And now this is the E-Step
        # 
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Faster version?
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd         = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs        = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span  = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[d:end_d,:], vocab)
                S[:span,:] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]

                mu   = X[d:end_d,:].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                means[d:end_d,:] = rhs.dot(lhs) # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale = (R.T.dot(expMeans[:span,:])).T
                vocabScale *= vocab
                normalizerows_ip(vocabScale)
                # vocabScale += vocabPrior
                vocabScale *= stepSize
                vocab *= (1 - stepSize)
                vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

#       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1
        
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means       = means[revert_sort,:]
    varcs       = varcs[revert_sort,:]
    docLens     = docLens[revert_sort]
    
    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
Ejemplo n.º 13
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype
    
    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0
    
    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
        debugFn (itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)
        
        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:,np.newaxis] * means.dot(Ab)
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)
        
        # Long version
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d,:] = lhs.dot(rhs[d,:])
        debugFn (itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    
    return modelState, queryState # query vars altered in-place
Ejemplo n.º 14
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats

    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype
    
    D,_ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    
    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    sigT_regularizer = 0.001
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    tI_F = 1./fv * ssp.eye(F, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()  # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")
    
    s.fill(0)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        diff_m_xa = (means-X.dot(A.T))
        
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K+1] += varcs.sum(axis=0)
        sigT /= (P+F+D)
        sigT.flat[::K+1] += sigT_regularizer
        
        # Diagonalize it
        sigT = np.diag(sigT.flat[::K+1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K+1]))
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the mapping from the features to topics
        A = (1./fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] *  lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent
                                # do doing a hadamard product for all d
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
#         s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
#         debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)
            
            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx]  = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f  bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#             print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvment in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1


    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
Ejemplo n.º 15
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)
        
    W,X = data.words, data.feats
        
    # Enable logging or not. If enabled, we need the inner product of the feat matrix
    if debug:
        XTX = X.T.dot(X)
        debugFn = _debug_with_bound
        _debug_with_bound.old_bound=0
    else:
        XTX = None
        debugFn = _debug_with_nothing
    
    # Iterate over parameters
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Estimate Z_dvk
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)
        
        # Update the Means
        vMat   = (2  * s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] * 2 * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent
                                # to doing a hadamard product for all d
        debugFn (itr, means, "query-means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(2 * n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "query-varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "query-lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
#         s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
#         debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState (means, expMeans, varcs, lxi, s, n)
Ejemplo n.º 16
0
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    qyery - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''

    iterations, epsilon, logFrequency, fastButInaccurate, debug, burnIn, thinning = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.burnIn, plan.thinning
    docLens, topicDists = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    W = data.words
    D,T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    iters, bnds, likes = [], [], []

    sampleCount = 0
    wordDistSamples  = np.zeros((K, T), dtype=np.float64)
    topicDistSamples = np.zeros((D, K), dtype=np.float64)

    for itr in range(plan.iterations + plan.burnIn):
        topicDists = sample_memberships(W, topicPrior, wordDists, topicDists)
        wordDists = sample_dirichlet(W, vocabPrior, topicDists, wordDists)


        if is_sampling_iteration(itr, plan):
            wordDistSamples  += wordDists
            topicDistSamples += topicDists
            sampleCount += 1


        if itr % logFrequency == 0 or debug:
            m = ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

            # if len(iters) > 2 and iters[-1] > 50:
            #     lastPerp = perplexity_from_like(likes[-2], W.sum())
            #     if lastPerp - perp < 1:
            #         break;

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
Ejemplo n.º 17
0
def _old_train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    qyery - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists ,dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W   = data.words
    D,T = W.shape

    iters, bnds, likes = [], [], []
    
    # A few parameters for handling adaptive step-sizes in SGD
    grad = 0
    grad_inner = 0
    grad_rate = 1
    log_likely = 0 # complete dataset likelihood for gradient adjustments
    stepSize = np.array([1.] * K, dtype=model.dtype)

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists    = np.empty(wordDists.shape, dtype=dtype)
    wordUpdates    = wordDists.copy() if batchSize > 0 else None
    batchProcessCount = 0

    # Amend the name if batchSize == 0 implying we're using SGD
    modelName = "lda/svbp/%s" % _sgd_desc(plan) \
                if batchSize > 0 else model.name
    print (modelName)

    for itr in range(iterations):
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists,      out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            if batchSize == 0:
                wordDists[:, :] = vocabPrior
            else:
                wordUpdates[:,:] = 0

            for d in range(D):
                batchProcessCount += 1
                #if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

                if plan.rate_algor == RateAlgorAmaria:
                    log_likely += 0
                elif plan.rate_algor == RateAlgorVariance:
                    g = wordDists.mean(axis=0) + vocabPrior
                    grad *= (1 - grad_rate)
                    grad += grad_rate * wordDists
                    grad += grad_rate * vocabPrior
                    gg += 0
                elif plan.rate_algor != RateAlgorTimeKappa:
                    raise ValueError("Unknown rate algorithm " + str(plan.rate_algor))

                if batchSize > 0 and batchProcessCount == batchSize:
                    batch_index    = (itr * D + d) / batchSize #TODO  Will not be right if batchSize is not a multiple of D
                    stepSize       = _step_sizes(stepSize, batch_index, g, gg, log_likely, plan)
                    wordDists     *= (1 - stepSize)
                    wordDists     += stepSize * vocabPrior

                    stepSize      *= float(D) / batchSize
                    wordUpdates   *= stepSize
                    wordDists     += wordUpdates

                    diWordDistSums[:] = wordDists.sum(axis=1)
                    fns.digamma(diWordDistSums, out=diWordDistSums)
                    fns.digamma(wordDists, out=diWordDists)

                    wordUpdates[:,:] = 0
                    batchProcessCount = 0
                    log_likely = 0

                    if debug:
                        bnds.append(_var_bound_internal(data, model, query))
                        likes.append(_log_likelihood_internal(data, model, query))

                        perp = perplexity_from_like(likes[-1], W.sum())
                        print("Iteration %d, after %d docs: Train Perp = %4.0f  Bound = %.3f" % (itr, batchSize, perp, bnds[-1]))
                        sys.stdout.flush()


            # Log bound and the determine if we can stop early
            if itr % logFrequency == 0 or debug:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                perp = perplexity_from_like(likes[-1], W.sum())
                print ("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

                if len(iters) > 2 and (iters[-1] > 20 or (iters[-1] > 2 and batchSize > 0)):
                    lastPerp = perplexity_from_like(likes[-2], W.sum())
                    if lastPerp - perp < 1:
                        print ("Converged, existing early")
                        break;

            # Update hyperparameters (do this after bound, to make sure bound
            # calculation is internally consistent)
            if HyperUpdateEnabled and itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug: print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug: print("Topic Prior is now " + str(topicPrior))
        else:
            for d in range(D):
                _ = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
Ejemplo n.º 18
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs. The assumption is that there are no out-links associated
    with the documents, and that no documents in the training set link
    to any of these documents in the query set.

    The word and link vocabularies are kept fixed. Due to the assumption
    of no in-links, we don't learn the prior in-document covariance, nor
    the posterior distribution over in-links. Also, we don't modify

    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Initialize some working variables
    W_weight  = W.copy()

    outDocPre = 1./outDocCov
    inDocPre  = np.reciprocal(inDocCov)
    itopicCov = la.inv(topicCov)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight   = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:])

        # Update the out-means and in-means
        out_rhs  = w_top_sums.copy()
        # No link outputs to model.
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        for d in range(D):
            outCov          = la.inv(outDocPre * itopicCov + emit_counts[d] * A)
            outMeans[d, :]  = outCov.dot(out_rhs[d,:])

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(0)
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)
Ejemplo n.º 19
0
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features
    are set. Remove the first element from the feature list, query all documents
    with that feature set, and then evaluate link prediction. Repeat until
    feature-list is empty.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param feature_mask:  the list of features used to separate training from query
    This is a list of tuples, the left side is the feature label, the right side
    is the
    :param model_dir: if not none, the models are stored in this directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    while len(feature_mask) > 0:
        # try:
        # Prepare the training and query data
        feature_mask_indices = [i for _,i in feature_mask]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        (feat_label, feat_id) = feature_mask.pop(0)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops    = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links       = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
Ejemplo n.º 20
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      - 

    Return:
    The updated model object (note parameters are updated in place, so make a 
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug           
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W   = data.words
    D,T = W.shape
    
    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    
    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0
    
    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part. 
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')
 
    # Select the training iterations function appropriate for the dtype
    current_micro_time = lambda: int(time.time())
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
#    do_iterations = iterate # pure Python
    
    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_micro_time()
        totalItrs += do_iterations (segIters, D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)
        
        duration = current_micro_time() - start
    
        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1
        
        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                QueryState(W_list, docLens, topicDists), \
                (boundIters, boundValues, likelyValues)
        
        print ("Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f" % (segment, logPoints, totalItrs, duration, perp, boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))
    
    # Final batch of iterations.
    do_iterations (remainder, D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)
    
    boundIters[bvIdx]   = iterations - 1
    boundValues[bvIdx]  = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
   
            
    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
Ejemplo n.º 21
0
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    qyery - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    W = data.words

    iters, bnds, likes = [], [], []

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    for itr in range(iterations):
        # E-Step
        safe_log(wordDists, out=wordDists)
        safe_log(corpusTopicDist, out=corpusTopicDist)

        topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
        #topicDists -= topicDists.max(axis=1)[:, np.newaxis] # TODO Ensure this is okay
        norms = fns.logsumexp(topicDists, axis=1)
        topicDists -= norms[:, np.newaxis]

        np.exp(topicDists, out=topicDists)

        # M-Step
        wordDists = (W.T.dot(topicDists)).T
        wordDists += vocabPrior
        wordDists /= wordDists.sum(axis=1)[:, np.newaxis]

        corpusTopicDist     = topicDists.sum(axis=0)
        corpusTopicDist[:] += topicPrior
        corpusTopicDist    /= corpusTopicDist.sum()

        if itr % logFrequency == 0 or debug:
            m = ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

            if len(iters) > 2 and iters[-1] > 50:
                lastPerp = perplexity_from_like(likes[-2], W.sum())
                if lastPerp - perp < 1:
                    break;

    return ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
Ejemplo n.º 22
0
    def testMapOnRealData(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            dic = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.convert_to_undirected_graph()
        data.convert_to_binary_link_matrix()
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        trainData, testData = data.doc_completion_split()

        for pseudoNegCount in (5, 10, 25, 50, 100):
            rd.seed(0xC0FFEE)

            # Initialise the model
            K = TopicCount
            model      = rtm.newModelAtRandom(trainData, K, dtype=dtype, pseudoNegCount=data.doc_count * pseudoNegCount)
            queryState = rtm.newQueryState(trainData, model)
            trainPlan  = rtm.newTrainPlan(iterations=50, logFrequency=LogFreq, fastButInaccurate=False, debug=True)

            # Train the model, and the immediately save the result to a file for subsequent inspection
            model, topics, (bndItrs, bndVals, bndLikes) = rtm.train(trainData, model, queryState, trainPlan)
    #        with open(newModelFileFromModel(model), "wb") as f:
    #            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, bndLikes, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            fig.show()
            plt.show()

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            vocab = rtm.wordDists(model)
            topWordCount = 10
            kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)]

            like = rtm.log_likelihood(trainData, model, topics)
            perp = perplexity_from_like(like, trainData.word_count)

            # print ("Prior %s" % (str(model.topicPrior)))
            print ("Pseudo Neg-Count: %d " % pseudoNegCount)
            print ("\tTrain Perplexity: %f\n\n" % perp)

            # for k in range(model.K):
            #     print ("\nTopic %d\n=============================" % k)
            #     print ("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

            min_probs  = rtm.min_link_probs(model, topics, testData.links)
            link_probs = rtm.link_probs(model, topics, min_probs)
            try:
                map = mean_average_prec(testData.links, link_probs)
            except:
                print ("Unexpected error")

            print("\tThe Mean-Average-Precision is %.3f" % map)
Ejemplo n.º 23
0
    def testPerplexityOnRealDataWithMtm2(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 30 # TopicCount
        model      = mtm2.newModelAtRandom(data, K, dtype=dtype)
        queryState = mtm2.newQueryState(data, model)
        trainPlan  = mtm2.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

        # Train the model, and the immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = mtm2.train(data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        fig, ax1 = plt.subplots()
        ax1.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = mtm2.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

        like = mtm2.log_likelihood(data, model, query)
        perp = perplexity_from_like(like, data.word_count)

        print("Perplexity: %f\n\n" % perp)

        for k in range(model.K):
            print("\nTopic %d\n=============================" % k)
            print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        print ("Most likely documents for each topic")
        print ("====================================")
        with open ("/Users/bryanfeeney/iCloud/Datasets/ACL/ACL.100/doc_ids.pkl", 'rb') as f:
            fileIds = pkl.load (f)
        docs_dict = [fileIds[fi] for fi in data.order]

        for k in range(model.K):
            arg_max_prob = np.argmax(query.means[:, k])
            print("K=%2d  Document ID = %s (found at %d)" % (k, docs_dict[arg_max_prob], arg_max_prob))

        print ("Done")

        with open ("/Users/bryanfeeney/Desktop/mtm2-" + str(K) + ".pkl", "wb") as f:
            pkl.dump((model, query), f)
Ejemplo n.º 24
0
def link_split_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using precision at m using as values of m 50, 100, 250, and 500,
    and additionally recall at m

    Cross validation may be used, but note we're always evaluating on training
    data.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param target_folds: the number of folds to complete before finishing. Set
    to folds by default
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if ldaModel is not None:
        (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel
    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold in range(target_folds):
        model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \
                if sample_model.name == LRO_MODEL_NAME \
                else mdl.newModelFromExisting(sample_model)

        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)
        expected_links       = query_data.links

        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))

    return model_files
Ejemplo n.º 25
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp   = perplexity_from_like(initLikely, data.word_count)
        print ("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight  = W.copy()
    L_weight  = L.copy()
    LT_weight = LT.copy()

    inDocCov,  inDocPre  = np.ones((D,)), np.ones((D,))

    # Interestingly, outDocCov trades off good perplexity fits
    # with good ranking fits. > 10 gives better perplexity and
    # worse ranking. At 10 both are good. Below 10 both get
    # worse. Below 0.5, convergence stalls after the first iter.
    outDocCov, outDocPre = 10, 1./10

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior over out-topics
        topicMean = outMeans.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        outDiff = outMeans - topicMean[np.newaxis, :]
        inDiff =  inMeans - outMeans

        for _ in range(5): # It typically takes three iterations for the three dependant covariances -
                           # outDocCov, inDocCov and topicCov - to become consistent w.r.t each other
            topicCov  = (outDocPre * outDiff).T.dot(outDiff)
            topicCov += (inDocPre[:,np.newaxis] * inDiff).T.dot(inDiff)

            topicCov += np.diag(outVarcs.sum(axis=0))
            topicCov += np.diag(inVarcs.sum(axis=0))

            topicCov += IWISH_S_SCALE * np.eye(K)
            topicCov /= (2 * D + IWISH_DENOM)
            itopicCov = la.inv(topicCov)

            debugFn (itr, topicMean, "topicCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            diffSig   = inDiff.dot(itopicCov)
            diffSig  *= inDiff

            inDocCov  = diffSig.sum(axis=1)
            inDocCov += (outVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += (inVarcs  * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += IGAMMA_B
            inDocCov /= (IGAMMA_A - 1 + K)
            inDocPre  = np.reciprocal(inDocCov)

            debugFn (itr, inDocCov, "inDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            diffSig   = outDiff.dot(itopicCov)
            diffSig  *= outDiff
            # outDocCov = (IGAMMA_B + diffSig.sum() + (np.diagonal(itopicCov) * outVarcs).sum()) / (IGAMMA_A - 1 + (D * K))
            # outDocPre = 1./outDocCov

            debugFn (itr, outDocCov, "outDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)


        # Apply the exp function to get the (unnormalised) softmaxes in both directions.
        expMeansCol = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * inMeans \
          - (0.5/ D) * inMeans.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight   = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies

        vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T.copy() # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (Second). The difference is the transpose
        LT_weight    = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts    = l_intop_sums.sum(axis=0)

        L_weight     = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow


        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, outVarcs, "outVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        inVarcs = np.reciprocal(in_counts[np.newaxis,:] * (D-1)/(2*D) + inDocPre[:,np.newaxis] * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, inVarcs, "inVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Update the out-means and in-means
        out_rhs  = w_top_sums.copy()
        out_rhs += l_outtop_sums
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += inMeans.dot(itopicCov) / inDocCov[:,np.newaxis]
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        scaled_n_in = ((D-1.)/(2*D)) * ssp.diags(in_counts, 0)
        in_rhs = (inDocPre[:, np.newaxis] * outMeans).dot(itopicCov)
        in_rhs += ((-inMeans.sum(axis=0) * in_counts) / (4*D))[np.newaxis,:]
        in_rhs += l_intop_sums
        in_rhs += in_counts[np.newaxis, :] * F
        for d in range(D):
            in_rhs[d, :]  += in_counts * inMeans[d, :] / (4*D)
            inMeans[d, :]  = la.inv(inDocPre[d] * itopicCov + scaled_n_in).dot(in_rhs[d, :])
            in_rhs[d,:]   -= in_counts * inMeans[d, :] / (4*D)

            try:
                outCov          = la.inv((outDocPre + inDocPre[d]) * itopicCov + emit_counts[d] * A)
                outMeans[d, :]  = outCov.dot(out_rhs[d,:])
            except la.LinAlgError as err:
                print ("ABORTING: " + str(err))
                return \
                    ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
                    QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
                    (np.array(boundIters), np.array(boundValues), np.array(likelyValues))


        debugFn (itr, outMeans, "inMeans/outMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)
        # debugFn (itr, inMeans,  "inMeans",  data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

        # if True or debug or itr % logFrequency == 0:
        #     print("   Sigma     %6.1f  \t %9.3g, %9.3g, %9.3g" % (np.log(la.det(topicCov)), topicCov.min(), topicCov.mean(), topicCov.max()), end="  |")
        #     print("   rho       %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(log(inDocCov[d]) for d in range(D)), inDocCov.min(), inDocCov.mean(), inDocCov.max()), end="  |")
        #     print("   alpha     %6.1f  \t %9.3g" % (np.log(la.det(np.eye(K,) * outDocCov)), outDocCov), end="  |")
        #     print("   inMeans   %9.3g, %9.3g, %9.3g" % (inMeans.min(),  inMeans.mean(),  inMeans.max()), end="  |")
        #     print("   outMeans  %9.3g, %9.3g, %9.3g" % (outMeans.min(), outMeans.mean(), outMeans.max()), end="  |")
        #     print("   inVarcs   %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(inVarcs[d]))  for d in range(D)) / D, inVarcs.min(),  inVarcs.mean(),  inVarcs.max()), end="  |")
        #     print("   outVarcs  %6.1f  \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(outVarcs[d])) for d in range(D)) / D, outVarcs.min(), outVarcs.mean(), outVarcs.max()))

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
Ejemplo n.º 26
0
def train(data, modelState, queryState, trainPlan):
    """
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    """
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4	, 8	, 2	, 0	, 0,
    #     0	, 6	, 0	, 17, 0,
    #     12	, 13	, 1	, 7	, 8,
    #     0	, 5	, 0	, 0	, 0,
    #     0	, 6	, 0	, 0	, 44,
    #     0	, 7	, 2	, 0	, 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = (
        trainPlan.iterations,
        trainPlan.epsilon,
        trainPlan.logFrequency,
        trainPlan.fastButInaccurate,
        trainPlan.debug,
    )
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = (
        modelState.K,
        modelState.A,
        modelState.U,
        modelState.Y,
        modelState.V,
        modelState.covA,
        modelState.tv,
        modelState.ltv,
        modelState.fv,
        modelState.lfv,
        modelState.vocab,
        modelState.vocabPrior,
        modelState.dtype,
    )

    tp, fp, ltp, lfp = 1.0 / tv, 1.0 / fv, 1.0 / ltv, 1.0 / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) + X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA, overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means

        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(lhs)  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK        = means.shape[0] * means.shape[1]
        # newTp     = np.sum(means)
        # newTp     = (-newTp * newTp)
        # rhs[:,:]  = means
        # rhs      *= means
        # newTp     = DK * np.sum(rhs) - newTp
        # newTp    /= DK * (DK - 1)
        # newTp     = min(max(newTp, 1E-36), 1E+36)
        # tp        = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime("%X")
                + " : Iteration %d: bound %f \t Perplexity: %.2f"
                % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))
            )
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if (
                    itr > 100
                    and len(likelyValues) > 3
                    and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum())
                        - perplexity_from_like(likelyValues[-2], docLens.sum())
                    )
                    < 1.0
                ):
                    break

    return (
        ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name),
        QueryState(means, expMeans, docLens),
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues)),
    )
Ejemplo n.º 27
0
def query(dataset, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W = dataset.words
    D = W.shape[0]
    
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype
    
    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)
    expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)
    S = expMeans * R.dot(vocab.T)
        
    # Enable logging or not. If enabled, we need the inner product of the feat matrix
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Iterate over parameters
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + isigT.dot(topicMean)
        for d in range(D):
            try:
                means[d,:] = la.inv(isigT + ssp.diags(n[d] * lxi[d,:], 0)).dot(rhsMat[d,:])
            except ValueError as e:
                print(str(e))
                print ("Ah")
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = 2 * negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        debugFn (itr, s, "s", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        like = log_likelihood(dataset, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(like, dataset.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState (means, expMeans, varcs, lxi, s, n)
Ejemplo n.º 28
0
def insample_lro_style_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    For documents with > 5 links remove a portion. The portion is determined by
    the number of folds (e.g. five-fold implied remove one fifth of links, three
    fold implies remove a third, etc.)

    Train on all documents and all remaining links.

    Predict remaining links.

    Evaluate using precision@m, recall@m, mean reciprocal-rank and
    mean average-precision

    Average all results.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param target_folds: the number of folds to complete before finishing. Set
    to folds by default
    :param model_dir: if not none, and folds > 1, the models are stored in this
    directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    fold_count = 0
    for fold in range(folds):
        # try:
        # Prepare the training and query data
        train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        print ("\n\nFold %d\n" % fold + ("-" * 80))

        # Train the model
        if model_uses_lda(sample_model):
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        min_link_probs       = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset)
        predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset)
        expected_links       = query_data.links[docSubset, :]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        fold_count += 1
        if fold_count == target_folds:
            break
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
Ejemplo n.º 29
0
def train (data, modelState, queryState, trainPlan, query=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      - 

    Return:
    The updated model object (note parameters are updated in place, so make a 
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug           
    W_list, docLens, q_n_dk, q_n_kt, q_n_k, q_v_dk, q_v_kt, q_v_k, z_dnk = \
        queryState.W_list, queryState.docLens, \
        queryState.n_dk, queryState.n_kt, queryState.n_k, \
        queryState.v_dk, queryState.v_kt, queryState.v_k, queryState.z_dnk
    K, topicPrior_, vocabPrior, m_n_dk, m_n_kt, m_n_k, m_v_dk, m_v_kt, m_v_k = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, \
        modelState.n_dk, modelState.n_kt, modelState.n_k, \
        modelState.v_dk, modelState.v_kt, modelState.v_k
    topicPrior = topicPrior_.mean()
    
    D_train = 0 if m_n_dk is None else m_n_dk.shape[0]
    D_query = q_n_dk.shape[0]
    W = data.words
    T = W.shape[1]
    
    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    
    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = []
    boundValues  = []
    likelyValues = []
    
    # Early stopping check
    finishedTraining = False
    
    # Add the model counts (essentially the learnt model parameters) to those for
    # the query, assuming the model has been trained previously
    if m_n_dk is not None:
        np.add (q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt += m_n_kt
        np.add (q_v_kt, m_v_kt, out=q_v_kt)
        np.add (q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  += m_n_k
        np.add (q_v_k,  m_v_k,  out=q_v_k)
    
#     print ("Topic prior : " + str(topicPrior))
    
    # Select the training iterations function appropriate for the dtype
    if debug: print ("Starting Training")
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    
    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    for segment in range(logPoints - 1):
        do_iterations (segIters, D_query, D_train, K, T, \
                       W_list, docLens, \
                       q_n_dk, q_n_kt, q_n_k, \
                       q_v_dk, q_v_kt, q_v_k, \
                       z_dnk,\
                       topicPrior, vocabPrior)

        
        # Measure and record the improvement to the bound and log-likely
        boundIters.append   (segment * segIters)
        boundValues.append  (var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        likelyValues.append (log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        
        # Check to see if the improvement in the bound has fallen below the threshold
        perp = perplexity_from_like(likelyValues[-1], W.sum())
        print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (segment * segIters, perp, boundValues[-1]))

        if len(boundIters) > 2 and (boundIters[-1] > 30):
            lastPerp = perplexity_from_like(likelyValues[-2], W.sum())
            if lastPerp - perp < 1:
                finishedTraining = True
                print("Converged, existing early")
                break
    
    # Final scheduled batch of iterations if we haven't already converged.
    if not finishedTraining:
        do_iterations (remainder, D_query, D_train, K, T, \
                   W_list, docLens, \
                   q_n_dk, q_n_kt, q_n_k, \
                   q_v_dk, q_v_kt, q_v_k, \
                   z_dnk,\
                   topicPrior, vocabPrior)
    
        boundIters.append   (iterations - 1)
        boundValues.append  (var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        likelyValues.append (log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        
    # Now return the results
    if query: # Model is unchanged, query is changed
        if m_n_dk is not None:
            np.subtract(q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt -= m_n_kt
            np.subtract(q_v_kt, m_v_kt, out=q_v_kt)
            np.subtract(q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  -= m_n_k
            np.subtract(q_v_k,  m_v_k,  out=q_v_k)  # q_n_k  -= m_n_k
    else: # train # Model is changed. Query is changed
        if m_n_dk is not None: # Amend existing
            m_n_dk = np.vstack((m_n_dk, q_n_dk))
            m_n_kt[:,:] = q_n_kt # Recall we _added_ the m_n_kt counts to the query
            m_n_k[:]    = q_n_k  # before training, so now the query-counts contain the
                                 # sum of old and new, and can just be copied across
            m_v_dk = np.vstack((m_v_dk, q_v_dk))
            m_v_kt[:,:] = q_v_kt
            m_n_k[:]    = q_v_k
        else:                  # Create from scratch
            m_n_dk = q_n_dk.copy()
            m_n_kt = q_n_kt.copy()
            m_n_k  = q_n_k.copy()

            m_v_dk = q_v_dk.copy()
            m_v_kt = q_v_kt.copy()
            m_v_k  = q_v_k.copy()
            
    return ModelState(K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k, m_v_dk, m_v_kt, m_v_k, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, q_n_dk, q_n_kt, q_n_k, q_v_dk, q_v_kt, q_v_k, z_dnk), \
           (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
Ejemplo n.º 30
0
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Uses cross-validation go get the average perplexity. If folds == 1 a special path is
    triggered where perplexity is evaluated on the training data, and the results are
    not saved to disk, even if model_dir is not none

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param query_plan:  the query play (number of iterations etc.)
    :param num_folds:  the number of folds to cross validation
    :param fold_run_count: for debugging stop early after processing the number
    of the folds
    :param model_dir: if not none, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        query = mdl.newQueryState(data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = mdl.train(data, model, query, train_plan)
        likely = mdl.log_likelihood(data, model, train_tops)
        perp   = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        model_files = save_if_necessary(model_files, model_dir, model, data, 0, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        return model_files

    query_like_sum    = 0 # to calculate the overall likelihood and
    query_wcount_sum  = 0 # perplexity for the whole dataset
    train_like_sum    = 0
    train_wcount_sum  = 0
    folds_finished    = 0 # count of folds that finished successfully

    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_data, query_data = data.cross_valid_split(fold, num_folds)

            # Train the model
            print ("Duplicating model template... ", end="")
            model      = mdl.newModelFromExisting(sample_model)
            print ("Done.\nCreating query state...")
            train_tops = mdl.newQueryState(train_data, model)

            print ("Starting training")

            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            train_like       = mdl.log_likelihood (train_data, model, train_tops)
            train_word_count = train_data.word_count
            train_perp       = perplexity_from_like(train_like, train_word_count)

            print ("DEBUG Train perplexity is " + str(train_perp))

            # Query the model - if there are no features we need to split the text
            print ("Starting query.")
            query_estim, query_eval = query_data.doc_completion_split()
            query_tops              = mdl.newQueryState(query_estim, model)
            model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

            query_like       = mdl.log_likelihood(query_eval, model, query_tops)
            query_word_count = query_eval.word_count
            query_perp       = perplexity_from_like(query_like, query_word_count)

            # Keep a record of the cumulative likelihood and query-set word-count
            train_like_sum += train_like
            train_wcount_sum  += train_word_count
            query_like_sum += query_like
            query_wcount_sum  += query_word_count
            folds_finished  += 1

            # Write out the output
            print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
            print("")

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, query_tops, mdl)
        # except Exception as e:
        #     traceback.print_exc()
        #     print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    print ("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, perplexity_from_like(train_like_sum, train_wcount_sum)))
    print ("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, perplexity_from_like(query_like_sum, query_wcount_sum)))

    return model_files