Example #1
0
def newModelAtRandom(data, K, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)
    
    Param:
    data - the dataset of words, features and links of which only words are used in this model
    K - the number of topics
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.
    
    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"
    T = data.words.shape[1]
    
    if topicPrior is None:
        topicPrior = constantArray((K,), 50.0 / K, dtype) # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = 1.1 # Also from G&S
    
    vocabPriorVec = constantArray((T,), vocabPrior, dtype)
    wordDists = rd.dirichlet(vocabPriorVec, size=K).astype(dtype)
    
    # Perturb to avoid zero probabilities
    wordDists += 1./T
    wordDists /= (wordDists.sum(axis=1))[:,np.newaxis]
    
    return ModelState(K, topicPrior, vocabPrior, wordDists, dtype, MODEL_NAME)
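# A minimal standalone sketch (plain numpy, toy sizes for K and T) of the
# initialisation above: draw each topic's word distribution from a symmetric
# Dirichlet, then perturb by 1/T and renormalise so no word has probability
# zero. constantArray in the original is assumed to behave like np.full.
import numpy as np
import numpy.random as rd

K, T = 4, 10                                   # assumed toy sizes
vocabPriorVec = np.full((T,), 1.1)             # symmetric vocab prior, as in G&S
wordDists = rd.dirichlet(vocabPriorVec, size=K)

wordDists += 1.0 / T                           # perturb away from exact zeros
wordDists /= wordDists.sum(axis=1)[:, np.newaxis]

assert np.allclose(wordDists.sum(axis=1), 1.0)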
Example #2
0
def newModelAtRandom(data, K, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)
    
    Param:
    data - the dataset of words, features and links of which only words are used in this model
    K - the number of topics
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.
    
    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"

    # Based on Griffiths and Steyvers 2004, with the amendment suggested by
    # Asuncion in his Smoothing Topic Models paper


    if topicPrior is None:
        topicPrior = constantArray((K,), (50.0 / K) + 0.5, dtype=dtype) # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = 0.1 + 0.5
        
    n_dk = None # These start out at none until we actually
    n_kv = None # go ahead and train this model.
    n_k  = None
    
    return ModelState(K, topicPrior, vocabPrior, n_dk, n_kv, n_k, dtype, MODEL_NAME)
Example #3
0
def newModelAtRandom(data, K, topicPrior=None, vocabPrior=VocabPrior, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)

    Param:
    data - the dataset of words, features and links of which only words are used in this model
    K - the number of topics
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.

    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"
    T = data.words.shape[1]

    if topicPrior is None:
        topicPrior = constantArray((K,), 5.0 / K + 0.5, dtype)  # From Griffiths and Steyvers 2004
    elif type(topicPrior) is float:
        topicPrior = constantArray((K,), topicPrior, dtype)  # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = VocabPrior

    wordDists = np.ones((K, T), dtype=dtype)
    for k in range(K):
        docLenSum = 0
        while docLenSum < 1000:
            randomDoc = rd.randint(0, data.doc_count, size=1)
            sample_doc = data.words[randomDoc, :]
            wordDists[k, sample_doc.indices] += sample_doc.data
            docLenSum += sample_doc.sum()
        wordDists[k, :] /= wordDists[k, :].sum()

    corpusTopicDist = np.array([1./K] * K, dtype=dtype)

    return ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, False, dtype, MODEL_NAME)
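# A hedged sketch (toy scipy.sparse data, hypothetical variable names) of the
# seeding strategy above: each topic's word distribution starts as a uniform
# count of one and has the word counts of randomly chosen documents added on
# top, before being normalised into a distribution.
import numpy as np
import numpy.random as rd
import scipy.sparse as sp

K = 2
words = sp.csr_matrix(np.array([[2, 0, 1, 0],
                                [0, 3, 0, 1],
                                [1, 1, 0, 0]], dtype=np.float64))  # DxT counts
doc_count, T = words.shape

wordDists = np.ones((K, T))
for k in range(K):
    d = rd.randint(0, doc_count)               # pick one document at random
    sample_doc = words[d, :]                   # 1xT sparse row
    wordDists[k, sample_doc.indices] += sample_doc.data
    wordDists[k, :] /= wordDists[k, :].sum()   # normalise to a distribution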
Example #4
0
def newModelAtRandom(data, K, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)
    
    Param:
    data - the dataset of words, features and links of which only words are used in this model
    K - the number of topics
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.
    
    Return:
    A ModelState object
    '''
    T = data.words.shape[1]
    
    assert K > 1,     "There must be at least two topics"
    assert K < 256,   "There can be no more than 256 topics"
    assert T < 65536, "There can be no more than 65,536 unique words"
    
    if topicPrior is None:
        topicPrior = constantArray((K,), 50.0 / K, dtype=dtype) # From Griffiths and Steyvers 2004
    if type(topicPrior) == float or type(topicPrior) == int:
        topicPrior = constantArray((K,), topicPrior, dtype=dtype)
    if vocabPrior is None:
        vocabPrior = constantArray((T,), 0.1, dtype=dtype) # Also from G&S
    elif type(vocabPrior) is float:
        vocabPrior = constantArray((T,), vocabPrior, dtype=dtype) # Also from G&S
        
    topicSum  = None # These start out at none until we actually
    vocabSum  = None # go ahead and train this model.
    numSamples = 0
    
    return ModelState(K, T, topicPrior, vocabPrior, topicSum, vocabSum, numSamples, False, dtype, MODEL_NAME)
Example #5
0
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but are
    reset afterwards to their initial values. So it's safe to call in a serial
    manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W     = data.words
    D,T   = W.shape
    K     = modelState.K
    n_kt  = modelState.n_kt
    n_dk  = queryState.n_dk
    n_k   = queryState.n_k
    z_dnk = queryState.z_dnk
    a     = modelState.topicPrior
    b     = modelState.vocabPrior
    
    docLens = queryState.docLens
    
    bound = 0

    if type(a) is float or np.isscalar(a):
        a = constantArray((modelState.K,), a, modelState.dtype)
    
    # Expected value of p(W,Z). Note everything else is marginalized out, and
    # we're using a 0-th order Taylor expansion.
    try:
        bound += D * (fns.gammaln(a.sum()) - fns.gammaln(a).sum())
        bound += K * (fns.gammaln(T * b) - T * fns.gammaln(b))
    
        bound -= np.sum (fns.gammaln(a.sum() + docLens))
        bound += np.sum (fns.gammaln(a[np.newaxis,:] + n_dk))
    
        bound -= np.sum (fns.gammaln(T * b + n_k))
        bound += np.sum (fns.gammaln(b + n_kt))
    
        # The entropy of z_dnk. Have to do this in a loop as z_dnk
        # is jagged in its third dimension.
        if modelState.dtype == np.float32:
            bound -= compiled.jagged_entropy_f32 (z_dnk, docLens)
        elif modelState.dtype == np.float64:
            bound -= compiled.jagged_entropy_f64 (z_dnk, docLens)
        else:
            raise ValueError ("No implementation defined for dtype " + str(modelState.dtype))
    except OverflowError:
        print("Overflow error encountered, returning zero")
        return 0
    
    return bound
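# A hedged sketch (toy values) of the Dirichlet-multinomial terms accumulated
# in the bound above: for each document the doc-topic contribution is
# ln B(a + n_d) - ln B(a), where B is the multivariate Beta function, a the
# topic prior and n_d the per-topic counts of document d. The vocabulary terms
# follow the same pattern with b and n_kt.
import numpy as np
from scipy.special import gammaln

def log_beta(x):
    # ln B(x) = sum_k ln Gamma(x_k) - ln Gamma(sum_k x_k)
    return gammaln(x).sum() - gammaln(x.sum())

a   = np.array([0.5, 0.5, 0.5])    # topic prior over K=3 topics
n_d = np.array([4.0, 1.0, 0.0])    # topic counts for one document

doc_topic_term = log_beta(a + n_d) - log_beta(a)
print(doc_topic_term)              # summed over documents this matches the
                                   # doc-topic gammaln lines of the bound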
Example #6
0
def _step_sizes(last_step, t, g, gg, log_likely, train_plan):
    """
    Evaluates the step size according to the metric specified by rate_algor.
    A separate step size is computed for each parameter.
    :param last_step: The last values of step sizes
    :param t: the batch counter (i.e. the time-step)
    :param g: the estimate of the expected gradient
    :param gg: the estimate of the expected inner product of the gradients
    :param log_likely: the log-likelihood of the dataset
    :param train_plan: the training plan, which specifies the learning-rate schedule and its hyper-parameters.
    """
    K, dtype = len(last_step), last_step.dtype

    if train_plan.rate_algor == RateAlgorTimeKappa:
        return constantArray(K, (t + train_plan.rate_delay) ** (-train_plan.forgetting_rate), dtype)
    elif train_plan.rate_algor == RateAlgorAmaria:
        s = last_step[0] * exp(train_plan.rate_a * (train_plan.rate_b * -log_likely - last_step[0]))
        return constantArray(K, s, dtype)
    elif train_plan.rate_algor == RateAlgorVariance:
        # Per-topic step: squared norm of the expected gradient divided by the
        # expected inner product of the gradients
        s = (g * g).sum(axis=-1)
        s /= gg
        return s
    else:
        raise ValueError ("Unknown rate algorithm " + str(train_plan.rate_algor))
Example #7
0
def newModelAtRandom(data, K, pseudoNegCount=None, regularizer=0.001, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)

    Param:
    data - the dataset of words, features and links; both the words and the
           links are used in this model
    K - the number of topics
    pseudoNegCount - since we train only on positive examples, this is the
    count of negative examples to "invent" for our problem
    regularizer - the usual ridge regression coefficient
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.

    Return:
    A ModelState object
    '''
    assert K > 1,   "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"
    T = data.words.shape[1]

    if topicPrior is None:
        topicPrior = constantArray((K + 1,), 5.0 / K + 0.5, dtype) # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = 0.1 + 0.5 # Also from G&S

    topicPrior[K] = 0

    wordDists = np.ones((K,T), dtype=dtype)
    doc_ids = rd.randint(0, data.doc_count, size=K)
    for k in range(K):
        sample_doc = data.words[doc_ids[k], :]
        wordDists[k, sample_doc.indices] += sample_doc.data

    # The weight vector
    weights = np.ones((K + 1,))

    # Count of dummy negative observations. Assume that for every
    # two papers cited, one was considered and discarded
    if pseudoNegCount is None:
        pseudoNegCount = data.doc_count * 0.5 * np.mean(data.links.sum(axis=1).astype(DTYPE))

    return ModelState(K, topicPrior, vocabPrior, wordDists, weights, pseudoNegCount, regularizer, dtype, MODEL_NAME)
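# A hedged sketch (hypothetical toy links matrix) of the dummy-negative count
# used above: for every two observed citations one further candidate is assumed
# to have been considered and discarded, giving
# doc_count * 0.5 * (mean links per document) invented negative examples.
import numpy as np
import scipy.sparse as sp

links = sp.csr_matrix(np.array([[0, 1, 1],
                                [1, 0, 0],
                                [0, 1, 0]], dtype=np.float64))   # DxD citations
doc_count = links.shape[0]

mean_links_per_doc = np.asarray(links.sum(axis=1)).mean()        # 4 links / 3 docs
pseudo_neg_count = doc_count * 0.5 * mean_links_per_doc
print(pseudo_neg_count)                                          # 2.0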
Example #8
0
def newModelAtRandom(data, K, Q, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new model using a random initialisation of the given parameters

    :param data: the data to be used in training, should contain words and links
    :param K:  the number of topics to infer
    :param Q:  the number of latent document groups
    :param topicPrior:  the prior over topics, either a scalar or a K-dim vector
    :param vocabPrior:  the prior over words, a scalar
    Note: the topic matrix is decomposed as topics = V.dot(U.T), where U is a
    Q x K matrix and V is a Q x D matrix (neither is passed as a parameter).
    :param dtype: the data type used for all fields in this dataset.
    :return: a new ModelState object.
    '''
    assert K > 1,   "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"
    assert Q < K,   "By definition the rank of the doc-covariance must be leq K, so Q < K, but you have Q=" + str(Q) + " and K=" + str(K)

    T = data.words.shape[1]

    if topicPrior is None:
        topicPrior = constantArray((K,), 50.0 / K + 0.5, dtype) # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = 0.1 + 0.5 # Also from G&S

    # Pick some documents at random, and let a vague distribution of their
    # words constitute our initial guess.
    wordDists = np.ones((K, T), dtype=dtype)
    doc_ids = rd.randint(0, data.doc_count, size=K)
    for k in range(K):
        sample_doc = data.words[doc_ids[k], :]
        wordDists[k, sample_doc.indices] += sample_doc.data

    # Scale up so it properly resembles something inferred from this dataset
    # (this avoids catastrophic underflow in softmax)
    wordDists *= data.word_count / K

    return ModelState(K, Q, topicPrior, vocabPrior, wordDists, Vagueness * np.eye(K), dtype=dtype, name=MODEL_NAME)
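# A small illustration (not from this model) of the underflow the comment above
# guards against: if the counts are left tiny, downstream log probabilities
# become very negative and a naive softmax collapses to 0/0, whereas the usual
# max-subtraction (or, as above, simply working with larger counts) stays finite.
import numpy as np

def naive_softmax(x):
    return np.exp(x) / np.exp(x).sum()

def stable_softmax(x):
    e = np.exp(x - x.max())        # shift by the max before exponentiating
    return e / e.sum()

logits = np.array([-1000.0, -1001.0, -1002.0])
print(naive_softmax(logits))       # [nan nan nan]: exp underflows to 0, then 0/0
print(stable_softmax(logits))      # roughly [0.67, 0.24, 0.09]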
Example #9
0
def log_likelihood (data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    
    Returns the log likelihood summed over all D documents
    '''
    n_dk, n_kt = queryState.n_dk, modelState.n_kt
    a, b       = modelState.topicPrior, modelState.vocabPrior

    if type(a) is float or np.isscalar(a):
        a = constantArray((modelState.K,), a, modelState.dtype)
    W = data.words if data.words.dtype == modelState.dtype \
        else data.words.astype(modelState.dtype)
   
    n_dk += a[np.newaxis,:]
    n_kt += b

    # Scale to create distributions over doc-topics and topic-vocabs
    doc_norm = n_dk.sum(axis = 1)
    voc_norm = n_kt.sum(axis = 1)
    
    n_dk /= doc_norm[:,np.newaxis]
    n_kt /= voc_norm[:,np.newaxis]
    
    # Use distributions to create log-likelihood. This could be made
    # faster still by not materializing the (admittedly sparse) matrix
    ln_likely = sparseScalarProductOfSafeLnDot(W, n_dk, n_kt).sum()
    
    # Rescale back to word-counts
    n_dk *= doc_norm[:,np.newaxis]
    n_kt *= voc_norm[:,np.newaxis]
    
    n_dk -= a[np.newaxis, :]
    n_kt -= b
    
    return ln_likely
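# A hedged sketch of what a helper like sparseScalarProductOfSafeLnDot is
# assumed to compute here (its real implementation is not shown): for every
# non-zero entry of the sparse count matrix W, multiply the count by the log of
# the corresponding entry of n_dk.dot(n_kt), guarding against log(0).
import numpy as np
import scipy.sparse as sp

def sparse_scalar_product_of_safe_ln_dot(W, n_dk, n_kt):
    W = W.tocoo()
    probs = n_dk.dot(n_kt)                                   # DxT probabilities
    vals  = W.data * np.log(np.maximum(probs[W.row, W.col], 1e-300))
    return sp.coo_matrix((vals, (W.row, W.col)), shape=W.shape)

# Usage, mirroring the call above: summing the result gives the corpus
# log-likelihood.
# ln_likely = sparse_scalar_product_of_safe_ln_dot(W, n_dk, n_kt).sum()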
Example #10
0
def newModelAtRandom(data, K, topicPrior=None, vocabPrior=None, dtype=DTYPE):
    '''
    Creates a new LDA ModelState for the given training set and
    the given number of topics. Everything is instantiated purely
    at random. This contains all parameters independent of
    the dataset (e.g. learnt priors)

    Param:
    data - the dataset of words, features and links of which only words are used in this model
    K - the number of topics
    topicPrior - the prior over topics, either a scalar or a K-dimensional vector
    vocabPrior - the prior over vocabs, either a scalar or a T-dimensional vector
    dtype      - the datatype to be used throughout.

    Return:
    A ModelState object
    '''
    assert K > 1,   "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"
    T = data.words.shape[1]

    if topicPrior is None:
        topicPrior = constantArray((K + 1,), 5.0 / K + 0.5, dtype) # From Griffiths and Steyvers 2004
    if vocabPrior is None:
        vocabPrior = 0.1 + 0.5 # Also from G&S

    topicPrior[K] = 0

    wordDists = np.ones((K,T), dtype=dtype)
    doc_ids = rd.randint(0, data.doc_count, size=K)
    for k in range(K):
        sample_doc = data.words[doc_ids[k], :]
        wordDists[k, sample_doc.indices] += sample_doc.data


    return ModelState(K, topicPrior, vocabPrior, wordDists, dtype, MODEL_NAME)
Example #11
0
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rateAlgor = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize, plan.rate_algor
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # A few parameters for handling adaptive step-sizes in SGD
    if plan.rate_algor == RateAlgorBatch:
        batchSize  = D
        batchCount = 1
    else:
        batchSize  = plan.batchSize
        batchCount = D // batchSize + 1

    gradStep = constantArray((K,), 1./float(batchSize), dtype=dtype)
    grad     = np.zeros((K,T), dtype=dtype)
    ex_grad  = grad.copy()
    exp_gtg  = np.zeros((K,), dtype=dtype)
    stepSize = np.ones((K,), dtype=dtype)

    # The digamma terms for the vocabulary
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)


    # Amend the name to incorporate training information
    modelName = "lda/svbp/%s" % _sgd_desc(plan)
    print(modelName)

    # NOTE: this overrides whatever rate algorithm the training plan specified,
    # forcing the variance-based schedule
    rateAlgor = RateAlgorVariance

    # Start training
    d = -1
    for b in range(batchCount * iterations):
        grad.fill(vocabPrior)
        # firstD = d
        for s in range(batchSize):
            d = d + 1 if (d + 1) < D else 0

            wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)
            grad[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

        if rateAlgor == RateAlgorBatch:
            wordDists = grad
        else:
            if rateAlgor == RateAlgorTimeKappa:
                stepSize[:] = (b + plan.rate_delay)**(-plan.forgetting_rate)
            elif rateAlgor == RateAlgorVariance:
                update_inplace_v(gradStep, ex_grad, change=grad)
                gtg = stepSize.copy()
                for k in range(K):
                    stepSize[k] = np.dot(ex_grad[k,:], ex_grad[k,:])
                    gtg[k] = np.dot(grad[k,:], grad[k,:])
                update_inplace_s(gradStep, old=exp_gtg, change=gtg)
                stepSize /= exp_gtg
                gradStep  = gradStep * (1 - stepSize) + 1
            elif rateAlgor == RateAlgorAmaria:
                topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
                # doc_indices = np.linspace(firstD, firstD + batchSize -1, batchSize) % D
                log_likely = var_bound(
                    data, # data._reorder(doc_indices),
                    ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName),
                    QueryState(docLens, topicMeans, True)
                )
                p     = stepSize[0]
                a, b  = plan.rate_a, plan.rate_b
                p    *= exp(a * (b * -log_likely - p))
                stepSize[:] = p
                topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
            else:
                raise ValueError ("No code to support the '" + str(plan.rate_algor) + "' learning rate adaptation algorithm")

            update_inplace_v (stepSize, old=wordDists, change=grad)

        print ("%s : t=%d : step=%s" % (rateAlgor, b, str(stepSize)))
        fns.digamma(wordDists, out=diWordDists)
        np.sum(wordDists, axis=1, out=diWordDistSums)
        fns.digamma(diWordDistSums, out=diWordDistSums)



    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
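# A hedged sketch of what the helpers update_inplace_v and update_inplace_s are
# assumed to do in the training loop above (their real implementations are not
# shown): maintain exponential moving averages, updated in place with a
# per-topic step, of a KxT matrix and a K-vector respectively.
import numpy as np

def update_inplace_v(step, old, change):
    # old <- (1 - step) * old + step * change, with step broadcast over rows
    old *= (1.0 - step)[:, np.newaxis]
    old += step[:, np.newaxis] * change

def update_inplace_s(step, old, change):
    # old <- (1 - step) * old + step * change, element-wise on K-vectors
    old *= 1.0 - step
    old += step * change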