Example #1
def min_link_probs(model, train_tops, query_tops, links, docSubset=None):
    '''
    For every document, for each of the given links, determine the
    probability of the least likely link (i.e. the document-specific
    minimum of probabilities).

    :param model: the model object
    :param train_tops: the representations of the link-target documents
    :param query_tops: the representations of the link-origin documents
    :param links: a DxD matrix of links for each document (row)
    :param docSubset: a list of documents to consider for evaluation. If
        None, all documents are considered.
    :return: a vector with the minimum out-link probabilities for each
        document in the subset
    '''
    scale    = model.scale
    src_tops = lda.topicDists(query_tops.ldaQuery)
    dst_offs = train_tops.offsetTopicDists

    if docSubset is None:
        docSubset = list(range(src_tops.shape[0]))
    Q = len(docSubset)

    mins = np.empty((Q,), dtype=model.dtype)
    outRow = -1
    for src in docSubset:
        outRow += 1
        probs = []
        for dst in links[src,:].indices:           # for each of this doc's link-target docs, denoted dst
            linkProb = scale * src_tops[src,:].dot(dst_offs[dst,:])
            probs.append(linkProb)
        mins[outRow] = min(probs) if len(probs) > 0 else -1

    return mins
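
The per-document minimum above is taken only over each document's actual out-links, read straight off the CSR row's .indices. Below is a small, self-contained sketch of that access pattern; the link matrix and probability values are invented, where in the real function each entry comes from scale * src_tops[src,:].dot(dst_offs[dst,:]).

import numpy as np
import scipy.sparse as ssp

# Toy 3x3 link matrix: doc 0 links to docs 1 and 2, doc 1 links to doc 0, doc 2 has no out-links
links = ssp.csr_matrix(np.array([[0, 1, 1],
                                 [1, 0, 0],
                                 [0, 0, 0]]))

# Invented link-probability matrix standing in for scale * src_tops.dot(dst_offs.T)
probs = np.array([[0.9, 0.6, 0.3],
                  [0.4, 0.8, 0.1],
                  [0.2, 0.5, 0.7]])

mins = np.empty(links.shape[0])
for src in range(links.shape[0]):
    dsts      = links[src, :].indices                          # column indices of this row's non-zeros
    mins[src] = probs[src, dsts].min() if len(dsts) > 0 else -1

print(mins)   # -> [ 0.3  0.4 -1. ]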
Example #2
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
            matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iteration, VB-bound and log-likelihood measurements
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, method, K, dtype, modelName = \
        model.ldaModel, model.method, model.K, model.dtype, model.name
    ldaTopics = query.ldaTopics

    D, K = data.doc_count, ldaModel.K

    # Step 1: Learn the topics using vanilla LDA
    if method == TF_IDF:
        # First do TF
        docLens = np.squeeze(np.array(data.words.sum(axis=1)))
        reps = data.words.copy()
        #reps /= docLens[:, np.newaxis] replaced with line below to retain sparsity
        reps = ssp.diags(1.0 / docLens, 0).dot(reps)  # true division, safe even if docLens is integer-typed

        occ  = data.words.astype(bool).astype(dtype)
        docCount = np.squeeze(np.array(occ.sum(axis=0)))
        docCount += 1
        idf = np.log(D / docCount)

        # reps *= idf[np.newaxis, :]
        reps = reps.dot(ssp.diags(idf, 0))
    elif method == LDA:
        plan = lda.newTrainPlan(iterations, logFrequency=logFrequency, debug=debug)
        if isQuery:
            _, ldaTopics = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        elif ldaTopics is None or not ldaTopics.processed:
            ldaModel, ldaTopics, (_, _, _) = lda.train(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        reps = np.sqrt(lda.topicDists(ldaTopics))
    else:
        raise ValueError("Unknown method %s" % method)


    return ModelState(ldaModel, K, method, dtype, modelName), \
           QueryState(reps, ldaTopics), \
           ([0], [0], [0])
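
The TF-IDF branch above keeps the term-document matrix sparse by expressing the row scaling (1/docLen) and the column scaling (IDF) as multiplications with diagonal matrices instead of dense broadcasting. A minimal, self-contained sketch of that trick on toy counts (names mirror the locals above; values are invented):

import numpy as np
import scipy.sparse as ssp

# Toy 3-document x 4-term count matrix, kept in CSR form throughout
counts = ssp.csr_matrix(np.array([[2, 0, 1, 0],
                                  [0, 3, 0, 1],
                                  [1, 1, 0, 0]], dtype=np.float64))
D = counts.shape[0]

docLens  = np.squeeze(np.asarray(counts.sum(axis=1)))        # tokens per document
tf       = ssp.diags(1.0 / docLens, 0).dot(counts)           # left-multiply by a diagonal: scales each row

occ      = counts.astype(bool).astype(np.float64)            # 0/1 term-occurrence matrix
docCount = np.squeeze(np.asarray(occ.sum(axis=0))) + 1       # documents containing each term, smoothed
idf      = np.log(D / docCount)
tfidf    = tf.dot(ssp.diags(idf, 0))                         # right-multiply by a diagonal: scales each column

print(tfidf.toarray())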
Example #3
def link_probs(model, train_tops, query_tops, min_link_probs, docSubset=None):
    '''
    Generate the probability of a link for all possible pairs of documents,
    but only store those probabilities that are bigger than or equal to the
    minimum. This ensures, hopefully, that we don't materialise a complete
    DxD matrix, but rather the minimum needed to determine the mean
    average precisions

    :param model: the trained model
    :param train_tops: the representations of the link-target documents
    :param query_tops: the representations of the link-origin documents
    :param min_link_probs: the minimum link probability for each document
        in the subset
    :param docSubset: a list of documents to consider for evaluation. If
        None, all documents are considered.
    :return: a (hopefully) sparse len(docSubset)xD matrix of link probabilities
    '''
    scale    = model.scale
    src_tops = lda.topicDists(query_tops.ldaQuery)
    dst_offs = train_tops.offsetTopicDists

    # Determine the size of the output
    D = dst_offs.shape[0]
    if docSubset is None:
        docSubset = list(range(src_tops.shape[0]))
    Q = len(docSubset)

    # We build the result up as a COO matrix
    rows = []
    cols = []
    vals = []

    # Infer the link probabilities
    outRow = -1
    for src in docSubset:
        outRow    += 1
        probs      = scale * dst_offs.dot(src_tops[src,:])
        relevant   = np.where(probs >= min_link_probs[outRow] - 1E-9)[0]

        rows.extend([outRow] * len(relevant))
        cols.extend(relevant)
        vals.extend(probs[relevant])

    # Build the COO matrix, then convert it to CSR. Convert the lists to numpy
    # arrays to ensure appropriate dtypes
    r = np.array(rows, dtype=np.int32)
    c = np.array(cols, dtype=np.int32)
    v = np.array(vals, dtype=model.dtype)

    return ssp.coo_matrix((v, (r, c)), shape=(Q, D)).tocsr()
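
The loop above scores every candidate target with one dot product, keeps only the entries at or above the per-query minimum, and assembles the survivors as a COO matrix before converting to CSR. A self-contained toy run of that pattern; the topic vectors, scale and minima below are invented stand-ins for the real quantities:

import numpy as np
import scipy.sparse as ssp

scale    = 1.0
src_tops = np.array([[0.6, 0.4],                   # 2 query docs x K=2 topics (invented)
                     [0.1, 0.9]])
dst_offs = np.array([[0.5, 0.5],                   # 3 target docs x K=2 topics (invented)
                     [0.9, 0.1],
                     [0.2, 0.8]])
min_link_probs = np.array([0.45, 0.70])            # per-query minima, e.g. from min_link_probs()

rows, cols, vals = [], [], []
for outRow in range(src_tops.shape[0]):
    probs    = scale * dst_offs.dot(src_tops[outRow, :])            # score all targets at once
    relevant = np.where(probs >= min_link_probs[outRow] - 1E-9)[0]  # keep only the sufficiently likely links
    rows.extend([outRow] * len(relevant))
    cols.extend(relevant)
    vals.extend(probs[relevant])

result = ssp.coo_matrix((np.array(vals),
                         (np.array(rows, dtype=np.int32), np.array(cols, dtype=np.int32))),
                        shape=(src_tops.shape[0], dst_offs.shape[0])).tocsr()
print(result.toarray())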
Example #4
def lda_topics(ldaQuery):
    # Gibbs query-state objects expose a numSamples attribute; VB ones do not,
    # so use that to dispatch to the matching topicDists implementation.
    if hasattr(ldaQuery, "numSamples"):
        return lda_gibbs.topicDists(ldaQuery)
    else:
        return lda_vb.topicDists(ldaQuery)
Example #5
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
            matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iteration, VB-bound and log-likelihood measurements
    '''
    ldaPlan, iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.ldaPlan, trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, noiseVar, predVar, scale, dtype = \
        model.ldaModel, model.noiseVar, model.predVar, model.scale, model.dtype
    ldaQuery, offsetTopicDists = \
        query.ldaQuery, query.offsetTopicDists

    D, K = data.doc_count, ldaModel.K
    epsilon = 0.01 * D * K if epsilon is None else epsilon
    tau = [predVar[0], predVar[1]]

    # Step 1: Learn the topics using vanilla LDA
    print (time.strftime('%X') + " Beginning Topic Inference")
    if isQuery:
        _, ldaQuery = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), ldaPlan)
    elif not ldaModel.processed:
        ldaModel, ldaQuery, (_, _, _) = lda.train(data, ldaModel, ldaQuery, ldaPlan)
    print (time.strftime('%X') + " Topic Inference Completed")

    tops = lda.topicDists(ldaQuery)
    offs = tops.copy()
    topsSum = tops.T.dot(tops)

    # Step 2: reverse the links matrix so we can talk about the origin (not target) of links
    inlinks = data.links.T.tocsr()

    # Step 3: Learn the scaling factor and offsets for each link's target-doc till converged
    print ("Learning Offsets")
    for itr in range(iterations):
        print ("Iteration " + str(itr), end=": ")

        # Record the current scale of the offsets
        before = la.norm(offs / scale)

        # Update the scale
        lhs, rhs = 0, 0
        for p in range(data.doc_count):
            lhs += (tau[1] - tau[0]) * (tops[inlinks[p,:].indices,:].dot(offs[p,:]) ** 2).sum()
            lhs += tau[0] * (offs[p,:].dot(topsSum).dot(offs[p,:]) - offs[p,:].dot(np.outer(tops[p,:],tops[p,:])).dot(offs[p,:]))
            rhs += tau[1] * tops[inlinks[p,:].indices,:].dot(offs[p,:]).sum()

        scale = rhs / lhs

        # Update the offset for every target doc
        for p in range(data.doc_count):
            lhs  = (tau[1] - tau[0]) * np.einsum("dj,k->jk", tops[inlinks[p,:].indices,:], tops[p,:])
            lhs += tau[0] * (np.einsum("dj,k->jk", tops, tops[p,:]) - np.outer(tops[p,:], tops[p,:]))
            lhs *= (scale * scale)
            lhs[np.diag_indices_from(lhs)] += noiseVar

            rhs  = tops[p,:] + scale * tau[1] * tops[inlinks[p,:].indices,:].sum(axis=0)

            offs[p,:] = la.inv(lhs).dot(rhs)

        # Check whether the offsets have changed significantly
        after = la.norm(offs / scale)
        print ("%f --> %f. scale=%f" % (before, after, scale))
        if abs(before - after) < epsilon:
            break

    return ModelState(ldaModel, K, noiseVar, predVar, scale, dtype, MODEL_NAME), \
           QueryState(ldaQuery, offs), \
           ([0], [0], [0])
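
Each offset update above solves a KxK linear system by forming an explicit inverse, la.inv(lhs).dot(rhs) (assuming la is numpy.linalg or scipy.linalg). As a side note rather than part of the original code: for a symmetric system of this shape, a direct solve is usually cheaper and better conditioned, as the toy comparison below illustrates; lhs and rhs here are randomly generated stand-ins for the real quantities.

import numpy as np

K   = 4
rng = np.random.default_rng(0)
A   = rng.standard_normal((K, K))
lhs = A.dot(A.T) + np.eye(K)              # symmetric positive-definite, like scale^2 * (...) + noiseVar * I
rhs = rng.standard_normal(K)

x_inv   = np.linalg.inv(lhs).dot(rhs)     # what the update loop above does
x_solve = np.linalg.solve(lhs, rhs)       # direct solve: same answer, avoids forming the inverse

print(np.allclose(x_inv, x_solve))        # True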