def min_link_probs(model, train_tops, query_tops, links, docSubset=None):
    '''
    For every document, for each of the given links, determine the probability
    of the least likely link (i.e. the document-specific minimum of probabilities).

    :param model: the model object
    :param train_tops: the representations of the link-target documents
    :param query_tops: the representations of the link-origin documents
    :param links: a DxD matrix of links for each document (row)
    :param docSubset: a list of documents to consider for evaluation. If None,
    all documents are considered.
    :return: a vector with the minimum out-link probabilities for each document
    in the subset
    '''
    scale    = model.scale
    src_tops = lda.topicDists(query_tops.ldaQuery)
    dst_offs = train_tops.offsetTopicDists

    if docSubset is None:
        docSubset = [q for q in range(src_tops.shape[0])]
    Q = len(docSubset)

    mins = np.empty((Q,), dtype=model.dtype)
    outRow = -1
    for src in docSubset:
        outRow += 1
        probs = []
        for i in range(len(links[src, :].indices)):  # For each query link-target doc
            dst = links[src, :].indices[i]           # which we denote dst
            linkProb = scale * src_tops[src, :].dot(dst_offs[dst, :])
            probs.append(linkProb)
        mins[outRow] = min(probs) if len(probs) > 0 else -1

    return mins
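# Illustrative sketch (not part of the original module): the per-link score used
# by min_link_probs is just the scaled inner product between a source document's
# topic distribution and a target document's offset representation. The arrays
# and link structure below are toy values, assumed purely for demonstration.
def _example_min_out_link_prob():
    import numpy as np
    import scipy.sparse as ssp

    scale    = 0.5
    src_tops = np.array([[0.7, 0.3], [0.2, 0.8], [0.5, 0.5]])  # link-origin representations
    dst_offs = np.array([[0.6, 0.4], [0.1, 0.9], [0.5, 0.5]])  # offset link-target representations
    links    = ssp.csr_matrix(np.array([[0, 1, 1],
                                        [1, 0, 0],
                                        [0, 0, 0]]))

    # Minimum out-link probability for document 0, mirroring the loop above
    dsts  = links[0, :].indices
    probs = [scale * src_tops[0, :].dot(dst_offs[d, :]) for d in dsts]
    return min(probs) if probs else -1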
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
    matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
    log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iteration, vb-bound measurement and log-likelihood measurement
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, method, K, dtype, modelName = \
        model.ldaModel, model.method, model.K, model.dtype, model.name
    ldaTopics = query.ldaTopics

    D, K = data.doc_count, ldaModel.K

    # Step 1: Learn the topics using vanilla LDA
    if method == TF_IDF:
        # First do TF
        docLens = np.squeeze(np.array(data.words.sum(axis=1)))
        reps = data.words.copy()
        # reps /= docLens[:, np.newaxis] replaced with line below to retain sparsity
        reps = ssp.diags(np.reciprocal(docLens), 0).dot(reps)

        # Then do IDF
        occ = data.words.astype(np.bool).astype(dtype)
        docCount = np.squeeze(np.array(occ.sum(axis=0)))
        docCount += 1
        idf = np.log(D / docCount)

        # reps *= idf[np.newaxis, :]
        reps = reps.dot(ssp.diags(idf, 0))
    elif method == LDA:
        plan = lda.newTrainPlan(iterations, logFrequency=logFrequency, debug=debug)
        if isQuery:
            _, ldaTopics = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        elif ldaTopics is None or not ldaTopics.processed:
            ldaModel, ldaTopics, (_, _, _) = lda.train(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        reps = np.sqrt(lda.topicDists(ldaTopics))
    else:
        raise ValueError("Unknown method %s" % method)

    return ModelState(ldaModel, K, method, dtype, modelName), \
           QueryState(reps, ldaTopics), \
           ([0], [0], [0])
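# Illustrative sketch (not part of the original module): the two commented-out
# dense expressions in the TF_IDF branch above are replaced by multiplications
# with sparse diagonal matrices, so the document-term matrix never densifies.
# The toy counts below are assumed purely for demonstration.
def _example_sparse_tfidf():
    import numpy as np
    import scipy.sparse as ssp

    counts  = ssp.csr_matrix(np.array([[2., 0., 1.],
                                       [0., 3., 1.]]))
    docLens = np.squeeze(np.array(counts.sum(axis=1)))          # words per document

    # Row-scale: divide each row by its length while staying sparse
    tf = ssp.diags(np.reciprocal(docLens), 0).dot(counts)

    # Column-scale: multiply each column by its (smoothed) IDF weight, again staying sparse
    docCount = np.squeeze(np.array((counts > 0).sum(axis=0))) + 1.0
    idf      = np.log(counts.shape[0] / docCount)
    return tf.dot(ssp.diags(idf, 0))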
def link_probs(model, train_tops, query_tops, min_link_probs, docSubset=None):
    '''
    Generate the probability of a link for all possible pairs of documents,
    but only store those probabilities that are bigger than or equal to the
    minimum. This ensures, hopefully, that we don't materialise a complete
    DxD matrix, but rather the minimum needed to determine the mean average
    precisions.

    :param model: the trained model
    :param train_tops: the representations of the link-target documents
    :param query_tops: the representations of the link-origin documents
    :param min_link_probs: the minimum link probability for each document
    in the subset
    :param docSubset: a list of documents to consider for evaluation. If None,
    all documents are considered.
    :return: a (hopefully) sparse len(docSubset)xD matrix of link probabilities
    '''
    scale    = model.scale
    src_tops = lda.topicDists(query_tops.ldaQuery)
    dst_offs = train_tops.offsetTopicDists

    # Determine the size of the output
    D = dst_offs.shape[0]
    if docSubset is None:
        docSubset = [q for q in range(src_tops.shape[0])]
    Q = len(docSubset)

    # We build the result up as a COO matrix
    rows = []
    cols = []
    vals = []

    # Infer the link probabilities
    outRow = -1
    for src in docSubset:
        outRow += 1
        probs    = scale * dst_offs.dot(src_tops[src, :])
        relevant = np.where(probs >= min_link_probs[outRow] - 1E-9)[0]

        rows.extend([outRow] * len(relevant))
        cols.extend(relevant)
        vals.extend(probs[relevant])

    # Build the COO matrix, then convert it to CSR. Convert the lists to numpy
    # arrays to ensure appropriate dtypes
    r = np.array(rows, dtype=np.int32)
    c = np.array(cols, dtype=np.int32)
    v = np.array(vals, dtype=model.dtype)

    return ssp.coo_matrix((v, (r, c)), shape=(Q, D)).tocsr()
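# Illustrative sketch (not part of the original module): min_link_probs and
# link_probs are intended to be used as a pair when scoring held-out links --
# the first fixes a per-document threshold, the second materialises only the
# scores at or above it. The objects `model`, `train_tops`, `query_tops` and
# `test_links` below are placeholders for whatever evaluation harness calls
# this module; the ranking step is likewise only indicative.
def _example_link_evaluation(model, train_tops, query_tops, test_links):
    import numpy as np

    mins  = min_link_probs(model, train_tops, query_tops, test_links)
    probs = link_probs(model, train_tops, query_tops, mins)   # sparse Q x D CSR matrix

    # Rank candidate targets per query document; the true links in test_links
    # could then be scored against this ranking (e.g. MAP or precision@k).
    rankings = []
    for q in range(probs.shape[0]):
        row = probs[q, :].toarray().ravel()
        rankings.append(np.argsort(-row))
    return rankings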
def lda_topics(ldaQuery):
    # Dispatch on the presence of the numSamples attribute, which distinguishes
    # the Gibbs-sampling query state from the variational one
    if "numSamples" in dir(ldaQuery):
        return lda_gibbs.topicDists(ldaQuery)
    else:
        return lda_vb.topicDists(ldaQuery)
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
    matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
    log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iteration, vb-bound measurement and log-likelihood measurement
    '''
    ldaPlan, iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.ldaPlan, trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, noiseVar, predVar, scale, dtype = \
        model.ldaModel, model.noiseVar, model.predVar, model.scale, model.dtype
    ldaQuery, offsetTopicDists = \
        query.ldaQuery, query.offsetTopicDists

    D, K = data.doc_count, ldaModel.K
    epsilon = 0.01 * D * K if epsilon is None else epsilon
    tau = [predVar[0], predVar[1]]

    # Step 1: Learn the topics using vanilla LDA
    print (time.strftime('%X') + " Beginning Topic Inference")
    if isQuery:
        _, ldaQuery = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), ldaPlan)
    elif not ldaModel.processed:
        ldaModel, ldaQuery, (_, _, _) = lda.train(data, ldaModel, ldaQuery, ldaPlan)
    print (time.strftime('%X') + " Topic Inference Completed")

    tops = lda.topicDists(ldaQuery)
    offs = tops.copy()
    topsSum = tops.T.dot(tops)

    # Step 2: reverse the links matrix so we can talk about the origin (not target) of links
    inlinks = data.links.T.tocsr()

    # Step 3: Learn the scaling factor and offsets for each link's target-doc till converged
    print ("Learning Offsets")
    for itr in range(iterations):
        print ("Iteration " + str(itr), end=": ")

        # Record the current scale of the offsets
        before = la.norm(offs / scale)

        # Update the scale
        lhs, rhs = 0, 0
        for p in range(data.doc_count):
            lhs += (tau[1] - tau[0]) * (tops[inlinks[p, :].indices, :].dot(offs[p, :]) ** 2).sum()
            lhs += tau[0] * (offs[p, :].dot(topsSum).dot(offs[p, :]) - offs[p, :].dot(np.outer(tops[p, :], tops[p, :])).dot(offs[p, :]))
            rhs += tau[1] * tops[inlinks[p, :].indices, :].dot(offs[p, :]).sum()
        scale = rhs / lhs

        # Update the offset for every target doc
        for p in range(data.doc_count):
            lhs  = (tau[1] - tau[0]) * np.einsum("dj,k->jk", tops[inlinks[p, :].indices, :], tops[p, :])
            lhs += tau[0] * (np.einsum("dj,k->jk", tops, tops[p, :]) - np.outer(tops[p, :], tops[p, :]))
            lhs *= (scale * scale)
            lhs[np.diag_indices_from(lhs)] += noiseVar
            rhs = tops[p, :] + scale * tau[1] * tops[inlinks[p, :].indices, :].sum(axis=0)

            offs[p, :] = la.inv(lhs).dot(rhs)

        # Check whether the offsets have changed significantly
        after = la.norm(offs / scale)
        print ("%f --> %f. scale=%f" % (before, after, scale))
        if abs(before - after) < epsilon:
            break

    return ModelState(ldaModel, K, noiseVar, predVar, scale, dtype, MODEL_NAME), \
           QueryState(ldaQuery, offs), \
           ([0], [0], [0])
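# Implementation note (illustrative, not part of the original module): the
# offset update above solves a K x K linear system per document by forming an
# explicit inverse, la.inv(lhs).dot(rhs). Solving the system directly with
# la.solve(lhs, rhs) gives the same result and is generally cheaper and more
# numerically stable. The toy system below is assumed purely for demonstration.
def _example_solve_vs_inv(K=4, seed=0):
    import numpy as np
    import numpy.linalg as la

    rng = np.random.default_rng(seed)
    A   = rng.standard_normal((K, K))
    lhs = A.dot(A.T) + np.eye(K)      # well-conditioned toy system standing in for the per-document lhs
    rhs = rng.standard_normal(K)

    x_inv   = la.inv(lhs).dot(rhs)    # what the training loop does
    x_solve = la.solve(lhs, rhs)      # equivalent, without forming the inverse
    return np.allclose(x_inv, x_solve)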