import numpy as np
import scipy.sparse as ssp

# rowwise_softmax, colwise_softmax, softmax and sparseScalarProductOfSafeLnDot
# are assumed to be project-local helpers imported elsewhere in this module.


def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    '''
    probs    = rowwise_softmax(queryState.outMeans)   # per-document topic distributions
    doc_dist = colwise_softmax(queryState.inMeans)    # per-topic target-document distributions

    # Log-likelihood of the observed words: for each non-zero entry of
    # data.words, count * ln(p(word | doc)), evaluated sparsely.
    word_likely = np.sum(
        sparseScalarProductOfSafeLnDot(
            data.words,
            probs,
            modelState.vocab
        ).data
    )

    # Log-likelihood of the observed links, computed analogously.
    link_likely = np.sum(
        sparseScalarProductOfSafeLnDot(
            data.links,
            probs,
            doc_dist
        ).data
    )

    return word_likely + link_likely
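
# Hedged reference sketch (not part of the original module): a dense
# equivalent of the word term above, assuming sparseScalarProductOfSafeLnDot(W, A, B)
# evaluates W * safe_ln(A.dot(B)) element-wise over W's non-zeros, where the
# "safe" log maps ln(0) to 0 rather than -inf. The helper name is hypothetical.
def _dense_word_log_likelihood_sketch(W, probs, vocab):
    '''Reference (dense) computation of the word log-likelihood term.'''
    dist = probs.dot(vocab)               # D x V matrix: p(word | document)
    with np.errstate(divide='ignore'):
        ln_dist = np.log(dist)            # ln(0) becomes -inf here...
    ln_dist[np.isinf(ln_dist)] = 0        # ...which the "safe" log zeroes out
    return np.sum(W.toarray() * ln_dist)  # weight each log-prob by its count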

def link_probs(model, train_tops, query_tops, min_link_probs, docSubset=None):
    '''
    Generate the probability of a link for all possible pairs of documents,
    but only store those probabilities that are greater than or equal to the
    minimum. This ensures, hopefully, that we don't materialise a complete
    DxD matrix, but rather the minimum needed to determine the mean average
    precisions

    :param model: the trained model
    :param train_tops: the representations of the link-target documents
    :param query_tops: the representations of the link-origin documents
    :param min_link_probs: the minimum link probability for each document
        in the subset
    :param docSubset: a list of documents to consider for evaluation. If
        None, all documents are considered.
    :return: a (hopefully) sparse len(docSubset) x D matrix of link
        probabilities
    '''
    # We build the result up as a COO matrix
    rows = []
    cols = []
    vals = []

    # Determine the size of the output
    D = train_tops.outMeans.shape[0]
    if docSubset is None:
        docSubset = list(range(query_tops.outMeans.shape[0]))
    Q = len(docSubset)

    # Calculate the softmax transform parameters
    dstTopDists = colwise_softmax(train_tops.inMeans)

    # Infer the link probabilities, one query document at a time
    for outRow, src in enumerate(docSubset):
        srcTopDist = softmax(query_tops.outMeans[src, :])
        probs = dstTopDists.dot(srcTopDist)
        relevant = np.where(probs >= min_link_probs[outRow] - 1E-9)[0]

        rows.extend([outRow] * len(relevant))
        cols.extend(relevant)
        vals.extend(probs[relevant])

    # Build the COO matrix, then convert it to CSR. Convert the lists to
    # numpy arrays to ensure appropriate dtypes.
    r = np.array(rows, dtype=np.int32)
    c = np.array(cols, dtype=np.int32)
    v = np.array(vals, dtype=model.dtype)

    return ssp.coo_matrix((v, (r, c)), shape=(Q, D)).tocsr()
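
# Hedged usage sketch (synthetic data, hypothetical names): exercising the
# subset-aware link_probs above. SimpleNamespace stands in for the real model
# and topic-state classes, whose exact types are an assumption here.
from types import SimpleNamespace

def _demo_subset_link_probs():
    rng = np.random.RandomState(0)
    K, D, Q = 4, 6, 3                                  # topics, target docs, query docs
    train = SimpleNamespace(outMeans=rng.randn(D, K), inMeans=rng.randn(D, K))
    query = SimpleNamespace(outMeans=rng.randn(Q, K), inMeans=rng.randn(Q, K))
    model = SimpleNamespace(dtype=np.float64)
    mins  = np.full(Q, 0.01)                           # keep candidates with p >= 1%
    return link_probs(model, train, query, mins, docSubset=list(range(Q)))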

def link_probs(model, topics, min_link_probs):
    '''
    Generate the probability of a link for all possible pairs of documents,
    but only store those probabilities that are greater than or equal to the
    minimum. This ensures, hopefully, that we don't materialise a complete
    DxD matrix, but rather the minimum needed to determine the mean average
    precisions

    :param model: the trained model
    :param topics: the topics for each of the documents we're generating
        links for
    :param min_link_probs: the minimum link probability for each document
    :return: a (hopefully) sparse DxD matrix of link probabilities
    '''
    # We build the result up as a COO matrix
    rows = []
    cols = []
    vals = []

    # Calculate the softmax transform parameters
    D = topics.means.shape[0]
    linkDist = colwise_softmax(topics.means)

    # Infer the link probabilities
    for d in range(D):
        topDistAtD = softmax(topics.means[d, :])
        probs = linkDist.dot(topDistAtD)
        relevant = np.where(probs >= min_link_probs[d] - 1E-9)[0]

        rows.extend([d] * len(relevant))
        cols.extend(relevant)
        vals.extend(probs[relevant])

    # Build the COO matrix, then convert it to CSR. Convert the lists to
    # numpy arrays to ensure appropriate dtypes.
    r = np.array(rows, dtype=np.int32)
    c = np.array(cols, dtype=np.int32)
    v = np.array(vals, dtype=model.dtype)

    return ssp.coo_matrix((v, (r, c)), shape=(D, D)).tocsr()
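
# Hedged sketch of a hypothetical companion helper (not from the original
# module): one plausible way to derive the min_link_probs argument, taking for
# each document the predicted probability of its least likely *observed* link,
# so link_probs keeps every candidate ranked at or above a true link.
def _min_link_probs_sketch(topics, links):
    '''links: sparse D x D adjacency matrix of observed links, in CSR format.'''
    D = topics.means.shape[0]
    linkDist = colwise_softmax(topics.means)       # D x K target distributions
    mins = np.ones((D,))
    for d in range(D):
        targets = links[d, :].indices              # documents that d links to
        if len(targets) == 0:
            continue                               # no links: default of 1.0 keeps nothing
        probs = linkDist[targets, :].dot(softmax(topics.means[d, :]))
        mins[d] = probs.min()                      # least likely observed link
    return mins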