def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Evaluate out-of-sample link prediction, holding out one feature at a time.

    Take a feature list. Split the data with ``data.split_on_feature`` using
    the ids of every feature still in the list, train on the training part,
    query the model with the query part and evaluate link prediction. Then
    drop the first feature from the list and repeat until the list is empty.

    For every feature the precision and recall at the cut-offs in ``ms``, the
    mean reciprocal-rank and the mean average-precision are printed. Once all
    features are processed, the combined precision / recall tables and the
    document-count weighted averages of the two rank metrics are printed.

    The caller's ``feature_mask`` is not modified: the function works on a
    private copy.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start
    of each run
    :param train_plan: the training plan (number of iterations etc.). It is
    also used as the plan for the query step.
    :param feature_mask: the list of features used to separate training from
    query. This is a list of tuples, the left side is the feature label, the
    right side is the feature id handed to ``data.split_on_feature``.
    :param model_dir: if not none, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a
    pre-trained LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given
    by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        # Undirected link predictors get a symmetrised, binarised copy; the
        # original data set is left untouched.
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        return data

    def weighted_mean(total, doc_count):
        # Guards the final averages against an empty set of scored documents.
        return total / doc_count if doc_count > 0 else float("nan")

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    # Work on a copy so the caller's list is not emptied as a side effect.
    features = list(feature_mask)
    if not features:
        print ("No features supplied: nothing to evaluate")
        return model_files

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, map_sum = 0, 0
    query_doc_count = 0

    for position, (feat_label, feat_id) in enumerate(features):
        # Prepare the training and query data. The split uses every feature
        # that has not been processed yet, *including* the current one.
        feature_mask_indices = [i for _, i in features[position:]]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        # make symmetric, if necessary, after split, so we
        # can compare symmetric with non-symmetric models
        train_data = prepareForTraining(train_data)

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        train_like = mdl.log_likelihood(train_data, model, train_tops)
        print ("Training perplexity is %.2f " % perplexity_from_like(train_like, train_data.word_count))

        # Infer the expected link probabilities
        query_tops = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links = query_data.links
        fold_doc_count = expected_links.shape[0]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall", recs, doc_counts, ms)

        # Both calls must see the same (old) combi_dcounts, so the counts are
        # only replaced by the second call.
        combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum += mrr * fold_doc_count

        # Evaluation 3/3: Mean Average-Precision
        mean_avg_prec = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % mean_avg_prec)
        map_sum += mean_avg_prec * fold_doc_count

        query_doc_count += fold_doc_count

        # Save the files if necessary and move onto the next fold if required
        # NOTE(review): train_tops is passed twice here, whereas
        # cross_val_and_eval_perplexity passes query_tops in the second
        # position - confirm whether query_tops was intended.
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall", combi_recs, combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % weighted_mean(mrr_sum, query_doc_count))
    print("Mean average-precision: %f" % weighted_mean(map_sum, query_doc_count))

    return model_files
def _testOnModelDerivedExample(self):
    '''
    Cross-validate the CTM on data sampled from the model itself.

    A corpus is drawn with ``self._sampleFromModel`` and split into folds.
    For each of the two ``fastButInaccurate`` settings (reported as "full"
    and "diagonal" covariances) a model is trained and queried on every
    fold, the bound and likelihood curves are plotted, and finally the
    train and query perplexity of every fold is printed and required to be
    below 60.
    '''
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055) # Global init for repeatable test
    D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
    _, vocab, _, W = self._sampleFromModel(D, T, K)
    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize
    maxPerplexity = 60.0

    for useDiagonalPriorCov in [False, True]:
        trainLikely = []
        trainWordCount = []
        queryLikely = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets. Indices wrap around modulo D, so the query
            # rows are the ones directly following the training rows.
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet,:]
            W_query = W[querySet,:]

            # Train the model
            model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
            model, queryState, (bndItrs, bndVals, likelies) = ctm.train (W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, likelies, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            fig.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
            plt.show()

            # Calculating the training set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model.
            plan = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold.
        print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print(" Perplexity: %12.2f \t Perplexity: %12.2f" % (trainPerp, queryPerp))

            # assertLess reports both operands when the check fails.
            self.assertLess(queryPerp, maxPerplexity) # Maximum perplexity.
            self.assertLess(trainPerp, maxPerplexity)
        print ("\n\n")

    print("End of Test")
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Uses cross-validation to get the average perplexity.

    If num_folds == 1 a special path is triggered where the model is trained
    on all of the data and perplexity is evaluated on that same training
    data. The result is still handed to ``save_if_necessary``, with the
    training state used in place of a query state.

    Otherwise, for each fold the model is trained on the training split, the
    query split is divided with ``doc_completion_split`` into a part used to
    infer the query state and a part used to evaluate likelihood, and the
    per-fold perplexities are printed. Totals over all finished folds are
    printed at the end. An exception raised while processing a fold is not
    caught and aborts the whole run.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start
    of each cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param query_plan: the query plan (number of iterations etc.)
    :param num_folds: the number of folds to cross validate
    :param fold_run_count: for debugging, stop early after processing this
    number of folds. Values below 1 mean "run all folds".
    :param model_dir: if not none, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        train_tops = mdl.newQueryState(data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = mdl.train(data, model, train_tops, train_plan)

        likely = mdl.log_likelihood(data, model, train_tops)
        perp = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        # There is no separate query state on this path, so the training
        # state is supplied twice.
        model_files = save_if_necessary(model_files, model_dir, model, data, 0, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        return model_files

    query_like_sum = 0   # to calculate the overall likelihood and
    query_wcount_sum = 0 # perplexity for the whole dataset
    train_like_sum = 0
    train_wcount_sum = 0
    folds_finished = 0   # count of folds that finished successfully

    # A failing fold propagates its exception, so the folds that run are
    # exactly the first min(num_folds, fold_run_count) ones.
    for fold in range(min(num_folds, fold_run_count)):
        train_data, query_data = data.cross_valid_split(fold, num_folds)

        # Train the model
        print ("Duplicating model template... ", end="")
        model = mdl.newModelFromExisting(sample_model)
        print ("Done.\nCreating query state...")
        train_tops = mdl.newQueryState(train_data, model)

        print ("Starting training")
        model, train_tops, (train_itrs, train_vbs, train_likes) \
            = mdl.train(train_data, model, train_tops, train_plan)

        train_like = mdl.log_likelihood (train_data, model, train_tops)
        train_word_count = train_data.word_count
        train_perp = perplexity_from_like(train_like, train_word_count)

        print ("DEBUG Train perplexity is " + str(train_perp))

        # Query the model - if there are no features we need to split the text
        print ("Starting query.")
        query_estim, query_eval = query_data.doc_completion_split()
        query_tops = mdl.newQueryState(query_estim, model)
        model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

        query_like = mdl.log_likelihood(query_eval, model, query_tops)
        query_word_count = query_eval.word_count
        query_perp = perplexity_from_like(query_like, query_word_count)

        # Keep a record of the cumulative likelihood and query-set word-count
        train_like_sum += train_like
        train_wcount_sum += train_word_count
        query_like_sum += query_like
        query_wcount_sum += query_word_count
        folds_finished += 1

        # Write out the output
        print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
        print("")

        # Save the model
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, query_tops, mdl)

    if folds_finished == 0:
        # Nothing was accumulated, so there are no totals to report.
        print ("No folds were run: nothing to report")
        return model_files

    print ("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, perplexity_from_like(train_like_sum, train_wcount_sum)))
    print ("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, perplexity_from_like(query_like_sum, query_wcount_sum)))

    return model_files