def testOnRealData(self):
    print("CTM/Bouchard")
    rd.seed(0xBADB055)
    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    docLens   = np.squeeze(np.asarray(W.sum(axis=1)))
    good_rows = (np.where(docLens > 0.5))[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
    W = W[good_rows, :]

    # IDF frequency for when we print out the vocab later
    freq  = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 20
    model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model, and then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(W, None, model, queryState, trainPlan)
    with open(newModelFileFromModel(model), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training.
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.suptitle("CTM/Bouchard (Identity Cov) on NIPS")
    fig.show()
    plt.show()

    # Plot the topic-word distributions
    plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Print out the most likely topic words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
    print("\t\t".join(["Topic " + str(k) for k in range(K)]))
    print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                              for k in range(K))
                    for c in range(topWordCount)))
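
# The test above assumes a topWordInds helper defined on the test class. Its real
# definition is elsewhere; the standalone sketch below is only an illustration of what
# it presumably does (the name and signature here are assumptions): return the indices
# of the `count` largest entries of an IDF-scaled vocabulary row.
def _sketch_top_word_inds(weighted_vocab_row, count):
    import numpy as np
    # argsort is ascending, so take the last `count` positions and reverse them into
    # descending order of weight
    return np.argsort(weighted_vocab_row)[-count:][::-1]
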
def link_split_map(data, mdl, sample_model, train_plan, folds, model_dir=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using mean average-precision.

    Cross-validation may be used, but note we're always evaluating on training data.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param folds: the number of folds for cross-validation
    :param model_dir: if not None, and folds > 1, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    for fold in range(folds):
        model = mdl.newModelFromExisting(sample_model)
        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data)  # make symmetric, if necessary, after split, so we
                                                     # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print("Training perplexity is %.2f " %
              perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)

        map = mean_average_prec(query_data.links, predicted_link_probs)
        print("Fold %2d: Mean-Average-Precision %6.3f" % (fold, map))

        model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                        train_itrs, train_vbs, train_likes,
                                        train_tops, train_tops, mdl)

    return model_files
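
# link_split_map() evaluates with mean_average_prec(), which is defined elsewhere. The
# self-contained sketch below is only an illustration of that metric (an assumption
# about its behaviour, not the project's implementation): for each document, rank the
# candidate links by predicted probability, average the precision at each rank where a
# true link appears, then average those per-document scores.
def _sketch_mean_average_prec(true_links, scores):
    import numpy as np
    true_links = np.asarray(true_links)
    scores     = np.asarray(scores)
    ap_values = []
    for d in range(true_links.shape[0]):
        positives = np.flatnonzero(true_links[d] > 0)
        if len(positives) == 0:
            continue  # documents with no held-out links contribute nothing
        ranking = np.argsort(-scores[d])              # best-scored candidates first
        hits, precisions = 0, []
        for rank, candidate in enumerate(ranking, start=1):
            if true_links[d, candidate] > 0:
                hits += 1
                precisions.append(hits / rank)        # precision at this hit's rank
        ap_values.append(np.mean(precisions))
    return float(np.mean(ap_values))
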
def newQueryState(data, modelState):
    '''
    Creates a new CTM QueryState object. This contains all parameters and random
    variables tied to individual datapoints.

    Param:
    data       - the dataset of words, features and links, of which only words and
                 features are used in this model
    modelState - the model state object

    Return:
    A QueryState object
    '''
    base = ctm.newQueryState(data, modelState)

    return QueryState(base.means, base.expMeans, base.varcs, base.lxi, base.s, base.docLens)
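
# For reference, the QueryState constructed above is a plain record of per-document
# variational parameters. The real QueryState is defined elsewhere in the package; the
# stand-in below is a minimal sketch that only mirrors the fields referenced above.
from collections import namedtuple
_QueryStateSketch = namedtuple("_QueryStateSketch", "means expMeans varcs lxi s docLens")
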
def _testOnModelHandcraftedData(self):
    #
    # Create the vocab
    #
    T = 3 * 3
    K = 5

    # Horizontal bars
    vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3, 3)).todense()
    vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3, 3)).todense()

    # Vertical bars
    vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3, 3)).todense()
    #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3, 3)).todense()
    vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3, 3)).todense()

    # Diagonals
    vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3, 3)).todense()

    # Put together
    T = vocab1.shape[0] * vocab1.shape[1]
    vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

    # Create a single matrix with the flattened vocabularies
    vocabVectors = []
    for vocab in vocabs:
        vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T)))))

    vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

    # Plot the vocab
    ones = np.ones(vocabs[0].shape)
    for k in range(K):
        plt.subplot(2, 3, k + 1)  # subplot indices are 1-based
        plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r)
    plt.show()

    #
    # Create the corpus
    #
    rd.seed(0xC0FFEE)
    D = 1000

    # Make sense (of a sort) of this by assuming that these correspond to
    # Kittens  Omelettes  Puppies  Oranges  Tomatoes  Dutch People  Basketball  Football
    #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
    #topicCovar = np.array(\
    #    [[ 100,    5,   55,   20,    5,   15,    4,    0], \
    #     [   5,  100,    5,   10,   70,    5,    0,    0], \
    #     [  55,    5,  100,    5,    5,   10,    0,    5], \
    #     [  20,   10,    5,  100,   30,   30,   20,   10], \
    #     [   5,   70,    5,   30,  100,    0,    0,    0], \
    #     [  15,    5,   10,   30,    0,  100,   10,   40], \
    #     [   4,    0,    0,   20,    0,   10,  100,   20], \
    #     [   0,    0,    5,   10,    0,   40,   20,  100]], dtype=DTYPE) / 100.0

    topicMean = np.array([25, 15, 40, 5, 15])
    self.assertEqual(100, topicMean.sum())
    topicCovar = np.array(\
        [[ 100,    5,   55,   20,    5 ], \
         [   5,  100,    5,   10,   70 ], \
         [  55,    5,  100,    5,    5 ], \
         [  20,   10,    5,  100,   30 ], \
         [   5,   70,    5,   30,  100 ], \
        ], dtype=DTYPE) / 100.0

    meanWordCount = 80
    wordCounts = rd.poisson(meanWordCount, size=D)
    topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
    W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
    W = ssp.csr_matrix(W.astype(DTYPE))

    #
    # Train the model
    #
    model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=65, logFrequency=1)

    self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01)

    return self._doTest(W, model, queryState, trainPlan)
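
# The handcrafted test relies on normalizerows_ip() to turn the stacked bar patterns
# into row-stochastic topic-word distributions. The sketch below illustrates that kind
# of in-place row normalisation (an assumption about the helper's behaviour, not its
# actual implementation).
def _sketch_normalizerows_ip(matrix):
    import numpy as np
    row_sums = matrix.sum(axis=1)
    row_sums[row_sums == 0] = 1.0        # guard against all-zero rows
    matrix /= row_sums[:, np.newaxis]    # in-place: each row now sums to one
    return matrix
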
def _testOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    useDiagonalPriorCov = True
    rd.seed(0xBADB055)  # Global init for repeatable test

    D, T, K = 1000, 100, 7  # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely    = []
        trainWordCount = []
        queryLikely    = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model      = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
            model, queryState, (bndItrs, bndVals, likelies) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, likelies, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            fig.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculate the training-set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model.
            plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold.
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("          Perplexity:           %12.2f \t Perplexity:           %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0)  # Maximum perplexity.
            self.assertTrue(trainPerp < 60.0)
        print("\n\n")

    print("End of Test")
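
# The fold construction above uses modular arithmetic so the training window can wrap
# around the end of the corpus. The helper below is purely illustrative (names are
# assumptions). A tiny worked example: with D = 10, folds = 5, foldSize = 2 and
# trainSize = 8, fold 4 trains on documents 8, 9, 0, 1, ..., 5 and queries 6 and 7.
def _sketch_fold_indices(D, folds, fold):
    import numpy as np
    from math import ceil
    foldSize  = ceil(D / folds)
    trainSize = D - foldSize
    start     = fold * foldSize
    trainSet  = np.arange(start, start + trainSize) % D                       # wraps past D - 1
    querySet  = np.arange(start + trainSize, start + trainSize + foldSize) % D
    return trainSet, querySet
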
def outsample_lro_style_prec_rec(data, mdl, sample_model, train_plan, feature_mask,
                                 model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features are set.
    Remove the first element from the feature list, query all documents with that
    feature set, and then evaluate link prediction. Repeat until the feature list is
    empty.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param feature_mask: the list of features used to separate training from query.
    This is a list of tuples: the left side is the feature label, the right side is
    the feature index.
    :param model_dir: if not None, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    while len(feature_mask) > 0:
        # try:
        # Prepare the training and query data
        feature_mask_indices = [i for _, i in feature_mask]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        (feat_label, feat_id) = feature_mask.pop(0)
        print("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data)  # make symmetric, if necessary, after split, so we
                                                     # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)
        print("Training perplexity is %.2f " %
              perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links       = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at(expected_links, predicted_link_probs, at=ms, groups=[(0, 3), (3, 5), (5, 10), (10, 1000)])
        print(" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id,
                                        train_itrs, train_vbs, train_likes,
                                        train_tops, train_tops, mdl)
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
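
# mean_prec_rec_at() is defined elsewhere. The sketch below only illustrates the
# precision@m / recall@m idea it is assumed to implement (the real helper also groups
# documents by out-degree, which is omitted here): for one document, take the top-m
# predicted links and compare them with the held-out links.
def _sketch_prec_rec_at(true_row, score_row, m):
    import numpy as np
    true_row  = np.asarray(true_row).ravel()
    score_row = np.asarray(score_row).ravel()
    top_m     = np.argsort(-score_row)[:m]                       # the m best-scored candidate links
    true_pos  = np.count_nonzero(true_row[top_m] > 0)
    precision = true_pos / m
    recall    = true_pos / max(1, np.count_nonzero(true_row > 0))
    return precision, recall
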
def insample_lro_style_prec_rec(data, mdl, sample_model, train_plan, folds,
                                target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    For documents with more than 5 links, remove a portion of the links. The portion is
    determined by the number of folds (e.g. five folds implies removing one fifth of
    the links, three folds implies removing a third, etc.)

    Train on all documents and all remaining links. Predict the removed links.
    Evaluate using precision@m, recall@m, mean reciprocal-rank and mean
    average-precision. Average all results.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param folds: the number of folds for cross-validation
    :param target_folds: the number of folds to complete before finishing. Set to folds
    by default
    :param model_dir: if not None, and folds > 1, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    fold_count = 0
    for fold in range(folds):
        # try:
        # Prepare the training and query data
        train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds)
        train_data = prepareForTraining(train_data)  # make symmetric, if necessary, after split, so we
                                                     # can compare symmetric with non-symmetric models
        print("\n\nFold %d\n" % fold + ("-" * 80))

        # Train the model
        if model_uses_lda(sample_model):
            model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)
        print("Training perplexity is %.2f " %
              perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        min_link_probs       = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset)
        predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset)
        expected_links       = query_data.links[docSubset, :]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at(expected_links, predicted_link_probs, at=ms, groups=[(0, 3), (3, 5), (5, 10), (10, 1000)])
        print("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                        train_itrs, train_vbs, train_likes,
                                        train_tops, train_tops, mdl)

        fold_count += 1
        if fold_count == target_folds:
            break
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
def link_split_prec_rec(data, mdl, sample_model, train_plan, folds,
                        target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using precision at m and recall at m, for the values of m given by the
    `ms` list below.

    Cross-validation may be used, but note we're always evaluating on training data.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param folds: the number of folds for cross-validation
    :param target_folds: the number of folds to complete before finishing. Set to folds
    by default
    :param model_dir: if not None, and folds > 1, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if ldaModel is not None:
        (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold in range(target_folds):
        model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \
                if sample_model.name == LRO_MODEL_NAME \
                else mdl.newModelFromExisting(sample_model)

        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data)  # make symmetric, if necessary, after split, so we
                                                     # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)
        print("Training perplexity is %.2f " %
              perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)
        expected_links       = query_data.links

        precs, recs, doc_counts = mean_prec_rec_at(expected_links, predicted_link_probs, at=ms, groups=[(0, 3), (3, 5), (5, 10), (10, 1000)])
        print("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        map = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                        train_itrs, train_vbs, train_likes,
                                        train_tops, train_tops, mdl)

    print("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
def cross_val_and_eval_hashtag_prec_at_m(data, mdl, sample_model, train_plan, word_dict,
                                         num_folds, fold_run_count=-1, model_dir=None):
    '''
    Evaluate precision at M for the top 50 hashtags. In the held-out set, the hashtags
    are deleted. We train on everything, both training and held-out documents, then
    evaluate precision at M for the hashtags.

    For the values of M used, see the `MS` list below.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param word_dict: the word dictionary, used to identify hashtags and print them out
    when the run is completed.
    :param num_folds: the number of folds for cross-validation
    :param fold_run_count: for debugging, stop early after processing this number of folds
    :param model_dir: if not None, the models are stored in this directory.
    :return: the list of model files stored
    '''
    MS = [10, 50, 100, 150, 200, 250, 1000, 1500, 3000, 5000, 10000]
    Precision, Recall = "precision", "recall"
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds
    if num_folds <= 1:
        raise ValueError("Number of folds must be greater than 1")

    hashtag_indices = popular_hashtag_indices(data, word_dict, 50)

    folds_finished = 0  # count of folds that finished successfully
    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_range, query_range = data.cross_valid_split_indices(fold, num_folds)

            segment_with_htags          = data.words[train_range, :]
            held_out_segment_with_htags = data.words[query_range, :]

            held_out_segment_without_htags = data.words[query_range, :]
            held_out_segment_without_htags[:, hashtag_indices] = 0

            train_words = ssp.vstack((segment_with_htags, held_out_segment_without_htags))
            train_data  = data.copy_with_changes(words=train_words)

            # Train the model
            print("Duplicating model template... ", end="")
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

            print("Starting training")
            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            # Predict hashtags
            dist = rowwise_softmax(train_tops.means)

            # For each hashtag, for each value of M, evaluate the precision
            results = {Recall: dict(), Precision: dict()}
            for hi in hashtag_indices:
                h_probs = dist[query_range, :].dot(model.vocab[:, hi])
                h_count = held_out_segment_with_htags[:, hi].sum()

                results[Recall][word_dict[hi]]    = {-1: h_count}
                results[Precision][word_dict[hi]] = {-1: h_count}
                for m in MS:
                    top_m = h_probs.argsort()[-m:][::-1]

                    true_pos  = held_out_segment_with_htags[top_m, hi].sum()
                    rec_denom = min(m, h_count)
                    results[Precision][word_dict[hi]][m] = true_pos / m
                    results[Recall][word_dict[hi]][m]    = true_pos / rec_denom

            print("%10s\t%20s\t%6s\t" % ("Metric", "Hashtag", "Count") + "\t".join("%5d" % m for m in MS))
            for htag, prec_results in results[Precision].items():
                print("%10s\t%20s\t%6d\t%s" % ("Precision", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))
            for htag, prec_results in results[Recall].items():
                print("%10s\t%20s\t%6d\t%s" % ("Recall", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                            train_itrs, train_vbs, train_likes,
                                            train_tops, None, mdl)
            folds_finished += 1  # count this fold as done so fold_run_count can stop the loop early
        except Exception as e:
            traceback.print_exc()
            print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    return model_files
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan,
                                  num_folds, fold_run_count=-1, model_dir=None):
    '''
    Uses cross-validation to get the average perplexity. If folds == 1 a special path
    is triggered where perplexity is evaluated on the training data, and the results
    are not saved to disk, even if model_dir is not None.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param query_plan: the query plan (number of iterations etc.)
    :param num_folds: the number of folds for cross-validation
    :param fold_run_count: for debugging, stop early after processing this number of folds
    :param model_dir: if not None, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        query = mdl.newQueryState(data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = mdl.train(data, model, query, train_plan)
        likely = mdl.log_likelihood(data, model, train_tops)
        perp   = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        model_files = save_if_necessary(model_files, model_dir, model, data, 0,
                                        train_itrs, train_vbs, train_likes,
                                        train_tops, train_tops, mdl)
        return model_files

    query_like_sum   = 0  # to calculate the overall likelihood and
    query_wcount_sum = 0  # perplexity for the whole dataset
    train_like_sum   = 0
    train_wcount_sum = 0
    folds_finished   = 0  # count of folds that finished successfully

    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_data, query_data = data.cross_valid_split(fold, num_folds)

            # Train the model
            print("Duplicating model template... ", end="")
            model = mdl.newModelFromExisting(sample_model)
            print("Done.\nCreating query state...")
            train_tops = mdl.newQueryState(train_data, model)

            print("Starting training")
            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            train_like       = mdl.log_likelihood(train_data, model, train_tops)
            train_word_count = train_data.word_count
            train_perp       = perplexity_from_like(train_like, train_word_count)

            print("DEBUG Train perplexity is " + str(train_perp))

            # Query the model - if there are no features we need to split the text
            print("Starting query.")
            query_estim, query_eval = query_data.doc_completion_split()
            query_tops = mdl.newQueryState(query_estim, model)
            model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

            query_like       = mdl.log_likelihood(query_eval, model, query_tops)
            query_word_count = query_eval.word_count
            query_perp       = perplexity_from_like(query_like, query_word_count)

            # Keep a record of the cumulative likelihood and query-set word-count
            train_like_sum   += train_like
            train_wcount_sum += train_word_count
            query_like_sum   += query_like
            query_wcount_sum += query_word_count
            folds_finished   += 1

            # Write out the output
            print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
            print("")

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                            train_itrs, train_vbs, train_likes,
                                            train_tops, query_tops, mdl)
        # except Exception as e:
        #     traceback.print_exc()
        #     print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    print("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, perplexity_from_like(train_like_sum, train_wcount_sum)))
    print("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, perplexity_from_like(query_like_sum, query_wcount_sum)))

    return model_files