def insample_lro_style_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None): ''' For documents with > 5 links remove a portion. The portion is determined by the number of folds (e.g. five-fold implied remove one fifth of links, three fold implies remove a third, etc.) Train on all documents and all remaining links. Predict remaining links. Evaluate using precision@m, recall@m, mean reciprocal-rank and mean average-precision Average all results. :param data: the DataSet object with the data :param mdl: the module with the train etc. functin :param sample_model: a preconfigured model which is cloned at the start of each cross-validation run :param train_plan: the training plan (number of iterations etc.) :param folds: the number of folds to cross validation :param target_folds: the number of folds to complete before finishing. Set to folds by default :param model_dir: if not none, and folds > 1, the models are stored in this directory. :param ldaModel: for those models that utilise and LDA component, a pre-trained LDA model can be supplied. :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel :return: the list of model files stored ''' ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500] model_files = [] assert folds > 1, "Need at least two folds for this to make any sense whatsoever" def prepareForTraining(data): if mdl.is_undirected_link_predictor(): result = data.copy() result.convert_to_undirected_graph() result.convert_to_binary_link_matrix() return result else: return data if target_folds is None: target_folds = folds combi_precs, combi_recs, combi_dcounts = None, None, None mrr_sum, mrr_doc_count = 0, 0 map_sum, map_doc_count = 0, 0 fold_count = 0 for fold in range(folds): # try: # Prepare the training and query data train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds) train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we # can compare symmetric with non-symmetric models print ("\n\nFold %d\n" % fold + ("-" * 80)) # Train the model if model_uses_lda(sample_model): model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics) else: model = mdl.newModelFromExisting(sample_model) train_tops = mdl.newQueryState(train_data, model) model, train_tops, (train_itrs, train_vbs, train_likes) = \ mdl.train(train_data, model, train_tops, train_plan) print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count)) # Infer the expected link probabilities min_link_probs = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset) predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset) expected_links = query_data.links[docSubset, :] # Evaluation 1/3: Precision and Recall at M precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)]) print ("Fold %2d: Mean-Precisions at \n" % fold, end="") printTable("Precision", precs, doc_counts, ms) printTable("Recall", recs, doc_counts, ms) combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts) combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts) # Evaluation 2/3: Mean Reciprocal-Rank mrr = mean_reciprocal_rank(expected_links, predicted_link_probs) print ("Mean reciprocal-rank : %f" % mrr) mrr_sum += mrr * expected_links.shape[0] mrr_doc_count += expected_links.shape[0] # Evaluation 3/3: Mean Average-Precision map = mean_average_prec (expected_links, predicted_link_probs) print ("Mean Average Precision : %f" % map) map_sum += map * expected_links.shape[0] map_doc_count += expected_links.shape[0] # Save the files if necessary and move onto the next fold if required model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl) fold_count += 1 if fold_count == target_folds: break # except Exception as e: # print("Fold " + str(fold) + " failed: " + str(e)) print ("-" * 80 + "\n\n Final Results\n\n") printTable("Precision", combi_precs, combi_dcounts, ms) printTable("Recall", combi_recs, combi_dcounts, ms) print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count)) print("Mean average-precision: %f" % (map_sum / map_doc_count)) return model_files
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None): ''' Take a feature list. Train on all documents where none of those features are set. Remove the first element from the feature list, query all documents with that feature set, and then evaluate link prediction. Repeat until feature-list is empty. :param data: the DataSet object with the data :param mdl: the module with the train etc. functin :param sample_model: a preconfigured model which is cloned at the start of each cross-validation run :param train_plan: the training plan (number of iterations etc.) :param feature_mask: the list of features used to separate training from query This is a list of tuples, the left side is the feature label, the right side is the :param model_dir: if not none, the models are stored in this directory. :param ldaModel: for those models that utilise and LDA component, a pre-trained LDA model can be supplied. :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel :return: the list of model files stored ''' def prepareForTraining(data): if mdl.is_undirected_link_predictor(): result = data.copy() result.convert_to_undirected_graph() result.convert_to_binary_link_matrix() return result else: return data ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500] model_files = [] combi_precs, combi_recs, combi_dcounts = None, None, None mrr_sum, mrr_doc_count = 0, 0 map_sum, map_doc_count = 0, 0 while len(feature_mask) > 0: # try: # Prepare the training and query data feature_mask_indices = [i for _,i in feature_mask] train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices) (feat_label, feat_id) = feature_mask.pop(0) print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80)) train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we # can compare symmetric with non-symmetric models # Train the model if model_uses_lda(sample_model): ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices) model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset) train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset) else: model = mdl.newModelFromExisting(sample_model) train_tops = mdl.newQueryState(train_data, model) model, train_tops, (train_itrs, train_vbs, train_likes) = \ mdl.train(train_data, model, train_tops, train_plan) print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count)) # Infer the expected link probabilities query_tops = mdl.newQueryState(query_data, model) _, query_tops = mdl.query(query_data, model, query_tops, train_plan) min_link_probs = mdl.min_link_probs(model, train_tops, query_tops, query_data.links) predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs) expected_links = query_data.links # Evaluation 1/3: Precision and Recall at M precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)]) print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="") printTable("Precision", precs, doc_counts, ms) printTable("Recall", recs, doc_counts, ms) combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts) combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts) # Evaluation 2/3: Mean Reciprocal-Rank mrr = mean_reciprocal_rank(expected_links, predicted_link_probs) print ("Mean reciprocal-rank : %f" % mrr) mrr_sum += mrr * expected_links.shape[0] mrr_doc_count += expected_links.shape[0] # Evaluation 3/3: Mean Average-Precision map = mean_average_prec (expected_links, predicted_link_probs) print ("Mean Average Precision : %f" % map) map_sum += map * expected_links.shape[0] map_doc_count += expected_links.shape[0] # Save the files if necessary and move onto the next fold if required model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl) # except Exception as e: # print("Fold " + str(fold) + " failed: " + str(e)) print ("-" * 80 + "\n\n Final Results\n\n") printTable("Precision", combi_precs, combi_dcounts, ms) printTable("Recall", combi_recs, combi_dcounts, ms) print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count)) print("Mean average-precision: %f" % (map_sum / map_doc_count)) return model_files
def link_split_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None): ''' Train on all the words and half the links. Predict the remaining links. Evaluate using precision at m using as values of m 50, 100, 250, and 500, and additionally recall at m Cross validation may be used, but note we're always evaluating on training data. :param data: the DataSet object with the data :param mdl: the module with the train etc. functin :param sample_model: a preconfigured model which is cloned at the start of each cross-validation run :param train_plan: the training plan (number of iterations etc.) :param folds: the number of folds to cross validation :param target_folds: the number of folds to complete before finishing. Set to folds by default :param model_dir: if not none, and folds > 1, the models are stored in this directory. :param ldaModel: for those models that utilise and LDA component, a pre-trained LDA model can be supplied. :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel :return: the list of model files stored ''' ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500] model_files = [] assert folds > 1, "Need at least two folds for this to make any sense whatsoever" def prepareForTraining(data): if mdl.is_undirected_link_predictor(): result = data.copy() result.convert_to_undirected_graph() result.convert_to_binary_link_matrix() return result else: return data if ldaModel is not None: (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel if target_folds is None: target_folds = folds combi_precs, combi_recs, combi_dcounts = None, None, None mrr_sum, mrr_doc_count = 0, 0 map_sum, map_doc_count = 0, 0 for fold in range(target_folds): model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \ if sample_model.name == LRO_MODEL_NAME \ else mdl.newModelFromExisting(sample_model) train_data, query_data = data.link_prediction_split(symmetric=False) train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we # can compare symmetric with non-symmetric models train_tops = mdl.newQueryState(train_data, model) model, train_tops, (train_itrs, train_vbs, train_likes) = \ mdl.train(train_data, model, train_tops, train_plan) print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count)) min_link_probs = mdl.min_link_probs(model, train_tops, query_data.links) predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs) expected_links = query_data.links precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)]) print ("Fold %2d: Mean-Precisions at \n" % fold, end="") printTable("Precision", precs, doc_counts, ms) printTable("Recall", recs, doc_counts, ms) mrr = mean_reciprocal_rank(expected_links, predicted_link_probs) print ("Mean reciprocal-rank : %f" % mrr) mrr_sum += mrr * expected_links.shape[0] mrr_doc_count += expected_links.shape[0] map = mean_average_prec (expected_links, predicted_link_probs) print ("Mean Average Precision : %f" % map) map_sum += map * expected_links.shape[0] map_doc_count += expected_links.shape[0] combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts) combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts) model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl) print ("-" * 80 + "\n\n Final Results\n\n") printTable("Precision", combi_precs, combi_dcounts, ms) printTable("Recall", combi_recs, combi_dcounts, ms) print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count)) return model_files