import numpy as np
import pytrec_eval


def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on; any valid measure
            accepted by the trec_eval tool can be used

    Returns:
        metric_values -- dictionary of metric values (out of 100),
            rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
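# A minimal usage sketch for qrel_metrics (not part of the original sources);
# the file paths below are hypothetical stand-ins for any TREC-format
# qrel/run pair.
if __name__ == '__main__':
    scores = qrel_metrics('data/test.qrels', 'runs/bm25.run',
                          metrics=('ndcg', 'map', 'recip_rank'))
    print(scores)  # {'map': ..., 'ndcg': ..., 'recip_rank': ...}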
import pytrec_eval


def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # Use the measures computed for an arbitrary query to discover all
    # measure names, then aggregate each measure over every query.
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        raise ValueError('Depth of NDCG not available.')
    return mes[metric]
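# Usage sketch for cal_ndcg (hypothetical paths). Because the evaluator is
# built with pytrec_eval.supported_measures, 'ndcg_cut' is computed at the
# standard trec_eval cutoffs (5, 10, 15, 20, 30, 100, 200, 500, 1000), so k
# must be one of those values.
ndcg_at_10 = cal_ndcg('data/test.qrels', 'runs/bm25.trec', k=10)
print('nDCG@10 = %.4f' % ndcg_at_10)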
import json

import pytrec_eval


def evaluate(eval_path, qrel_path, res_path):
    measures = {"map", "ndcg_cut", "recall", "P"}
    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    all_metrics = evaluator.evaluate(run)

    metrics = {'P_5': 0, 'P_10': 0, 'P_20': 0,
               'ndcg_cut_5': 0, 'ndcg_cut_10': 0, 'ndcg_cut_20': 0,
               'ndcg_cut_100': 0, 'map': 0, 'recall_100': 0}

    # Average each selected metric over all evaluated queries.
    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10',
               split: dict = None, split_idx: int = -1) -> Dict[str, float]:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    # Partial evaluation: keep only the queries belonging to the requested split.
    if split is not None and split_idx >= 0:
        for qid in list(run):
            if qid not in split[split_idx]:
                _ = run.pop(qid)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # Use an arbitrary query's measures to discover all measure names,
    # then aggregate each measure over every query.
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
    return mes[metric]
import argparse
import os
from statistics import mean

import pytrec_eval


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')
    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        # scope is the query id (topic id)
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    avg_DCG = []
    for query_id, query_measures in results.items():
        for measure, value in sorted(query_measures.items()):
            avg_DCG.append(value)
            print_line(measure, query_id, value)

    print(avg_DCG)
    print(mean(avg_DCG))
    print('avg of nDCG {:f}'.format(mean(avg_DCG)))
def setup_evaluator_from_relevance_file(qrel_path,
                                        measures={"map", "ndcg_cut", "recall", "P"}):
    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    return pytrec_eval.RelevanceEvaluator(qrel, measures)
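# A short usage sketch for setup_evaluator_from_relevance_file (paths are
# hypothetical): build the evaluator once from the qrel file, then reuse it
# to score several runs.
import pytrec_eval

evaluator = setup_evaluator_from_relevance_file('data/dev.qrels')
for run_path in ['runs/bm25.run', 'runs/rerank.run']:
    with open(run_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    per_query = evaluator.evaluate(run)  # {qid: {measure: value}}
    mean_map = sum(m['map'] for m in per_query.values()) / len(per_query)
    print(run_path, 'MAP = %.4f' % mean_map)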
def pytrec_evaluation(runfile, qrelfile, measures=pytrec_eval.supported_measures):
    """Run trec_eval with "measures" from the Python interface."""
    with open(runfile, "r") as ranking:
        run = pytrec_eval.parse_run(ranking)
    with open(qrelfile, "r") as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
    return evaluator.evaluate(run)
def eval_trec_file(run_file, ref_file):
    with open(run_file) as f:
        run = pytrec_eval.parse_run(f)
    with open(ref_file) as f:
        qrel = pytrec_eval.parse_qrel(f)
    results = eval_trec(run, qrel)

    # Average every measure over all evaluated queries.
    avg = dict()
    for q in results:
        for k in results[q]:
            if k in avg:
                avg[k] += results[q][k]
            else:
                avg[k] = results[q][k]
    for k in avg:
        avg[k] = avg[k] / len(results)
    return avg
import argparse
import os

import scipy.stats

import pytrec_eval


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run', nargs=2)
    # A bit too strict, as it does not allow for parametrized measures,
    # but sufficient for the example.
    parser.add_argument(
        '--measure',
        # choices=pytrec_eval.supported_measures,
        required=True)

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert all(map(os.path.exists, args.run))

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run[0], 'r') as f_run:
        first_run = pytrec_eval.parse_run(f_run)
    with open(args.run[1], 'r') as f_run:
        second_run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    first_results = evaluator.evaluate(first_run)
    print(first_results.keys())
    second_results = evaluator.evaluate(second_run)

    # Compare the two runs on the queries they have in common.
    query_ids = list(set(first_results.keys()) & set(second_results.keys()))
    first_scores = [first_results[query_id][args.measure] for query_id in query_ids]
    second_scores = [second_results[query_id][args.measure] for query_id in query_ids]

    print(scipy.stats.ttest_rel(first_scores, second_scores))
import argparse
import os

import pytrec_eval


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')
    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure, 'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure] for query_measures in results.values()]))
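# For reference, parse_qrel and parse_run in the example above expect the
# standard whitespace-separated TREC formats:
#
#   qrel file:  query_id  iteration  doc_id  relevance
#       Q1  0  D17  2
#       Q1  0  D42  0
#
#   run file:   query_id  Q0  doc_id  rank  score  run_tag
#       Q1  Q0  D42  1  12.7  bm25
#       Q1  Q0  D17  2  11.3  bm25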
def get_metric(self, qrels: str, trec: str,
               metric: str = 'ndcg_cut_10') -> Dict[str, float]:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # Use an arbitrary query's measures to discover all measure names,
    # then aggregate each measure over every query.
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
    return mes[metric]
def read_collection(collection_path, k=5):
    """Read a TREC collection: read its queries, create the folds for k-fold
    cross validation, read the collection qrels, save the qrels and queries
    of each fold, then read the XML documents and save them in CSV
    format."""  # HR
    queries = read_queries(collection_path + '/queries')
    folds = build_folds(list(queries.keys()), k=k)
    with open(collection_path + '/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    for i, fold in enumerate(folds):
        if not os.path.exists(collection_path + '/fold' + str(i)):
            os.makedirs(collection_path + '/fold' + str(i))
        save_qrel(collection_path + '/fold' + str(i) + '/qrels', qrel, fold)
    save_queries_csv(collection_path, queries, folds)
    documents = read_documents(collection_path + '/documents.xml')
    save_documents_csv(collection_path, documents)
def __test(self):
    with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as f_trec_eval:
        trec_eval_output = parse_trec_eval(f_trec_eval)

    measures = set(
        measure if measure in pytrec_eval.supported_measures
        else prefix_match(measure, pytrec_eval.supported_measures)
        for measure in trec_eval_output['all'].keys())

    with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)
    results = evaluator.evaluate(run)

    expected_measures = trec_eval_output['all']
    for measure in expected_measures:
        agg_measure_value = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measure_values[measure]
             for query_measure_values in results.values()])
        ground_truth_agg_measure_value = trec_eval_output['all'][measure]
        self.assertAlmostEqual(agg_measure_value,
                               ground_truth_agg_measure_value,
                               places=3,
                               msg=measure)
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})  # evaluate on Precision
    else:
        print("please provide qrels filename")
        return False

    """
    LEXICAL PREPROCESSING
    """
    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)

    """
    SEMANTIC PREPROCESSING
    """
    # load required data
    print('load processed data required to perform re-ranking over lexical model w/ semantic model')
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))

    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    """
    COMPUTE RE-RANKING
    """
    # set random seed
    np.random.seed(FLAGS.seed)

    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids
    qids = list(q.keys())
    # shuffle query ids
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                        str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid, [(score, docno)
                                   for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                             '_gamma_' + str(FLAGS.fixed_gamma) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                          qrels_folder, FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print("learn optimal weight to combine runs with sweep: {}".format(FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []
        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                           doc_embs, word_dict, word_embs,
                                           FLAGS.sweep,
                                           SCORE_NORMALIZERS[FLAGS.normalizer],
                                           FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s=%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append((np.mean([train_score, test_score]),
                                       best_train_weight))

        # get (best) weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                        str(best_weight))
        # compute combined run based on best weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid, [(score, doc_id)
                                   for doc_id, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                             '_best_weight_' + str(best_weight) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ {}-fold cross validation and best weight={}'
              .format(FLAGS.num_folds, best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' + str(best_weight),
                          qrels_folder, FLAGS.qrels_fname)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', nargs="?", type=str)
    args = parser.parse_args()
    config = json.load(open(args.config, 'r'))

    IR_models = [mz.models.list_available()[i] for i in config["index_mz_models"]]

    with open(config["collection_path"] + '/test/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(config["measures"]))

    bm25_res = json.load(open(config["collection_path"] + '/test/' +
                              'BM25.metrics.json', 'r'))
    with open(config["collection_path"] + '/test/' + 'BM25.res', 'r') as f_run:
        bm25_run = pytrec_eval.parse_run(f_run)
    bm25_results = evaluator.evaluate(bm25_run)

    # print the BM25 row of the LaTeX results table
    row = ""
    for key, value in bm25_res.items():
        if key in config["print_measures"]:
            row += str(value)[:6] + " & "
    print('BM25 & ' + row[:-2] + '\\\\')

    all_res = dict()
    for model_class in IR_models:
        validation_path = config["collection_path"] + '/validation/' + model_class.__name__
        test_path = config["collection_path"] + '/test/' + model_class.__name__
        if os.path.exists(validation_path) and os.path.exists(test_path):
            # select the model that performed best on the validation set
            best_model = ""
            best_metric = 0
            for file in os.listdir(validation_path):
                if '.json' in file:
                    val_res = json.load(open(validation_path + '/' + file, 'r'))
                    if val_res[config["optim_measure"]] > best_metric:
                        best_model = file
                        best_metric = val_res[config["optim_measure"]]
            if best_model != "" and os.path.exists(test_path + '/' + best_model):
                test_res = json.load(open(test_path + '/' + best_model, 'r'))
                all_res[model_class.__name__] = [best_model, test_res]
                with open(config["collection_path"] + '/test/' +
                          model_class.__name__ + '/' + best_model[:-12] + 'res',
                          'r') as f_run:
                    run = pytrec_eval.parse_run(f_run)
                results = evaluator.evaluate(run)
                query_ids = list(set(bm25_results.keys()) & set(results.keys()))

                # paired t-test against BM25 per printed measure, with
                # Bonferroni-corrected significance markers
                row = ""
                for key, value in test_res.items():
                    if key in config["print_measures"]:
                        bm25_scores = [bm25_results[query_id][key]
                                       for query_id in query_ids]
                        scores = [results[query_id][key] for query_id in query_ids]
                        test = scipy.stats.ttest_rel(bm25_scores, scores)
                        row += str(value)[:6]
                        if test[0] < 0:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{++}}"
                            elif test[1] < 0.05 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{+}}"
                        else:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{-\\,-}}"
                            elif test[1] < 0.05 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{-}}"
                        row += " & "
                print(model_class.__name__ + ' & ' + row[:-2] + '\\\\')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    args = parser.parse_args()
    print(args, flush=True)

    if not os.path.exists(args.results_path + '/validation/' + args.experiment_name):
        os.makedirs(args.results_path + '/validation/' + args.experiment_name)
    if not os.path.exists(args.results_path + '/test/' + args.experiment_name):
        os.makedirs(args.results_path + '/test/' + args.experiment_name)
    if not os.path.exists(args.plot_path + '/validation/'):
        os.makedirs(args.plot_path + '/validation/')
    if not os.path.exists(args.plot_path + '/test/'):
        os.makedirs(args.plot_path + '/test/')

    # Initializing the results plot values for the different models #HR
    validation_plot_values = dict()
    test_plot_values = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        validation_plot_values[model_name] = [[], []]
        test_plot_values[model_name] = [[], []]

    # Loading indexed collection #HR
    Collection = wikIR_Collection.Collection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)
    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    # Loading validation and test query relevance values #HR
    with open(args.coll_path + 'validation/qrels', 'r') as f_qrel:
        validation_qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.coll_path + 'test/qrels', 'r') as f_qrel:
        test_qrel = pytrec_eval.parse_qrel(f_qrel)

    print('------------------------------start--------------------------', flush=True)

    # Evaluating the baseline models without TDV weights and saving the
    # results of the validation and test partitions #HR
    utils.eval_baseline_index_wikir(args.coll_path, Collection, validation_qrel,
                                    test_qrel, validation_plot_values,
                                    test_plot_values, args.results_path,
                                    args.experiment_name, 0)

    # Printing ndcg5 values for validation and test partitions #HR
    ndcg5_val = dict()
    ndcg5_test = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        ndcg5_val[model_name] = validation_plot_values[model_name][1][0]['ndcg_cut_5']
        print("ndcg5 validation ", model_name, " of collection ",
              os.path.basename(args.coll_path), " ", ndcg5_val[model_name],
              flush=True)
        ndcg5_test[model_name] = test_plot_values[model_name][1][0]['ndcg_cut_5']
        print("ndcg5 test ", model_name, " of collection ",
              os.path.basename(args.coll_path), " ", ndcg5_test[model_name],
              flush=True)
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    # if not os.path.exists(model_folder):
    #     os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)

    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print('load processed data required to retrofit word vectors and perform retrieval tasks')
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre-process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict, data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)

    """
    SEMANTIC PROCESSING
    """
    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))

    """
    RETROFITTING
    """
    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(FLAGS.iterations))
        word_embs = retrofit(word_embs, syns, reverse_word_dict, FLAGS.iterations,
                             alpha=1.0, beta=FLAGS.beta, cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    if not FLAGS.reranking:
        """
        RETRIEVAL
        """
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict, word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                                   rankings_folder, FLAGS.model_name,
                                   qrels_folder, FLAGS.qrels_fname)
    else:
        """
        RE-RANKING
        """
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})  # evaluate on Precision

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(qid, [(score, docno)
                                       for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                                 '_gamma_' + str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                              qrels_folder, FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                               doc_embs, word_dict, word_embs,
                                               FLAGS.sweep,
                                               SCORE_NORMALIZERS[FLAGS.normalizer],
                                               FLAGS.ref_measure, evaluator))
                print('fold %d: best_train_weight=%.2f, %s=%.4f' %
                      (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure,
                    [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
                # store averaged scores w/ best weights
                scores_and_weights.append((np.mean([train_score, test_score]),
                                           best_train_weight))

            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                            str(best_weight))
            # compute combined run based on best weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(qid, [(score, doc_id)
                                       for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                                 '_best_weight_' + str(best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ {}-fold cross validation and best weight={}'
                  .format(FLAGS.num_folds, best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' + str(best_weight),
                              qrels_folder, FLAGS.qrels_fname)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--index',
                        type=pyndri.utils.existing_directory_path,
                        required=True)
    parser.add_argument('--limit_queries_for_debug',
                        type=pyndri.utils.positive_int,
                        default=None)
    parser.add_argument('--test_set_size', type=float, default=None)
    parser.add_argument('--num_epochs',
                        type=pyndri.utils.positive_int,
                        default=500)
    parser.add_argument('--queries',
                        type=pyndri.utils.existing_file_path,
                        required=True)
    parser.add_argument('--query_relevance',
                        type=pyndri.utils.existing_file_path,
                        required=True)
    parser.add_argument('--trace_output',
                        type=pyndri.utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()
    args.index = pyndri.Index(args.index)

    try:
        pyndri.utils.configure_logging(args)
    except IOError:
        return -1

    qrel = {}
    env = RetrievalEnv(args.index, max_num_expanded_query_terms=5)

    with open(args.queries, 'r') as f_queries:
        queries = list(pyndri.utils.read_queries(f_queries).items())
    with open(args.query_relevance, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    queries_idx = np.array(list(range(len(queries))))
    np.random.shuffle(queries_idx)

    if args.limit_queries_for_debug:
        queries_idx = queries_idx[:args.limit_queries_for_debug]

    if args.test_set_size and args.test_set_size > 0:
        train_queries_idx, test_queries_idx = \
            sklearn.model_selection.train_test_split(
                queries_idx, test_size=args.test_set_size)
        logging.info('Split query set into train=%s and test=%s.',
                     train_queries_idx.size, test_queries_idx.size)

        def evaluate(agent):
            episode_count = 1
            max_steps = 10
            logging.info('Evaluating %s using %d queries.',
                         agent, len(test_queries_idx))
            ndcgs = []
            for idx, query_idx in enumerate(test_queries_idx):
                reward = 0
                done = False
                for i in range(episode_count):
                    query_id, query_str = queries[query_idx]
                    ob = env._reset(query_str, qrel[query_id])
                    for _ in range(max_steps):
                        action = agent.act(ob, reward, done, deterministic=True)
                        if action is not None:
                            ob, reward, done, _ = env.step(action)
                        if done:
                            break
                    logging.debug('Query %s: %.4f -> %.4f',
                                  query_id, env.original_utility,
                                  env.state['utility'])
                    ndcgs.append(env.state['utility'])
                if idx > 0 and (idx + 1) % 10 == 0:
                    logging.info('Finished %d out of %d queries.',
                                 idx + 1, len(test_queries_idx))
            return ndcgs
    else:
        train_queries_idx = queries_idx

        def evaluate(agent):
            return np.nan,

    if args.trace_output:
        f_trace_out = open(args.trace_output, 'w')
    else:
        f_trace_out = None

    agents = [
        NullAgent(),
        RandomAgent(env.action_space),
        TabularQAgent(env.observation_space, env.action_space)
    ]

    ndcg_per_agent = {}
    for agent in agents:
        if agent.can_learn():
            logging.info('Training %s using %d queries.',
                         agent, len(train_queries_idx))
            avg_rewards = []
            test_set_ndcgs = []
            start_time = time.time()
            for epoch_idx in range(args.num_epochs):
                logging.info('Epoch %d.', epoch_idx + 1)
                np.random.shuffle(train_queries_idx)
                avg_reward = 0.0
                for idx, query_idx in enumerate(train_queries_idx):
                    query_id, query_str = queries[query_idx]
                    relevance = qrel[query_id]
                    logging.debug('Learning from %s.', query_id)
                    total_reward = agent.learn(env, query_str, relevance)
                    if total_reward is not None:
                        avg_reward += total_reward
                    if idx > 0 and (idx + 1) % 500 == 0:
                        logging.info('Finished %d out of %d queries.',
                                     idx + 1, len(train_queries_idx))
                avg_reward /= len(train_queries_idx)
                avg_rewards.append(avg_reward)

                epoch_finish_time = time.time()
                epoch_data = {
                    'agent': agent.name,
                    'epoch_idx': epoch_idx,
                    'train_avg_reward': avg_reward,
                    'seconds_since_start': epoch_finish_time - start_time,
                }
                logging.info('Average rewards: %s', avg_rewards)

                if (epoch_idx + 1) % 10 == 0:
                    test_set_ndcg = np.mean(evaluate(agent))
                    test_set_ndcgs.append(test_set_ndcg)
                    logging.info('Test set NDCGs: %s', test_set_ndcgs)
                    epoch_data['test_set_ndcg'] = test_set_ndcg

                if f_trace_out:
                    f_trace_out.write(json.dumps(epoch_data))
                    f_trace_out.write('\n')
                    f_trace_out.flush()

        ndcgs = evaluate(agent)
        logging.info('NDCG: %.4f', np.mean(ndcgs))
        ndcg_per_agent[agent] = np.mean(ndcgs)

    logging.info('%s', ndcg_per_agent)

    if f_trace_out:
        f_trace_out.close()
def main():
    # parsing arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-f', '--folds', nargs="?", type=int, default=5)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    args = parser.parse_args()
    print(args, flush=True)

    # Loading indexed collection
    Collection = TrecCollection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)
    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    # Loading relevance judgements from collection
    with open(args.coll_path + 'qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    # Remove the leading 'titl' token from indexed queries when present
    id_titl = Collection.vocabulary['titl']
    for i in range(len(Collection.all_indexed_queries)):
        if Collection.all_indexed_queries[i][0] == id_titl and len(
                Collection.all_indexed_queries[i]) > 1:
            del Collection.all_indexed_queries[i][0]
    for i in range(len(Collection.indexed_queries)):
        for j in range(len(Collection.indexed_queries[i])):
            if Collection.indexed_queries[i][j][0] == id_titl and len(
                    Collection.indexed_queries[i][j]) > 1:
                del Collection.indexed_queries[i][j][0]

    print('---------------------start-------------------', flush=True)

    # Getting collection vocabulary size and total number of elements in collection
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)

    # Creating, for each fold of a given experiment, a directory for results
    # and plot data
    plot_values_folds_list = []
    for fold in range(args.folds):
        plot_values = dict()
        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            plot_values[model_name] = [[], []]
        if not os.path.exists(args.results_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.results_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)
        if not os.path.exists(args.plot_path + '/fold' + str(fold) + '/'):
            os.makedirs(args.plot_path + '/fold' + str(fold) + '/')

        # Computing metrics for baseline models for a certain fold and
        # updating the plot_values dictionary
        utils.eval_baseline_index_trec(args.coll_path, Collection, fold, qrel,
                                       plot_values, args.results_path,
                                       args.experiment_name, 0)
        # appending plot values to the list
        plot_values_folds_list.append(plot_values)

    # Evaluating baseline models without training.
    ndcg5 = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        ndcg5[model_name] = []
    for fold in range(args.folds):
        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            ndcg5[model_name].append(
                plot_values_folds_list[fold][model_name][1][0]['ndcg_cut_5'])
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        average = sum(ndcg5[model_name]) / args.folds
        maximum = max(ndcg5[model_name])
        print("ndcg5 ", model_name, " average of folds", average,
              " of collection ", os.path.basename(args.coll_path), flush=True)
        print("ndcg5 ", model_name, " max of folds", maximum,
              " of collection ", os.path.basename(args.coll_path), flush=True)

    print("-----------------Finished-------------------", flush=True)
import platform

import pytrec_eval


def read_qrels(qrels):
    # pytrec_eval's native extension is typically unavailable on Windows,
    # so fall back to an empty dict there
    if platform.system().lower().startswith("win"):
        return dict()
    with open(qrels, "r") as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    return qrel
def get_qrels_as_dict(qrel_file):
    assert os.path.exists(qrel_file)
    with open(qrel_file, 'r') as f_qrel:
        qrels = pytrec_eval.parse_qrel(f_qrel)
    return qrels
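# A small sketch (hypothetical path) of the nested dict returned by
# pytrec_eval.parse_qrel: query_id -> {doc_id -> integer relevance grade}.
qrels = get_qrels_as_dict('data/test.qrels')
for qid, judged in qrels.items():
    n_rel = sum(1 for rel in judged.values() if rel > 0)
    print(qid, 'has', n_rel, 'relevant documents')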
def strips(dataset):
    for id_, (query, text) in dataset.items():
        query = query.split(" ")
        text = text.split(" ")
        yield id_, query, text


# Read dataset (stored as a Python dict literal); ast.literal_eval is a
# safer equivalent of eval for this kind of file.
with open(dataset_path, "r") as dataset_file:
    dataset = ast.literal_eval(dataset_file.read())

# Transform dataset and get relevant words
dataset_class = list(starmap(QueryText, strips(dataset)))

with open(qrels_path, "r") as f:
    qrels = parse_qrel(f)

# Prepare embeddings
model = fastText.load_model("/local/pouyet/py37/models/wiki.en.bin")
for x in dataset_class:
    x.compute_embedding(model)
    x.qrels = qrels[str(x._id)]

with open(dataset_classes_path, "wb") as f:
    pickle.dump(dataset_class, f)

# Build torch dataset
dataset_torch = KeyWordSelectionDataset(querytext_list=dataset_class)
torch.save(dataset_torch, torchdataset_path)
def main():
    # enabling eager execution of tensorflow; it is enabled by default in
    # version 2 but not in version 1 #HR
    tf.enable_eager_execution()

    # parsing arguments #HR
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-w', '--weights_path', nargs="?", type=str)
    parser.add_argument('-f', '--folds', nargs="?", type=int, default=5)
    parser.add_argument('-e', '--nb_epoch', nargs="?", type=int)
    parser.add_argument('-l', '--l1_weight', nargs="?", type=float)
    parser.add_argument('-d', '--dropout_rate', nargs="?", type=float, default=0.0)
    parser.add_argument('--lr', nargs="?", type=float)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    parser.add_argument('--IR_model', nargs="?", type=str, default='tf')
    parser.add_argument('-u', '--update_embeddings', action="store_true")
    args = parser.parse_args()
    print(args, flush=True)

    # Loading indexed collection #HR
    Collection = TrecCollection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)
    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    # Loading relevance judgements from collection #HR
    with open(args.coll_path + 'qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    # Remove the leading 'titl' token from indexed queries when present #HR
    id_titl = Collection.vocabulary['titl']
    for i in range(len(Collection.all_indexed_queries)):
        if Collection.all_indexed_queries[i][0] == id_titl and len(
                Collection.all_indexed_queries[i]) > 1:
            print("found it at ", i, " ", Collection.all_indexed_queries[i][0])
            del Collection.all_indexed_queries[i][0]
    for i in range(len(Collection.indexed_queries)):
        for j in range(len(Collection.indexed_queries[i])):
            if Collection.indexed_queries[i][j][0] == id_titl and len(
                    Collection.indexed_queries[i][j]) > 1:
                del Collection.indexed_queries[i][j][0]

    print('---------------------start-------------------', flush=True)

    # Getting collection vocabulary size and total number of elements in collection #HR
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)

    # Creating, for each fold of a given experiment, a directory for results,
    # weights and plot data #HR
    for fold in range(args.folds):
        plot_values = dict()
        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            plot_values[model_name] = [[], []]
        if not os.path.exists(args.results_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.results_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)
        if not os.path.exists(args.weights_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.weights_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)
        if not os.path.exists(args.plot_path + '/fold' + str(fold) + '/'):
            os.makedirs(args.plot_path + '/fold' + str(fold) + '/')

        # Computing metrics for baseline models for a certain fold and
        # updating the plot_values dictionary #HR
        # HR changed eval_baseline_index to eval_baseline_index_trec;
        # the previous version did not work because of different call parameters
        utils.eval_baseline_index_trec(args.coll_path, Collection, fold, qrel,
                                       plot_values, args.results_path,
                                       args.experiment_name, 0)

        # Saving the plot_values dict of a particular fold as a pickle #HR
        pickle.dump(
            plot_values,
            open(args.plot_path + '/fold' + str(fold) + '/' + args.experiment_name,
                 'wb'))

        # Initialization of the batch size, the loss function, the optimizer
        # and the model to train #HR
        batch_gen_time = []
        batch_size = 32
        y_true = tf.ones(batch_size, )
        loss_function = tf.keras.losses.Hinge()
        optimizer = tf.keras.optimizers.Adam(args.lr)
        if args.IR_model == 'tf':
            model = differentiable_models.diff_simple_TF(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)
        elif args.IR_model == 'tf_idf':
            model = differentiable_models.diff_TF_IDF(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)
        elif args.IR_model == 'DIR':
            model = differentiable_models.diff_DIR(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)
        elif args.IR_model == 'BM25':
            model = differentiable_models.diff_BM25(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)
        # HR added JM model
        elif args.IR_model == 'JM':
            model = differentiable_models.diff_JM(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)

        # Training the model #HR
        print("Start training for fold ", fold, " ", args.experiment_name, flush=True)
        epoch = 0
        prop_elem_index = 1.0
        while epoch < args.nb_epoch and prop_elem_index > 0.05:
            begin = time.time()
            # generation of batches from the trec collection for training #HR
            query_batches, positive_doc_batches, negative_doc_batches = \
                Collection.generate_training_batches(fold, batch_size)
            rank_loss = 0.0
            reg_loss = 0.0
            all_non_zero = 0.0
            begin = time.time()
            for i in range(len(query_batches)):
                with tf.GradientTape() as tape:
                    # reshaping queries, pos_documents and neg_documents into
                    # a numpy ndarray #HR
                    queries = tf.keras.preprocessing.sequence.pad_sequences(
                        [Collection.all_indexed_queries[j] for j in query_batches[i]],
                        padding='post')
                    pos_documents = tf.keras.preprocessing.sequence.pad_sequences(
                        [Collection.indexed_docs[j] for j in positive_doc_batches[i]],
                        padding='post')
                    neg_documents = tf.keras.preprocessing.sequence.pad_sequences(
                        [Collection.indexed_docs[j] for j in negative_doc_batches[i]],
                        padding='post')

                    # Creating sparse query, pos_document and neg_document indexes #HR
                    q_sparse_index = [[column, j] for j, raw in enumerate(queries)
                                      for column in raw]
                    pos_d_sparse_index = [[column, j]
                                          for j, raw in enumerate(pos_documents)
                                          for column in raw]
                    neg_d_sparse_index = [[column, j]
                                          for j, raw in enumerate(neg_documents)
                                          for column in raw]

                    # computing relevance and dense document representations for
                    # the negative and positive documents in the batch #HR
                    pos_res, pos_d = model(
                        np.clip(queries, 0, 1).astype(np.float32), queries,
                        q_sparse_index, pos_documents, pos_d_sparse_index)
                    neg_res, neg_d = model(
                        np.clip(queries, 0, 1).astype(np.float32), queries,
                        q_sparse_index, neg_documents, neg_d_sparse_index)

                    # Computing the hinge loss, the regularization loss and the
                    # total loss #HR
                    ranking_loss = loss_function(y_true=y_true, y_pred=pos_res - neg_res)
                    regularization_loss = tf.norm(pos_d + neg_d, ord=1)
                    rank_loss += ranking_loss.numpy()
                    reg_loss += regularization_loss.numpy()
                    all_non_zero += tf.math.count_nonzero(pos_d + neg_d).numpy()
                    loss = (1.0 - args.l1_weight) * ranking_loss + \
                        args.l1_weight * regularization_loss

                # Calculating gradients #HR
                if args.update_embeddings:
                    gradients = tape.gradient(loss, model.trainable_variables)
                else:
                    gradients = tape.gradient(loss, model.trainable_variables[1:])

                # Back-propagating the gradients #HR
                if args.update_embeddings:
                    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                else:
                    optimizer.apply_gradients(zip(gradients, model.trainable_variables[1:]))

            # Compute the TDVs after the training and save them #HR
            weights = model.compute_index()
            pickle.dump(
                weights,
                open(args.weights_path + '/fold' + str(fold) + '/' +
                     args.experiment_name + '/epoch_' + str(epoch), 'wb'))

            inverted_index, redefined_idf, redefined_docs_length, redefined_c_freq = \
                utils.utils_compute_info_retrieval(Collection, weights, weighted=True)  # JF
            # inverted_index, idf, docs_length, c_freq = utils.compute_info_retrieval(
            #     Collection, weights, weighted=False)

            # Computing the new vocab_size and total number of elements after
            # introducing the TDV #HR
            vocab_size, tot_nb_elem = utils.evaluate_inverted_index(inverted_index)
            print(str(100 * vocab_size / coll_vocab_size)[0:5] +
                  '% of the vocabulary is kept')
            print(str(100 * tot_nb_elem / coll_tot_nb_elem)[0:5] +
                  '% of the index is kept', flush=True)
            prop_elem_index = tot_nb_elem / coll_tot_nb_elem

            # Evaluating baseline models with their new inverted index and new
            # idf, doc length and collection frequencies #HR
            # HR changed eval_learned_index to eval_learned_index_trec;
            # the previous version did not work because of different call parameters
            utils.eval_learned_index_trec(
                args.coll_path, Collection, args.IR_model, model, qrel,
                plot_values, args.plot_path, fold, inverted_index, weights,
                redefined_idf, redefined_docs_length, redefined_c_freq,
                # idf, docs_length, c_freq,
                prop_elem_index, args.results_path, args.experiment_name,
                epoch + 1)
            epoch += 1

        print("finish training for fold ", fold, " ", args.experiment_name,
              flush=True)  # HR

    print("-----------------Finished-------------------", flush=True)  # HR
# This snippet begins mid-loop: `line` is a tab-separated "qid\tquery text"
# line read from a queries file in the surrounding (elided) code.
qid, qtext = line.strip().split('\t')
qtext = re.sub(r'[^\w\s]', ' ', qtext)
qtokens = [word for word in qtext.strip().split(' ') if word != '']
queries[qid] = qtokens

folds = {}
for collection in collections:
    with open(os.path.join('..', 'queries', 'json', collection + '.json')) as f:
        folds[collection] = json.load(f)

with open(el_path) as f:
    qid_entities = json.load(f)

with open(qrel_path, 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut'})

# build a redirect map from triples of the form "subj pred obj ."
redirects = {}
with open(redirects_path) as f:
    for line in f:
        if not line.startswith('#'):
            subj, pred, obj = line.split(maxsplit=2)
            obj = obj[:obj.rfind('.')].strip()
            redirects[subj] = obj

with open(ir_run_path, "r") as ir_run_file:
    ir_run = pytrec_eval.parse_run(ir_run_file)

model = Word2Vec.load(args.model)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-w', '--weights_path', nargs="?", type=str)
    parser.add_argument('-e', '--nb_epoch', nargs="?", type=int)
    parser.add_argument('-l', '--l1_weight', nargs="?", type=float)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    parser.add_argument('-u', '--update_embeddings', action="store_true")
    # HR added the choice of a particular differentiable model; there was no
    # choice in the original file
    parser.add_argument('--IR_model', nargs="?", type=str, default='tf')
    # HR added the option to choose a learning rate and dropout rate
    parser.add_argument('--lr', nargs="?", type=float)
    parser.add_argument('-d', '--dropout_rate', nargs="?", type=float, default=0.0)
    args = parser.parse_args()
    print(args, flush=True)

    if not os.path.exists(args.results_path + '/validation/' + args.experiment_name):
        os.makedirs(args.results_path + '/validation/' + args.experiment_name)
    if not os.path.exists(args.results_path + '/test/' + args.experiment_name):
        os.makedirs(args.results_path + '/test/' + args.experiment_name)
    if not os.path.exists(args.weights_path + '/' + args.experiment_name):
        os.makedirs(args.weights_path + '/' + args.experiment_name)
    if not os.path.exists(args.plot_path + '/validation/'):
        os.makedirs(args.plot_path + '/validation/')
    if not os.path.exists(args.plot_path + '/test/'):
        os.makedirs(args.plot_path + '/test/')

    # Initializing the results plot values for the different models #HR
    validation_plot_values = dict()
    test_plot_values = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        validation_plot_values[model_name] = [[], []]
        test_plot_values[model_name] = [[], []]

    # Loading indexed collection #HR
    Collection = wikIR_Collection.Collection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)
    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    # Loading validation and test query relevance values #HR
    with open(args.coll_path + 'validation/qrels', 'r') as f_qrel:
        validation_qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.coll_path + 'test/qrels', 'r') as f_qrel:
        test_qrel = pytrec_eval.parse_qrel(f_qrel)

    print('------------------------------start--------------------------', flush=True)

    # Computing collection vocabulary size and total number of elements #HR
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)

    # Evaluating the baseline models without TDV weights and saving the
    # results of the validation and test partitions #HR
    # HR changed eval_baseline_index to eval_baseline_index_wikir;
    # the previous version did not work because of different call parameters
    utils.eval_baseline_index_wikir(args.coll_path, Collection, validation_qrel,
                                    test_qrel, validation_plot_values,
                                    test_plot_values, args.results_path,
                                    args.experiment_name, 0)
    pickle.dump(validation_plot_values,
                open(args.plot_path + '/validation/' + args.experiment_name, 'wb'))
    pickle.dump(test_plot_values,
                open(args.plot_path + '/test/' + args.experiment_name, 'wb'))

    # Initialization of the batch size, the loss function and the optimizer
    # for the training #HR
    batch_gen_time = []
    batch_size = 64
    y_true = tf.ones(batch_size, )
    loss_function = tf.keras.losses.Hinge()
    optimizer = tf.keras.optimizers.Adam(args.lr)

    # Loading the differentiable model used for the training #HR
    # HR added options for different IR models; in the original version only
    # the simple tf model was present
    if args.IR_model == 'tf':
        model = differentiable_models.diff_simple_TF(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)  # HR
    elif args.IR_model == 'tf_idf':
        model = differentiable_models.diff_TF_IDF(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)  # HR
    elif args.IR_model == 'DIR':
        model = differentiable_models.diff_DIR(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)  # HR
    elif args.IR_model == 'BM25':
        model = differentiable_models.diff_BM25(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)  # HR
    elif args.IR_model == 'JM':
        model = differentiable_models.diff_JM(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)

    # Starting the training
    print("Start training ", args.experiment_name, flush=True)
    epoch = 0
    prop_elem_index = 1.0
    while epoch < args.nb_epoch and prop_elem_index > 0.2:
        begin = time.time()
        # Generating batches from the wikIR collection for training #HR
        query_batches, positive_doc_batches, negative_doc_batches = \
            Collection.generate_training_batches(batch_size)
        rank_loss = 0.0
        reg_loss = 0.0
        all_non_zero = 0.0
        begin = time.time()
        for i in range(len(query_batches)):
            with tf.GradientTape() as tape:
                # reshaping queries, pos_documents and neg_documents into a
                # numpy ndarray #HR
                queries = tf.keras.preprocessing.sequence.pad_sequences(
                    [Collection.indexed_training_queries[j] for j in query_batches[i]],
                    padding='post')
                pos_documents = tf.keras.preprocessing.sequence.pad_sequences(
                    [Collection.indexed_docs[j] for j in positive_doc_batches[i]],
                    padding='post')
                neg_documents = tf.keras.preprocessing.sequence.pad_sequences(
                    [Collection.indexed_docs[j] for j in negative_doc_batches[i]],
                    padding='post')

                # Creating sparse query, pos_document and neg_document indexes #HR
                q_sparse_index = [[column, j] for j, raw in enumerate(queries)
                                  for column in raw]
                pos_d_sparse_index = [[column, j]
                                      for j, raw in enumerate(pos_documents)
                                      for column in raw]
                neg_d_sparse_index = [[column, j]
                                      for j, raw in enumerate(neg_documents)
                                      for column in raw]

                # computing relevance and dense document representations for the
                # negative and positive documents in the batch #HR
                pos_res, pos_d = model(
                    np.clip(queries, 0, 1).astype(np.float32), queries,
                    q_sparse_index, pos_documents, pos_d_sparse_index)
                neg_res, neg_d = model(
                    np.clip(queries, 0, 1).astype(np.float32), queries,
                    q_sparse_index, neg_documents, neg_d_sparse_index)

                # Computing the hinge loss, the regularization loss and the
                # total loss #HR
                ranking_loss = loss_function(y_true=y_true, y_pred=pos_res - neg_res)
                regularization_loss = tf.norm(pos_d + neg_d, ord=1)
                rank_loss += ranking_loss.numpy()
                reg_loss += regularization_loss.numpy()
                all_non_zero += tf.math.count_nonzero(pos_d + neg_d).numpy()
                loss = (1.0 - args.l1_weight) * ranking_loss + \
                    args.l1_weight * regularization_loss

            # Calculating gradients #HR
            if args.update_embeddings:
                gradients = tape.gradient(loss, model.trainable_variables)
            else:
                gradients = tape.gradient(loss, model.trainable_variables[1:])

            # Back-propagating the gradients #HR
            if args.update_embeddings:
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            else:
                optimizer.apply_gradients(zip(gradients, model.trainable_variables[1:]))

        # Computing TDVs and saving them #HR
        weights = model.compute_index()
        pickle.dump(
            weights,
            open(args.weights_path + '/' + args.experiment_name + '/epoch_' +
                 str(epoch), 'wb'))

        # updating the inverted index and computing the new idf, doc lengths
        # and collection frequencies #HR
        inverted_index, redefined_idf, redefined_docs_length, redefined_c_freq = \
            utils.compute_info_retrieval(Collection, weights, weighted=True)

        # Computing the new vocab_size and total number of elements after
        # introducing the TDV #HR
        vocab_size, tot_nb_elem = utils.evaluate_inverted_index(inverted_index)
        print(str(100 * vocab_size / coll_vocab_size)[0:5] +
              '% of the vocabulary is kept')
        print(str(100 * tot_nb_elem / coll_tot_nb_elem)[0:5] +
              '% of the index is kept', flush=True)
        prop_elem_index = tot_nb_elem / coll_tot_nb_elem

        # Evaluating baseline models with their new inverted index and new idf,
        # doc length and collection frequencies
        # HR changed eval_learned_index to eval_learned_index_wikir;
        # the previous version did not work because of different call parameters
        utils.eval_learned_index_wikir(
            args.coll_path, Collection, args.IR_model, model, validation_qrel,
            test_qrel, validation_plot_values, test_plot_values, args.plot_path,
            inverted_index, redefined_idf, redefined_docs_length,
            redefined_c_freq, prop_elem_index, args.results_path,
            args.experiment_name, epoch)
        epoch += 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--qrel',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt')
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')
    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run", args.run)

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    total = len(results)
    sum_map = 0.0
    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            # print_line(measure, query_id, value)
            pass

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    print("==========")
    selected_measures = [
        'map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20',
        'recall_5', 'recall_10', 'recall_15', 'recall_20', 'ndcg'
    ]
    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        # print_line(measure, 'all', eva_values[measure])

    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    print("%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f" %
          (final_auc, final_accuracy, eva_values['map'], eva_values['recip_rank'],
           eva_values['P_5'], eva_values['P_10'], eva_values['P_15'],
           eva_values['P_20'], eva_values['recall_5'], eva_values['recall_10'],
           eva_values['recall_15'], eva_values['recall_20'], eva_values['ndcg']))