def eval_transformer(self, subsample: int = 0):
    results = {}
    all_questions_ids = get_topics(task=self.task, subset=self.subset)
    all_questions = {int(qid): self.post_parser.map_questions[int(qid)]
                     for qid in all_questions_ids}
    # all_questions = {int(qid): "Question %s content" % qid for qid in all_questions_ids}
    if subsample:
        # a dict cannot be sliced directly; keep only the first `subsample` questions
        all_questions = dict(list(all_questions.items())[:subsample])
    for i, (qid, question) in enumerate(all_questions.items()):
        results[str(qid)] = {}
        judged_answer_ids = get_judged_documents(task=self.task, subset=self.subset, topic=str(qid))
        question_e = self.model.encode([question.body], batch_size=8)
        try:
            answers_bodies = [self.post_parser.map_just_answers[int(aid)].body
                              for aid in judged_answer_ids]
            # answers_bodies = ["Answer %s body" % aid for aid in judged_answer_ids]
        except KeyError:
            print("Key error at qid %s" % qid)
            answers_bodies = []
            # answers_bodies = ["Answer %s body" % aid for aid in judged_answer_ids]
        if not answers_bodies:
            print("No evaluated answers for question %s, dtype %s" % (qid, str(type(qid))))
            continue
        answers_e = self.model.encode(answers_bodies, batch_size=8)
        # cosine similarity between the question embedding and every judged answer embedding
        answers_dists = cosine_similarity(np.array(question_e), np.array(answers_e))[0]
        for aid, answer_sim in sorted(zip(judged_answer_ids, answers_dists),
                                      key=lambda qid_dist: qid_dist[1], reverse=True):
            print(aid, answer_sim)
            results[str(qid)][str(aid)] = float(answer_sim)
    return results
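# Usage sketch (illustration only, not from the original code): score the per-topic
# similarities produced by eval_transformer with arqmath_eval's get_ndcg, the same
# call used in the standalone eval_transformer further below. `evaluator` stands for
# a hypothetical, already-initialised instance of the class that owns this method.
from arqmath_eval import get_ndcg

results = evaluator.eval_transformer(subsample=100)
ndcg_val = get_ndcg(results, task=evaluator.task, subset=evaluator.subset)
print("validation NDCG: %.4f" % ndcg_val)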
def index_judged_questions(self, reload_embs_dir=False, subsample_to=2000):
    relevant_qs = dict()
    for relevant_qi in get_judged_documents(task='task1-votes', subset='validation'):
        try:
            parent_id = self.post_parser.map_just_answers[int(relevant_qi)].parent_id
        except KeyError as e:
            print("IREvaluator error: judged answer %s was not loaded and cannot be evaluated" % relevant_qi)
            raise e
        relevant_qs[parent_id] = self.post_parser.map_questions[parent_id]
    indexed_items = list(relevant_qs.items())
    if subsample_to:
        # the fixed seed assures that the same questions are chosen every time
        random.seed(1234)
        indexed_items = random.sample(indexed_items, subsample_to)
    self.add_to_index(indexed_items, reload_embs_dir=reload_embs_dir)
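# Usage sketch (illustration only): index the parent questions of the judged validation
# answers before evaluating, so their embeddings are available in the index. `evaluator`
# is again a hypothetical instance; the defaults come from the method above.
evaluator.index_judged_questions(reload_embs_dir=False, subsample_to=2000)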
def eval_transformer(model_dir: str, preproc: str, subsample: int = 0):
    model = SentenceTransformer(model_dir, device=device)
    task = 'task1-votes'
    subset = 'validation'
    results = {}
    all_questions_ids = get_topics(task=task, subset=subset)
    all_questions = get_questions(all_questions_ids, preproc=preproc)
    if subsample:
        all_questions = all_questions[:subsample]
    for i, (qid, question) in enumerate(all_questions):
        results[qid] = {}
        judged_answer_ids = get_judged_documents(task=task, subset=subset, topic=str(qid))
        question_e = model.encode([question.body], batch_size=8)
        answers_bodies = [dr.post_parser.map_just_answers[int(aid)].body
                          for aid in judged_answer_ids]
        if not answers_bodies:
            print("No evaluated answers for question %s, dtype %s" % (qid, str(type(qid))))
            continue
        answers_e = model.encode(answers_bodies, batch_size=8)
        answers_dists = cosine_similarity(np.array(question_e), np.array(answers_e))[0]
        if i % 100 == 0:
            print("Question %s of %s" % (i, len(all_questions)))
        for aid, answer_sim in sorted(zip(judged_answer_ids, answers_dists),
                                      key=lambda qid_dist: qid_dist[1], reverse=True):
            # print(aid, answer_sim)
            results[qid][aid] = float(answer_sim)
    ndcg_val = get_ndcg(results, task=task, subset=subset)
    return ndcg_val, results
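# Usage sketch (illustration only): the standalone eval_transformer above returns the
# validation NDCG together with the per-topic similarity scores. The model directory
# and the preprocessing name below are placeholders, not values from the original code.
ndcg_val, results = eval_transformer('path/to/sentence-transformer-model', preproc='no_preproc')
print('validation NDCG: %.4f' % ndcg_val)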
from arqmath_eval import get_topics, get_judged_documents
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

task = 'task1-votes'
subset = 'validation'

results = {}
all_questions_ids = get_topics(task=task, subset=subset)
all_questions = {int(qid): postproc_parser.map_questions[int(qid)] for qid in all_questions_ids}

for i, (qid, question) in tqdm(list(enumerate(all_questions.items())),
                               desc="Collecting answers for %s questions" % len(all_questions)):
    results[qid] = {}
    judged_answer_ids = get_judged_documents(task=task, subset=subset, topic=str(qid))
    question_e = model_saved.encode([question.body], batch_size=8)
    answers_bodies = [postproc_parser.map_just_answers[int(aid)].body for aid in judged_answer_ids]
    if not answers_bodies:
        print("No evaluated answers for question %s, dtype %s" % (qid, str(type(qid))))
        continue
    answers_e = model_saved.encode(answers_bodies, batch_size=8)
    answers_dists = cosine_similarity(np.array(question_e), np.array(answers_e))[0]
    for aid, answer_sim in sorted(zip(judged_answer_ids, answers_dists),
                                  key=lambda qid_dist: qid_dist[1], reverse=True):
        # print("Q %s, A %s: sim: %s" % (qid, aid, answer_sim))
        results[qid][aid] = float(answer_sim)


def report_ndcg_results(result_tsv_name: str, results: dict):
    with open(result_tsv_name, 'wt') as f:
        for topic, documents in results.items():
            # keep at most the 1000 best-scoring documents per topic
            top_documents = sorted(documents.items(), key=lambda x: x[1], reverse=True)[:1000]
            for rank, (document, similarity_score) in enumerate(top_documents):
                # the loop body was missing in the original snippet; the column layout
                # (topic, document, 1-based rank, score) is assumed from the TSV run format
                f.write("%s\t%s\t%s\t%s\n" % (topic, document, rank + 1, similarity_score))
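# Usage sketch (illustration only): persist the collected similarities as a TSV run file
# and, optionally, score them directly with arqmath_eval's get_ndcg. The output filename
# is a placeholder; the column layout is the assumption noted inside report_ndcg_results.
from arqmath_eval import get_ndcg

report_ndcg_results('transformer_run.tsv', results)
print('validation NDCG: %.4f' % get_ndcg(results, task=task, subset=subset))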
def get_common_parameters(result_type, math_representation, datasets, dataset_parameters):
    discard_math = math_representation == 'nomath'
    judged_results = result_type == 'judged'

    if result_type == 'judged':
        topic_corpus_filename = ARQMATH_COLLECTION_POSTS_FILENAMES[
            math_representation if not discard_math else 'latex']
        topic_corpus_num_documents = ARQMATH_COLLECTION_POSTS_NUM_DOCUMENTS
        topic_ids = get_topics(task=TASK[result_type], subset=SUBSET[result_type])
        document_corpus_filename = topic_corpus_filename
        document_corpus_num_documents = topic_corpus_num_documents
        document_ids = get_judged_documents(task=TASK[result_type], subset=SUBSET[result_type])
        topic_judgements = {
            topic_id: get_judged_documents(task=TASK[result_type],
                                           subset=SUBSET[result_type],
                                           topic=topic_id)
            for topic_id in topic_ids
        }
    else:
        topic_judgements = None

    if result_type == 'task1':
        topic_corpus_filename = ARQMATH_TASK1_POSTS_FILENAMES[
            math_representation if not discard_math else 'latex']
        topic_corpus_num_documents = ARQMATH_TASK1_TEST_POSTS_NUM_DOCUMENTS
        # topics A.31 and A.78 are excluded from the 100 Task 1 queries
        topic_ids = [
            'A.{}'.format(query_number + 1)
            for query_number in range(100)
            if (query_number + 1) not in (31, 78)
        ]
        document_corpus_filename = ARQMATH_COLLECTION_POSTS_FILENAMES[
            math_representation if not discard_math else 'latex']
        document_corpus_num_documents = ARQMATH_COLLECTION_POSTS_NUM_DOCUMENTS
        document_ids = get_judged_documents(task=TASK[result_type], subset=SUBSET[result_type])
    elif result_type == 'task2':
        assert not discard_math
        topic_corpus_filename = ARQMATH_TASK2_FORMULAE_FILENAMES[math_representation]
        topic_corpus_num_documents = ARQMATH_TASK2_FORMULAE_NUM_FORMULAE[math_representation]
        topic_ids = set(
            (
                formula_id,
                *get_judged_documents(task=TASK[result_type],
                                      subset=SUBSET[result_type],
                                      topic=formula_id),
            )
            for formula_id in get_topics(task=TASK[result_type], subset=SUBSET[result_type])
        )
        document_corpus_filename = ARQMATH_COLLECTION_FORMULAE_FILENAMES[math_representation]
        document_corpus_num_documents = ARQMATH_COLLECTION_FORMULAE_NUM_FORMULAE[math_representation]
        document_ids = None

    if isinstance(datasets, str):
        datasets = [datasets]
    json_filenames = [
        DATASET_JSON_FILENAMES[dataset][math_representation if not discard_math else 'latex']
        for dataset in datasets
    ]
    json_nums_paragraphs = [DATASET_NUMS_PARAGRAPHS[dataset] for dataset in datasets]

    dataset_parameters = {**DATASET_DEFAULT_PARAMETERS, **dataset_parameters}
    dataset_formattable_parameter_string = parameters_to_string({
        **dataset_parameters,
        **{'phrases': '{}'},
    })
    phraser_filename = ARXMLIV_OUTPUT_FILENAME.format(
        math_representation,
        '{}.phraser'.format(dataset_formattable_parameter_string))

    return {
        'judged_results': judged_results,
        'topic_judgements': topic_judgements,
        'json_filenames': json_filenames,
        'json_nums_paragraphs': json_nums_paragraphs,
        'dataset_parameters': dataset_parameters,
        'phraser_filename': phraser_filename,
        'topic_ids': topic_ids,
        'topic_corpus_filename': topic_corpus_filename,
        'topic_corpus_num_documents': topic_corpus_num_documents,
        'document_ids': document_ids,
        'document_corpus_filename': document_corpus_filename,
        'document_corpus_num_documents': document_corpus_num_documents,
        'discard_math': discard_math,
        'result_type': result_type,
    }
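# Usage sketch (illustration only): get_common_parameters bundles corpus filenames,
# document counts, topic/document ids, judgements, and the phraser path for one
# evaluation setting. The dataset key 'arxmliv' and the empty parameter dict are
# placeholders; only 'judged'/'task1'/'task2' and the math representations handled
# above ('nomath', 'latex', ...) are inputs taken from the original code.
common_parameters = get_common_parameters(
    result_type='judged',
    math_representation='latex',
    datasets='arxmliv',
    dataset_parameters={},
)
topic_ids = common_parameters['topic_ids']
document_ids = common_parameters['document_ids']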