def results_analysis():
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json"
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)

    # terms_based_results_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)
    append_baseline_context(doc_results, data_list)

    len_list = []
    for rset in doc_results['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(doc_results, data_list)
def experiment_dev_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
    g_score_dict = dict()
    load_from_file(g_score_dict,
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    doc_retri_pred_dict = init_results_v8(data_list, data_list, terms_based_results_list, g_score_dict,
                                          match_filtering_k=match_filtering_k,
                                          term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(doc_retri_pred_dict, "hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(doc_retri_pred_dict,
                                                         multihop_retrieval_top_k=multihop_retrieval_top_k)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict, "hotpot_dev_doc_retrieval_v8.json")
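# The Counter/mean/std/max/min reporting block above is repeated verbatim in
# several functions in this file. A small helper like the sketch below could
# replace those blocks; `print_length_stats` is a hypothetical name, not part
# of the original codebase.
def print_length_stats(len_list, header):
    # Report the distribution of per-question retrieved-set sizes.
    print(header)
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

# Usage sketch:
# print_length_stats([len(rset) for rset in doc_retri_pred_dict['sp_doc'].values()],
#                    header="Results without filtering:")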
def full_wiki_baseline_upperbound():
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)
    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)

    upperbound_pred_file = dict()
    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()

    # print(dev_fullwiki)

    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']
        # supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        # retrieved_doc_dict = set([context[0] for context in contexts])

        retrieved_doc_dict = dict()
        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()
            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []
        found_answer = False

        for sp_doc, sp_fact_line_num in supporting_facts:
            if sp_doc in retrieved_doc_dict and sp_fact_line_num in retrieved_doc_dict[sp_doc]:
                upperbound_pred_doc.append(sp_doc)
                upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                    found_answer = True

        p_answer = answer if found_answer else ""
        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc
        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If we found all the evidence, also credit the yes/no answers,
            # which never appear verbatim in the context.
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)
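# Toy walk-through of the upper-bound logic above, assuming the HotpotQA item
# layout ('context' holds [title, sentence_list] pairs; 'supporting_facts'
# holds [title, sentence_index] pairs). All values below are made up.
def _upperbound_logic_demo():
    contexts = [["Doc A", ["Sentence mentioning Paris.", "Another sentence."]],
                ["Doc B", ["Unrelated sentence."]]]
    supporting_facts = [["Doc A", 0], ["Doc C", 1]]  # "Doc C" was never retrieved
    answer = "Paris"
    retrieved = {title: {i: s for i, s in enumerate(sents)} for title, sents in contexts}
    pred_sp = [[d, i] for d, i in supporting_facts if d in retrieved and i in retrieved[d]]
    found_answer = any(answer in retrieved[d][i] for d, i in pred_sp)
    # Only facts present in the retrieved context count toward the oracle, so a
    # missing gold document ("Doc C") caps the achievable sp and answer scores.
    print(pred_sp, found_answer)  # [['Doc A', 0]] True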
def load_and_eval():
    top_k = 50
    value_threshold = None  # currently unused

    tf_idf_dev_results = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    doc_pred_dict = {'sp_doc': dict()}
    for item in tqdm(tf_idf_dev_results):
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        # print(sorted_scored_list)
        qid = item['qid']
        doc_pred_dict['sp_doc'][qid] = pred_list
        # break

    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    ext_hotpot_eval.eval(doc_pred_dict, dev_fullwiki_list)
def eval_hotpot_s():
    cur_dev_eval_results_list_out = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpot_p_level_effects/hotpot_s_level_dev_results_top_k_doc_100.jsonl")
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list_out, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Threshold 0.5:
    cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.5,
                                                            result_field='sp')
    # cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.2,
    #                                                         result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    # _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        # 'v02': metrics_v2,
        'v05': metrics_v5,
    }
    print(logging_item)

    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    print(em, pr, rec, f1)
def inspect_upstream_eval():
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl")
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.5,
    #                                                         result_field='sp')
    cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.2,
                                                            result_field='sp')

    # _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    v02_sp_f1 = metrics_v2['sp_f1']
    v02_sp_recall = metrics_v2['sp_recall']
    v02_sp_prec = metrics_v2['sp_prec']
    # The v05 metrics are only defined when the 0.5-threshold evaluation above
    # is enabled; referencing metrics_v5 here would raise a NameError.
    # v05_sp_f1 = metrics_v5['sp_f1']
    # v05_sp_recall = metrics_v5['sp_recall']
    # v05_sp_prec = metrics_v5['sp_prec']

    logging_item = {
        'label': 'ema',
        'v02': metrics_v2,
        # 'v05': metrics_v5,
    }
    print(logging_item)
def eval_p_level():
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")

    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top 5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    print(metrics_top5)
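# select_top_k_and_to_results_dict is called throughout this file with top_k,
# score_field_name='prob', and filter_value arguments. The sketch below shows
# one common reading of that interface (rank by probability, keep at most
# top_k items at or above the threshold); it is an illustration only, and the
# repo's actual helper may use a different threshold or tie-breaking rule.
def select_top_k_sketch(scored_facts, top_k=5, filter_value=0.5):
    # scored_facts: list of (fact_id, prob) pairs for a single question.
    ranked = sorted(scored_facts, key=lambda x: x[1], reverse=True)
    return [fid for fid, prob in ranked[:top_k] if prob >= filter_value]

# select_top_k_sketch([('f1', 0.9), ('f2', 0.4), ('f3', 0.6)]) -> ['f1', 'f3']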
def model_go(sent_filter_value, sent_top_k=5):
    seed = 12
    torch.manual_seed(seed)
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 32
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_rate = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 1000
    do_lower_case = True
    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128
    qa_num_of_layer = 2
    do_ema = True
    ema_device_num = 1
    # s_filter_value = 0.5
    s_filter_value = sent_filter_value
    # s_top_k = 5
    s_top_k = sent_top_k

    experiment_name = f'hotpot_v0_qa_(s_top_k:{s_top_k},s_fv:{s_filter_value},qa_layer:{qa_num_of_layer})'

    print("Potential total length:", max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load datasets.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    dev_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl")
    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list, dev_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
        train_list, train_sentence_level_results,
        is_training=True,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    # print(len(dev_fitem_list))
    # print(len(dev_fitem_dict))

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    if debug:
        dev_list = dev_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitem_list)

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer, lazy=lazy, example_filter=None)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=ema_device_num)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    iterator = BasicIterator(batch_size=batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    print("Total train instances:", len(train_fitem_list))

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_rate,
                         t_total=num_train_optimization_steps)

    dev_instances = span_pred_reader.read(dev_fitem_list)

    forbackward_step = 0
    update_step = 0
    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix = None
    if not debug:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        print("Resampling:")
        train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
            train_list, train_sentence_level_results,
            is_training=True,
            tokenizer=tokenizer,
            max_context_length=max_pre_context_length,
            max_query_length=max_query_length,
            filter_value=s_filter_value,
            doc_stride=doc_stride,
            top_k=s_top_k,
            debug_mode=debug)
        random.shuffle(train_fitem_list)
        train_instances = span_pred_reader.read(train_fitem_list)
        train_iter = iterator(train_instances, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter, desc="Batch Loop"):
            model.train()
            batch = allen_util.move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            gt_span = batch['gt_span']

            loss = model(mode=BertSpan.ForwardMode.TRAIN,
                         input_ids=paired_sequence,
                         token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         gt_span=gt_span)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    if do_ema and ema is not None:
                        print("EMA EVAL")
                        ema_model = ema.get_inference_model()
                        ema_inference_device_ids = get_ema_gpu_id_list(master_device_num=ema_device_num)
                        ema_model = ema_model.to(ema_device_num)
                        ema_model = torch.nn.DataParallel(ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_eitem_list, cur_eval_dict = span_eval(ema_model, dev_iter, do_lower_case,
                                                                  dev_fitem_dict, ema_device_num,
                                                                  show_progress=False)
                        cur_results_dict = dict()
                        cur_results_dict['p_answer'] = cur_eval_dict
                        cur_results_dict['sp'] = dev_sp_results_dict

                        _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
                        print(metrics)
                        print("---------------" * 3)

                        logging_item = {
                            'label': 'ema',
                            'score': metrics,
                        }
                        joint_f1 = metrics['joint_f1']
                        joint_em = metrics['joint_em']
                        print(logging_item)

                        if not debug:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|j_f1({joint_f1})|j_em({joint_em})|seed({seed})'

                            logging_agent.incorporate_results({}, save_file_name, logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
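# The EMA object used in the training loop above keeps a shadow copy of the
# parameters and is called after every optimizer step. Below is a minimal
# sketch of the standard exponential-moving-average update rule; `SimpleEMA`
# is a hypothetical stand-in, and the repo's EMA class may differ in device
# handling and API.
class SimpleEMA:
    def __init__(self, named_params, decay=0.9999):
        self.decay = decay
        # Snapshot of all trainable parameters at construction time.
        self.shadow = {n: p.detach().clone() for n, p in named_params if p.requires_grad}

    def update(self, named_params):
        # shadow <- decay * shadow + (1 - decay) * param
        for n, p in named_params:
            if n in self.shadow:
                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1 - self.decay)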
def eval_model(model_path, data_file=None, filter_value=0.5):
    seed = 12
    torch.manual_seed(seed)
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 16
    batch_size = 32
    do_lower_case = True
    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128
    qa_num_of_layer = 2
    s_filter_value = filter_value
    s_top_k = 5
    tag = 'dev'

    print("Potential total length:", max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load datasets.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    if data_file is None:
        dev_sentence_level_results = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl")
    else:
        dev_sentence_level_results = common.load_jsonl(data_file)

    test_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl")
    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list, dev_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    test_fitem_dict, test_fitem_list, test_sp_results_dict = get_qa_item_with_upstream_sentence(
        test_list, test_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    # The training-set forward pass is disabled; train_sentence_level_results
    # is loaded above only for completeness.
    # train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
    #     train_list, train_sentence_level_results,
    #     is_training=True, tokenizer=tokenizer,
    #     max_context_length=max_pre_context_length, max_query_length=max_query_length,
    #     filter_value=s_filter_value, doc_stride=doc_stride,
    #     top_k=s_top_k, debug_mode=debug)

    if debug:
        dev_list = dev_list[:100]

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer, lazy=lazy, example_filter=None)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)
    model.load_state_dict(torch.load(model_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    iterator = BasicIterator(batch_size=batch_size)

    if tag == 'dev':
        dev_instances = span_pred_reader.read(dev_fitem_list)
        eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
                                                  device_num, show_progress=True, pred_no_answer=True)
        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        cur_results_dict['sp'] = dev_sp_results_dict
        # common.save_json(cur_results_dict, f"{tag}_qa_sp_results_{filter_value}_doctopk_5.json")
        cur_results_dict['p_answer'] = cur_eval_dict

        _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
        logging_item = {
            'score': metrics,
        }
        print(data_file)
        print(logging_item)

    elif tag == 'test':
        test_instances = span_pred_reader.read(test_fitem_list)
        eval_iter = iterator(test_instances, num_epochs=1, shuffle=False)

        cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, test_fitem_dict,
                                                  device_num, show_progress=True)
        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        cur_results_dict['sp'] = test_sp_results_dict
        common.save_json(cur_results_dict, f"{tag}_qa_sp_results.json")
        cur_results_dict['p_answer'] = cur_eval_dict

        _, metrics = ext_hotpot_eval.eval(cur_results_dict, test_list, verbose=False)
        logging_item = {
            'score': metrics,
        }
        print(logging_item)
def eval_model_for_downstream_ablation(model_saved_path, doc_top_k=2, tag='dev'):
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load upstream paragraph-level eval results. Only the 'dev' ablation is
    # active by default; the 'train' and 'test' paths below require
    # uncommenting the matching loaders here.
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")
    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list,
    #                                      is_training=True, debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k, dev_list, cur_dev_eval_results_list,
                                       is_training=False, debug_mode=debug_mode)
    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list,
    #                                     is_training=False, debug_mode=debug_mode)

    if debug_mode:
        eval_frequency = 2
        # dev_list = dev_list[:10]
        # dev_fitems_list = dev_fitems_list[:296]
        # train_fitems_list = train_fitems_list[:300]
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=128,
                                                element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'train':
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model, train_iter, device_num,
                                                     with_probs=True, show_progress=True)
        common.save_jsonl(
            cur_train_eval_results_list_out,
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model, dev_iter, device_num,
                                                   with_probs=True, show_progress=True)
        common.save_jsonl(cur_dev_eval_results_list_out,
                          f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")
    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model, test_iter, device_num,
                                                    with_probs=True, show_progress=True)
        common.save_jsonl(
            cur_test_eval_results_list_out,
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl")

    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list_out, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.5,
                                                            result_field='sp')
    cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.2,
                                                            result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }
    print(logging_item)

    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']

    common.save_json(logging_item, f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 2000
    pos_ratio = 0.2
    do_lower_case = True
    document_top_k = 2

    experiment_name = f'hotpot_v0_slevel_retri_(doc_top_k:{document_top_k})'

    debug_mode = False
    do_ema = True
    # est_datasize = 900_000

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # train_fitems = sentence_level_sampler.get_train_sentence_pair(document_top_k, True, debug_mode)
    # dev_fitems = sentence_level_sampler.get_dev_sentence_pair(document_top_k, False, debug_mode)

    # Load upstream paragraph-level eval results.
    cur_train_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")

    train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list,
                                     is_training=True, debug_mode=debug_mode)
    dev_fitems = get_sentence_pair(document_top_k, dev_list, cur_dev_eval_results_list,
                                   is_training=False, debug_mode=debug_mode)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=128,
                                                element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                                          copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                                            score_field_name='prob',
                                                                            filter_value=0.5,
                                                                            result_field='sp')
                    cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                                            score_field_name='prob',
                                                                            filter_value=0.2,
                                                                            result_field='sp')

                    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
                    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

                    v02_sp_f1 = metrics_v2['sp_f1']
                    v02_sp_recall = metrics_v2['sp_recall']
                    v02_sp_prec = metrics_v2['sp_prec']

                    v05_sp_f1 = metrics_v5['sp_f1']
                    v05_sp_recall = metrics_v5['sp_recall']
                    v05_sp_prec = metrics_v5['sp_prec']

                    logging_item = {
                        'v02': metrics_v2,
                        'v05': metrics_v5,
                    }
                    print(logging_item)

                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                            f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                            f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'

                        logging_agent.incorporate_results({}, save_file_name, logging_item)
                        logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = Path(file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        master_device_num = 1
                        ema_inference_device_ids = get_ema_gpu_id_list(master_device_num=master_device_num)
                        ema_model = ema_model.to(master_device_num)
                        ema_model = torch.nn.DataParallel(ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                        cur_eval_results_list = eval_model(ema_model, dev_iter, master_device_num,
                                                           with_probs=True)
                        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                                              copied_dev_o_dict,
                                                                              'qid', 'fid', check=True)
                        cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                                                score_field_name='prob',
                                                                                filter_value=0.5,
                                                                                result_field='sp')
                        cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                                                score_field_name='prob',
                                                                                filter_value=0.2,
                                                                                result_field='sp')

                        _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
                        _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

                        v02_sp_f1 = metrics_v2['sp_f1']
                        v02_sp_recall = metrics_v2['sp_recall']
                        v02_sp_prec = metrics_v2['sp_prec']

                        v05_sp_f1 = metrics_v5['sp_f1']
                        v05_sp_recall = metrics_v5['sp_recall']
                        v05_sp_prec = metrics_v5['sp_prec']

                        logging_item = {
                            'label': 'ema',
                            'v02': metrics_v2,
                            'v05': metrics_v5,
                        }
                        print(logging_item)

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                                f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'

                            logging_agent.incorporate_results({}, save_file_name, logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
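# Both training loops in this file follow the same gradient-accumulation
# scheme: the effective batch size is forward_size * gradient_accumulate_step,
# the loss is divided by gradient_accumulate_step so accumulated gradients
# average rather than sum, and the optimizer steps only once per accumulation
# window. A minimal self-contained sketch of the pattern (model, optimizer,
# and batches are stand-ins, not names from this repo):
def gradient_accumulation_sketch(model, optimizer, batches, accumulate_step=4):
    for i, batch in enumerate(batches, start=1):
        loss = model(batch) / accumulate_step  # scale so accumulated grads average
        loss.backward()
        if i % accumulate_step == 0:
            optimizer.step()
            optimizer.zero_grad()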
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'hotpot_v0_cs'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    pos_ratio = 0.2
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # Resample negatives each epoch.
        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                                          copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    # Top-5 and top-10 document selection, plus their
                    # ground-truth-augmented upper bounds.
                    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
                    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top5, dev_list)

                    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
                    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top10, dev_list)

                    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list,
                                                              verbose=False)
                    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
                    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list,
                                                               verbose=False)

                    # top5_doc_f1 = metrics_top5['doc_f1']
                    # top5_UB_sp_f1 = metrics_top5_UB['sp_f1']
                    # top10_doc_f1 = metrics_top10['doc_f1']
                    # top10_UB_sp_f1 = metrics_top10_UB['sp_f1']

                    top5_doc_recall = metrics_top5['doc_recall']
                    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
                    top10_doc_recall = metrics_top10['doc_recall']
                    top10_UB_sp_recall = metrics_top10_UB['sp_recall']

                    logging_item = {
                        'top5': metrics_top5,
                        'top5_UB': metrics_top5_UB,
                        'top10': metrics_top10,
                        'top10_UB': metrics_top10_UB,
                    }
                    # print(logging_item)

                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                        f'|t10_doc_recall({top10_doc_recall})|t10_sp_recall({top10_UB_sp_recall})|seed({seed})'

                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
def init_results_v8(data_list, gt_data_list, terms_based_results, g_score_dict,
                    match_filtering_k=3, term_retrieval_top_k=5, multihop_retrieval_top_k=None):
    # 2019-04-06
    # The complete v7 version of retrieval.
    ner_set = get_title_entity_set()

    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # Load the term-based results.
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_results:
        terms_based_results_dict[item['qid']] = item

    # Load the tf-idf score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords filtered by the functions above (e.g. stopwords)
        else:
            # matched_key_word is the original matched span; we save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords filtered by the functions above (e.g. stopwords)
        else:
            if kw in keyword_processor:
                # If the keyword already exists in the processor, update its dict with more disamb items.
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:
                # Otherwise, add it to keyword_processor_disamb, which has lower priority.
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        valid_query_terms = [term for term in query_terms if term in g_score_dict]

        retrieved_set = RetrievedSet()

        # 1. Add the keyword-matching results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        # 2. Add the term-based (tf-idf) matching results.
        added_count = 0
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0], reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # 3. Add hyperlinked pages. For finding hyperlinked pages we use both
        # the keyword-matching and the disambiguation-group results.
        finded_keys_set = set(retrieved_set.to_id_list())

        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)
        for keyword_group in finded_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(potential_title) \
                        and not filter_document_id(potential_title):  # important bug fix: 'or' to 'and'
                    score = get_query_doc_score(valid_query_terms, potential_title, g_score_dict)
                    retrieved_set.add_item(retrieval_utils.RetrievedItem(potential_title,
                                                                         'kwm_disamb_hlinked'))
                    retrieved_set.score_item(potential_title, score, namespace=keyword_group + '-2-hop')

        # Keep only the top-scoring second-hop pages per source page.
        for keyword_group in finded_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop', top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)

    return doc_pred_dict
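# Condensed sketch of the retrieval flow in init_results_v8 on toy data:
# (1) first-hop pages come from keyword and tf-idf matching; (2) second-hop
# pages are reached through hyperlinks, scored against the query, and kept
# only up to top_k per source page. The names below are illustrative
# stand-ins for the repo's RetrievedSet / get_query_doc_score machinery.
def two_hop_filter_sketch(first_hop_docs, hyperlinks, score_fn, top_k=3):
    # first_hop_docs: iterable of page titles from keyword/tf-idf matching.
    # hyperlinks: dict mapping a title to the titles its first paragraph links to.
    # score_fn: query-vs-document relevance function, e.g. a tf-idf score.
    retrieved = set(first_hop_docs)
    for src in first_hop_docs:
        scored = sorted(((score_fn(t), t) for t in hyperlinks.get(src, [])), reverse=True)
        # Keep only the top_k highest-scoring second-hop pages per source page,
        # mirroring retrieved_set.sort_and_filter(src + '-2-hop', top_k=...).
        retrieved.update(title for _, title in scored[:top_k])
    return retrieved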
print("Results with filtering:") len_list = [] for rset in new_doc_retri_pred_dict['sp_doc'].values(): len_list.append(len(rset)) print("Results with filtering:") print(collections.Counter(len_list).most_common(10000)) print(len(len_list)) print("Mean:\t", np.mean(len_list)) print("Std:\t", np.std(len_list)) print("Max:\t", np.max(len_list)) print("Min:\t", np.min(len_list)) ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list) # analysis old: # doc_results = common.load_json(config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json") # doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True) # # # doc_results = common.load_json(config.RESULT_PATH / "doc_retri_results/doc_retrieval_debug_v7/doc_raw_matching_with_disamb_with_hyperlinked_v7_file_pipeline_top_none_redo_0.json") # # doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True) # # len_list = [] # for rset in doc_results['sp_doc'].values(): # len_list.append(len(rset)) # # print("Results with filtering:") # # print(collections.Counter(len_list).most_common(10000))
def doc_retrie_v5_reimpl_tf_idf_upperbound():
    top_k = 10
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)
    pred_dev = common.load_json(
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_with_hyperlinked_v5_file.json"
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_file.json"
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_withiout_hyperlinked_v5_file.json"
        config.RESULT_PATH / "doc_retri_results/doc_retrieval_debug_v6/doc_raw_matching_with_disamb_withiout_hyperlinked_v6_file_debug_4.json")

    tf_idf_dev_results = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    tf_idf_scored_dict = dict()
    for item in tf_idf_dev_results:
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        qid = item['qid']
        tf_idf_scored_dict[qid] = pred_list

    pred_v5_sp_doc = pred_dev['sp_doc']

    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)

    upperbound_pred_file = dict()
    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()

    # print(dev_fullwiki)

    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']

        tf_idf_docs = tf_idf_scored_dict[qid]
        v5_retrieved_doc = pred_v5_sp_doc[qid]
        # print(v5_retrieved_doc)

        supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        # retrieved_doc_dict = set([context[0] for context in contexts])

        retrieved_doc_dict = dict()
        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()
            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []
        found_answer = False

        for sp_doc in tf_idf_docs:
            if sp_doc in supporting_doc:
                upperbound_pred_doc.append(sp_doc)
                for gt_sp_doc, sp_fact_line_num in supporting_facts:
                    if gt_sp_doc == sp_doc:
                        upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                        # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                        found_answer = True

        for sp_doc in v5_retrieved_doc:
            if sp_doc not in upperbound_pred_doc:
                if sp_doc in supporting_doc:
                    upperbound_pred_doc.append(sp_doc)
                    for gt_sp_doc, sp_fact_line_num in supporting_facts:
                        if gt_sp_doc == sp_doc:
                            upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                            # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                            found_answer = True

        p_answer = answer if found_answer else ""
        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc
        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If we found all the evidence, also credit the yes/no answers.
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)
    print(Counter(len_list).most_common(10000))
    # exit(0)
    # print()
    # print(len(rset))

    # pred_dev = common.load_json(config.RESULT_PATH / "doc_retri_results/toy_doc_rm_stopword_pred_file.json")
    print(len(pred_dev))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    global_score_tracker, metric = ext_hotpot_eval.eval(pred_dev, dev_fullwiki_list)
    print(metric)

    filter_analysis(global_score_tracker, sp_doc_analysis, max_count=25,
                    show_info=['question', 'answer', 'sp_doc', 'supporting_facts',
                               'doc_recall', 'doc_prec', 'type', 'raw_retrieval_set'],
                    additional_item=pred_dev)
    # counter_analysis(global_score_tracker)
    # for key, value in global_score_tracker.items():
def eval_hotpot_procedure(biterator, dev_instances, model, device_num, ema_device_num,
                          dev_list, dev_o_dict, debug_mode, logging_agent,
                          update_step, epoch_i, file_path_prefix, do_ema, ema, seed):
    print("Eval HOTPOT!")
    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5, dev_list)

    # Top_10
    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10, dev_list)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)
    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    top5_doc_recall = metrics_top5['doc_recall']
    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
    top10_doc_recall = metrics_top10['doc_recall']
    top10_UB_sp_recall = metrics_top10_UB['sp_recall']

    logging_item = {
        'step': update_step,
        'epoch': epoch_i,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
        'time': str(datetime.datetime.now()),
    }
    print(logging_item)

    if not debug_mode:
        # Note: the t10 label previously read "t5_sp_recall"; fixed here.
        save_file_name = f'i({update_step})|e({epoch_i})' \
                         f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                         f'|t10_doc_recall({top10_doc_recall})|t10_sp_recall({top10_UB_sp_recall})|seed({seed})'
        logging_agent.incorporate_results({}, save_file_name, logging_item)
        logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))

    if do_ema and ema is not None:
        # Re-run the same evaluation with the EMA (exponential moving average) copy
        # of the model.
        ema_model = ema.get_inference_model()
        master_device_num = ema_device_num
        ema_inference_device_ids = get_ema_gpu_id_list(master_device_num=master_device_num)
        ema_model = ema_model.to(master_device_num)
        ema_model = torch.nn.DataParallel(ema_model, device_ids=ema_inference_device_ids)

        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(ema_model, dev_iter, master_device_num, with_probs=True)

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

        # Top_5
        cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
        upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
            cur_results_dict_top5, dev_list)

        # Top_10
        cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
        upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
            cur_results_dict_top10, dev_list)

        _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
        _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)
        _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
        _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

        top5_doc_recall = metrics_top5['doc_recall']
        top5_UB_sp_recall = metrics_top5_UB['sp_recall']
        top10_doc_recall = metrics_top10['doc_recall']
        top10_UB_sp_recall = metrics_top10_UB['sp_recall']

        logging_item = {
            'label': 'ema',
            'step': update_step,
            'epoch': epoch_i,
            'top5': metrics_top5,
            'top5_UB': metrics_top5_UB,
            'top10': metrics_top10,
            'top10_UB': metrics_top10_UB,
            'time': str(datetime.datetime.now()),
        }
        print(logging_item)

        if not debug_mode:
            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                             f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                             f'|t10_doc_recall({top10_doc_recall})|t10_sp_recall({top10_UB_sp_recall})|seed({seed})'
            logging_agent.incorporate_results({}, save_file_name, logging_item)
            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
            output_model_file = Path(file_path_prefix) / save_file_name
            torch.save(model_to_save.state_dict(), str(output_model_file))
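# The `ema` object used above is constructed elsewhere in the codebase; the sketch
# below shows the standard exponential-moving-average scheme such an object typically
# implements. `SimpleEMA`, its method names, and the decay value are illustrative
# assumptions, not the repo's actual API.
class SimpleEMA:
    def __init__(self, model, decay=0.9999):
        self.decay = decay
        # Shadow copy of every parameter, updated after each optimizer step.
        self.shadow = {name: p.detach().clone() for name, p in model.named_parameters()}

    def update(self, model):
        # shadow <- decay * shadow + (1 - decay) * current weights
        with torch.no_grad():
            for name, p in model.named_parameters():
                self.shadow[name].mul_(self.decay).add_(p.detach(), alpha=1 - self.decay)

    def get_inference_model(self, model):
        # Load the averaged weights into a copy of the model for evaluation.
        # strict=False because self.shadow holds parameters only, not buffers.
        ema_model = copy.deepcopy(model)
        ema_model.load_state_dict(self.shadow, strict=False)
        return ema_model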
def eval_model_for_downstream(model_saved_path):
    seed = 12
    torch.manual_seed(seed)

    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 32
    do_lower_case = True
    debug_mode = False
    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")   # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load datasets.
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(
        bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
        act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))

    # The original code ran the test forward pass and then called exit(0), leaving the
    # dev/train evaluation below unreachable and its inputs commented out. The flag
    # makes that switch explicit: flip it to score dev/train and run the top-k eval.
    eval_test_only = True
    if eval_test_only:
        test_instances = bert_cs_reader.read(test_fitems_list)
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_test_eval_results_list = eval_model(
            model, test_iter, device_num, with_probs=True, show_progress=True)
        common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")
        print("Test write finished.")
        return

    dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instances = bert_cs_reader.read(train_fitems_list)
    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    train_iter = biterator(train_instances, num_epochs=1, shuffle=False)
    cur_dev_eval_results_list = eval_model(
        model, dev_iter, device_num, with_probs=True, show_progress=True)
    cur_train_eval_results_list = eval_model(
        model, train_iter, device_num, with_probs=True, show_progress=True)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3, dev_list)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5, dev_list)

    # Top_10
    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10, dev_list)

    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)
    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)
    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }
    print(logging_item)

    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
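# `select_top_k_and_to_results_dict` is defined elsewhere in the repo; conceptually it
# keeps the k highest-probability paragraphs per question and emits them in the
# prediction format that ext_hotpot_eval.eval expects. A minimal sketch under that
# assumption (the qid -> [(doc_title, prob), ...] input format is illustrative, not
# the repo's actual data layout):
def select_top_k_sketch(scored_paragraphs, top_k=5):
    results_dict = {'sp_doc': dict()}
    for qid, scored in scored_paragraphs.items():
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        results_dict['sp_doc'][qid] = [title for title, _ in ranked[:top_k]]
    return results_dict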
if __name__ == '__main__':
    # Compare two retrieval runs (v2 vs. v3) question by question.
    pred_dev_a = common.load_json(
        config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_with_hyperlinked_v2_file.json")
    pred_dev_b = common.load_json(
        config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_with_hyperlinked_v3_file.json")

    all_ids = pred_dev_a['sp_doc'].keys()
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # Keep both metric dicts instead of overwriting one with the other.
    global_score_tracker_a, metric_a = ext_hotpot_eval.eval(pred_dev_a, dev_fullwiki_list)
    global_score_tracker_b, metric_b = ext_hotpot_eval.eval(pred_dev_b, dev_fullwiki_list)

    print(global_score_tracker_a.keys())

    # Print every question on which the two runs disagree in document recall.
    for key in all_ids:
        scored_item_a = global_score_tracker_a[key]
        scored_item_b = global_score_tracker_b[key]
        if scored_item_a['doc_recall'] != scored_item_b['doc_recall']:
            print(scored_item_a['question'])
            print(scored_item_a['doc_recall'], scored_item_b['doc_recall'])
            print(pred_dev_a['raw_retrieval_set'][key])
            print(pred_dev_b['raw_retrieval_set'][key])
            print(scored_item_a['supporting_facts'])
            print(pred_dev_a['sp_doc'][key])
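# For reference, the per-question 'doc_recall' compared in the __main__ block above can
# be computed directly from a prediction's document list and the gold supporting facts.
# A minimal sketch; the authoritative implementation lives in ext_hotpot_eval:
def doc_recall_sketch(pred_docs, supporting_facts):
    # Gold documents are the first element of each supporting fact: (doc_title, sent_id).
    gold_docs = set(fact[0] for fact in supporting_facts)
    if not gold_docs:
        return 0.0
    return len(gold_docs & set(pred_docs)) / len(gold_docs)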