def get_sentence_pair(top_k, d_list, p_level_results_list, is_training, debug_mode=False):
    """Turn paragraph-level retrieval results into sentence-level forward items.

    :param top_k: number of top-scored paragraphs kept per question.
    :param d_list: HotpotQA-style examples; each item carries an ``'_id'``.
    :param p_level_results_list: paragraph-level scoring results; each item
        carries ``'qid'`` and ``'fid'`` fields.
    :param is_training: forwarded to ``build_sentence_forward_item``.
    :param debug_mode: if True, restrict to the first 100 examples (and the
        matching subset of results).
    :return: forward items produced by ``build_sentence_forward_item``.
    """
    # FIX: this assignment was commented out while `t_db_cursor` was still
    # used below, so the function always crashed with a NameError.
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    dev_list = d_list
    cur_dev_eval_results_list = p_level_results_list

    if debug_mode:
        dev_list = dev_list[:100]
        id_set = set(item['_id'] for item in dev_list)
        # Keep only the results belonging to the truncated example set.
        cur_dev_eval_results_list = [item for item in p_level_results_list
                                     if item['qid'] in id_set]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # Deep-copy before merging so the caller's view of the examples is untouched.
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list,
                                                          copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # filter_value=None: rank purely by top_k, no probability threshold.
    cur_results_dict_top2 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                             top_k=top_k,
                                                             filter_value=None)
    fitems = build_sentence_forward_item(cur_results_dict_top2, dev_list,
                                         is_training=is_training,
                                         db_cursor=t_db_cursor)
    return fitems
def get_paragraph_forward_pair(tag, ruleterm_doc_results, is_training, debug=False, ignore_non_verifiable=False):
    """Build paragraph-level forward items for one FEVER split.

    :param tag: one of 'dev' / 'train' / 'test'; anything else raises ValueError.
    :param ruleterm_doc_results: document retrieval results keyed by 'id'.
    :param is_training: forwarded to the item builder.
    :param debug: if True, truncate both inputs to 100 entries.
    :param ignore_non_verifiable: forwarded to the item builder.
    """
    split_files = {'dev': config.FEVER_DEV, 'train': config.FEVER_TRAIN, 'test': config.FEVER_TEST}
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    doc_results_by_id = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    cursor = fever_db.get_cursor()
    return build_full_wiki_document_forward_item(doc_results_by_id, d_list, is_training,
                                                 cursor, ignore_non_verifiable)
def get_sentences(tag, is_training, debug=False):
    """Load FEVER claims for `tag` plus their merged document-retrieval
    results and build document forward items.

    :param tag: one of 'dev' / 'train' / 'test'; anything else raises ValueError.
    :param is_training: forwarded to the item builder.
    :param debug: if True, only the first 50 claims are used.
    """
    split_files = {'dev': config.FEVER_DEV, 'train': config.FEVER_TRAIN, 'test': config.FEVER_TEST}
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug:
        d_list = d_list[:50]

    doc_results = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl")
    doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id')
    cursor = fever_db.get_cursor(config.FEVER_DB)
    return build_full_wiki_document_forward_item(doc_results_dict, d_list,
                                                 is_training=is_training,
                                                 db_cursor=cursor)
def get_inference_pair(tag, is_training, sent_result_path, debug_num=None, evidence_filtering_threshold=0.01):
    """Attach filtered sentence-level evidence to FEVER claims and build NLI
    forward items.

    :param tag: one of 'dev' / 'train' / 'test'; anything else raises ValueError.
    :param is_training: forwarded to the NLI item builder.
    :param sent_result_path: either a Path to a jsonl file of sentence results
        or an already-loaded list of them.
    :param debug_num: if not None, truncate claims to 50 and pass the value
        through to the jsonl loader.
    :param evidence_filtering_threshold: score cutoff for evidence selection.
    :return: (forward_items, d_list) pair.
    """
    split_files = {'dev': config.FEVER_DEV, 'train': config.FEVER_TRAIN, 'test': config.FEVER_TEST}
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug_num is not None:
        d_list = d_list[:50]

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    # Sentence results may arrive as a file path or as a pre-loaded list.
    if isinstance(sent_result_path, Path):
        sent_list = common.load_jsonl(sent_result_path, debug_num)
    elif isinstance(sent_result_path, list):
        sent_list = sent_result_path
    else:
        raise ValueError(
            f"{sent_result_path} is not of a valid argument type which should be [list, Path].")

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_list, d_dict,
                                                          'oid', 'fid', check=True)
    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict, top_k=5, threshold=evidence_filtering_threshold)
    list_dict_data_tool.append_item_from_dict_to_list(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'predicted_scored_evidence'])

    cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list, is_training=is_training, db_cursor=cursor)
    return forward_items, d_list
def get_nli_pair(tag, is_training, sent_level_results_list, debug=None, sent_top_k=5, sent_filter_value=0.05):
    """Select top-k sentence evidence per claim and build NLI forward items.

    :param tag: one of 'dev' / 'train' / 'test'; anything else raises ValueError.
    :param is_training: forwarded to the NLI item builder.
    :param sent_level_results_list: sentence scoring results carrying 'qid'/'fid'.
    :param debug: truthy -> only the first 100 claims (and matching results).
    :param sent_top_k: number of evidence sentences kept per claim.
    :param sent_filter_value: probability cutoff for evidence sentences.
    :return: (forward_items, d_list) pair.
    """
    split_files = {'dev': config.FEVER_DEV, 'train': config.FEVER_TRAIN, 'test': config.FEVER_TEST}
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug:
        d_list = d_list[:100]

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    if debug:
        # Keep only the results for the truncated claim set.
        wanted_ids = {item['id'] for item in d_list}
        sent_level_results_list = [r for r in sent_level_results_list
                                   if r["qid"] in wanted_ids]

    list_dict_data_tool.append_subfield_from_list_to_dict(
        sent_level_results_list, d_dict, 'qid', 'fid', check=True)
    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict, score_field_name='prob', top_k=sent_top_k,
        filter_value=sent_filter_value, result_field='predicted_evidence')
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'selected_scored_results'])

    cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list, is_training=is_training, db_cursor=cursor)
    return forward_items, d_list
def eval_ensemble():
    """Ensemble three stored NLI label predictions and score them on FEVER dev."""
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    sent_filtering_prob = 0.01
    tag = 'dev'
    top_k = 5

    sent_results = common.load_jsonl(sent_file)
    dev_fitems, dev_list = get_nli_pair(tag, is_training=False,
                                        sent_level_results_list=sent_results,
                                        debug=False, sent_top_k=top_k,
                                        sent_filter_value=sent_filtering_prob)

    # Three checkpoints of the same NLI model; their results are ensembled.
    pred_file_list = [
        config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]
    ensembled = ensemble_nli_results([common.load_jsonl(p) for p in pred_file_list])
    preds_by_oid = list_dict_data_tool.list_to_dict(ensembled, 'oid')

    scored_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(scored_dev_list, preds_by_oid,
                                                      'id', 'predicted_label')

    # Score against the freshly-loaded gold dev set.
    gold_list = common.load_jsonl(config.FEVER_DEV)
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        scored_dev_list, gold_list, mode={'standard': True}, max_evidence=5)
    print({'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1})
def inspect_upstream_eval():
    """Evaluate stored HotpotQA sentence-level dev results at prob filter 0.2.

    FIX: the original read `metrics_v5` whose producing `eval` call was
    commented out, so the function always crashed with a NameError before
    reaching the final print.  The unused v05/v02 scalar extractions are
    removed; the printed output is unchanged.
    """
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
    )
    # Merge per-sentence results into a deep copy so dev_o_dict stays clean.
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        dev_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    # Supporting-fact selection with probability threshold 0.2.
    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, top_k=5, score_field_name='prob',
        filter_value=0.2, result_field='sp')
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        'label': 'ema',
        'v02': metrics_v2,
    }
    print(logging_item)
def eval_hotpot_s():
    """Evaluate stored HotpotQA sentence-level dev results at prob filter 0.5
    and print the supporting-fact metrics."""
    s_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpot_p_level_effects/hotpot_s_level_dev_results_top_k_doc_100.jsonl")
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # Merge results into a deep copy of the id-keyed dev examples.
    merged_dict = copy.deepcopy(list_dict_data_tool.list_to_dict(dev_list, '_id'))
    list_dict_data_tool.append_subfield_from_list_to_dict(
        s_level_results, merged_dict, 'qid', 'fid', check=True)

    results_v05 = select_top_k_and_to_results_dict(
        merged_dict, top_k=5, score_field_name='prob',
        filter_value=0.5, result_field='sp')
    _, metrics_v5 = ext_hotpot_eval.eval(results_v05, dev_list, verbose=False)

    print({'v05': metrics_v5})
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    print(em, pr, rec, f1)
def get_sentence_forward_pair(tag, ruleterm_doc_results, is_training, debug=False,
                              ignore_non_verifiable=False, top_k=5, filter_value=0.005):
    """Build sentence-level forward items for one FEVER split from upstream
    document results.

    :param tag: one of 'dev' / 'train' / 'test'; anything else raises ValueError.
    :param ruleterm_doc_results: upstream results carrying 'qid'/'fid' and 'prob'.
    :param is_training: forwarded to the item builder.
    :param debug: if True, truncate both inputs to 100 entries.
    :param ignore_non_verifiable: forwarded to the item builder.
    :param top_k: number of items kept per claim.
    :param filter_value: probability cutoff for selection.
    """
    split_files = {'dev': config.FEVER_DEV, 'train': config.FEVER_TRAIN, 'test': config.FEVER_TEST}
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    # Merge upstream results into a deep copy of the id-keyed claims.
    merged_dict = copy.deepcopy(list_dict_data_tool.list_to_dict(d_list, 'id'))
    list_dict_data_tool.append_subfield_from_list_to_dict(
        ruleterm_doc_results, merged_dict, 'qid', 'fid', check=True)

    filtered_results = select_top_k_and_to_results_dict(
        merged_dict, score_field_name='prob', top_k=top_k, filter_value=filter_value)

    cursor = fever_db.get_cursor()
    return build_full_wiki_sentence_forward_item(filtered_results, d_list, is_training,
                                                 cursor, ignore_non_verifiable)
def eitems_to_fitems(eitem_list, tokenizer, is_training, max_tokens_for_doc=320,
                     max_query_length=64, doc_stride=128, debug=False):
    """Expand example items into span-prediction forward items.

    :param eitem_list: example items to preprocess.
    :param tokenizer: tokenizer passed through to the preprocessor.
    :param is_training: forwarded to the preprocessor.
    :param max_tokens_for_doc: document token budget per item.
    :param max_query_length: query token budget per item.
    :param doc_stride: sliding-window stride for long documents.
    :param debug: if True, only the first 100 example items are processed.
    :return: (fid-keyed dict of forward items, flat list of forward items).
    """
    source_items = eitem_list[:100] if debug else eitem_list
    fitem_list = []
    for eitem in tqdm(source_items):
        fitem_list.extend(
            preprocssing_span_prediction_item_paired(
                eitem, tokenizer, is_training, max_tokens_for_doc,
                max_query_length, doc_stride))
    fitem_dict = list_dict_data_tool.list_to_dict(fitem_list, 'fid')
    return fitem_dict, fitem_list
def get_open_qa_item_with_upstream_paragraphs(d_list, cur_eval_results_list, is_training,
                                              tokenizer: BertTokenizer, max_context_length,
                                              max_query_length, doc_stride=128, debug_mode=False,
                                              top_k=10, filter_value=0.1, match_type='string'):
    """Build open-domain QA forward items from upstream paragraph results.

    :return: (fid-keyed fitems dict, fitems list, selected paragraph list
        taken from the results dict's 'pred_p_list' entry).
    """
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    if debug_mode:
        d_list = d_list[:100]
        kept_questions = {item['question'] for item in d_list}
        cur_eval_results_list = [r for r in cur_eval_results_list
                                 if r['qid'] in kept_questions]

    # Merge upstream results into a deep copy of the question-keyed examples.
    merged_dict = copy.deepcopy(list_dict_data_tool.list_to_dict(d_list, 'question'))
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, merged_dict, 'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(
        merged_dict, score_field_name='prob', top_k=top_k, filter_value=filter_value)

    example_items = build_open_qa_forword_item(cur_results_dict_top10, d_list,
                                               is_training, t_cursor, match_type)
    example_items = format_convert(example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict_top10['pred_p_list']
def eval_p_level():
    """Evaluate stored HotpotQA paragraph-level dev results at top-5 and print
    the metrics."""
    p_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # Merge results into a deep copy of the id-keyed dev examples.
    merged_dict = copy.deepcopy(list_dict_data_tool.list_to_dict(dev_list, '_id'))
    list_dict_data_tool.append_subfield_from_list_to_dict(
        p_level_results, merged_dict, 'qid', 'fid', check=True)

    top5_results = select_top_k_and_to_results_dict(merged_dict, top_k=5)
    _, metrics_top5 = ext_hotpot_eval.eval(top5_results, dev_list, verbose=False)
    print(metrics_top5)
def get_qa_item_with_upstream_sentence(d_list, sentence_level_results, is_training,
                                       tokenizer: BertTokenizer, max_context_length,
                                       max_query_length, doc_stride=128, debug_mode=False,
                                       top_k=5, filter_value=0.2):
    """Build HotpotQA forward items from upstream sentence-level results.

    :return: (fid-keyed fitems dict, fitems list, selected supporting facts
        taken from the results dict's 'sp' entry).
    """
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    if debug_mode:
        d_list = d_list[:100]
        kept_ids = {item['_id'] for item in d_list}
        sentence_level_results = [r for r in sentence_level_results
                                  if r['qid'] in kept_ids]

    # Merge upstream results into a deep copy of the id-keyed examples.
    merged_dict = copy.deepcopy(list_dict_data_tool.list_to_dict(d_list, '_id'))
    list_dict_data_tool.append_subfield_from_list_to_dict(
        sentence_level_results, merged_dict, 'qid', 'fid', check=True)

    cur_results_dict = select_top_k_and_to_results_dict(
        merged_dict, top_k=top_k, score_field_name='prob',
        filter_value=filter_value, result_field='sp')

    example_items = build_qa_forword_item(cur_results_dict, d_list, is_training, t_db_cursor)
    example_items = format_convert(example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict['sp']
def evidence_adjustment(tag, sent_file, label_file, filter_prob=0.2, top_k=5):
    """Re-score stored NLI label predictions with re-filtered sentence
    evidence and print the FEVER metrics.

    :param tag: data split forwarded to get_nli_pair.
    :param sent_file: jsonl of sentence-level results.
    :param label_file: jsonl of NLI label predictions keyed by 'oid'.
    :param filter_prob: evidence probability cutoff.
    :param top_k: number of evidence sentences kept per claim.
    """
    sent_results = common.load_jsonl(sent_file)
    dev_fitems, dev_list = get_nli_pair(tag, is_training=False,
                                        sent_level_results_list=sent_results,
                                        debug=False, sent_top_k=top_k,
                                        sent_filter_value=filter_prob)

    label_results = common.load_jsonl(label_file)
    labels_by_oid = list_dict_data_tool.list_to_dict(label_results, 'oid')

    scored_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(scored_dev_list, labels_by_oid,
                                                      'id', 'predicted_label')

    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        scored_dev_list, dev_list, mode={'standard': True}, max_evidence=5)
    print({'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1})
def model_perf(dataset_name, task_name, data_file, model_prediction_file):
    """Print a per-model table of divergence and accuracy statistics.

    :param dataset_name: label printed in the table header.
    :param task_name: forwarded to the divergence calculator.
    :param data_file: jsonl of collected examples keyed by 'uid'.
    :param model_prediction_file: json of per-model predictions.
    """
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)

    results_dict, all_correct_set = calculate_divergence_bwt_model_human_simplify(
        collected_data_dict, model_prediction_dict, task_name)

    print('-' * 60)
    print('Data:', dataset_name)
    print("All Correct Count:", len(all_correct_set))
    header = ['{:20s}'.format('Model Name'),
              '{:10s}'.format('JSD'),
              '{:10s}'.format('KL'),
              '{:10s}'.format('Old Acc.'),
              '{:10s}'.format('New Acc.')]
    print('\t'.join(header))
    for model_name, model_item in results_dict.items():
        row = ['{:20s}'.format(model_name),
               '{:10s}'.format(format_number(model_item['average JS div'])),
               '{:10s}'.format(format_number(model_item['average KL div'])),
               '{:10s}'.format(format_number(model_item['o_acc'])),
               '{:10s}'.format(format_number(model_item['m_acc']))]
        print('\t'.join(row))
    print('-' * 60)
def eval_model_for_downstream(model_saved_path):
    """Run a saved FEVER paragraph-level BERT selector over one split, save
    its raw results, and (for dev/train) score document retrieval at several
    probability thresholds.

    :param model_saved_path: path of the saved model state_dict.

    FIXES vs. original:
    - ``raise NotImplemented()`` raised a TypeError (NotImplemented is not an
      exception type); replaced with ``NotImplementedError``.
    - The five copy-pasted threshold-scoring blocks are folded into one loop.
    - Unused locals (batch_size, num_train_optimization_steps, the debug-only
      eval_frequency) removed.
    The printed output is unchanged.
    """
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 64
    do_lower_case = True
    debug_mode = False
    max_l = 264
    num_class = 1
    tag = 'test'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load the split's claims plus its merged document-retrieval results and
    # build paragraph forward items.  Only 'train' uses is_training=True.
    if tag == 'dev':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl")
        dev_list = common.load_jsonl(config.FEVER_DEV)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'dev', dev_ruleterm_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False)
    elif tag == 'train':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl")
        dev_list = common.load_jsonl(config.FEVER_TRAIN)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'train', dev_ruleterm_doc_results, is_training=True,
            debug=debug_mode, ignore_non_verifiable=False)
    elif tag == 'test':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_test.jsonl")
        dev_list = common.load_jsonl(config.FEVER_TEST)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'test', dev_ruleterm_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False)
    else:
        # FIX: was `raise NotImplemented()` which is a TypeError, not an exception.
        raise NotImplementedError(f"Tag:{tag} not supported.")

    # Just to show the information.
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l, element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    dev_instances = bert_cs_reader.read(dev_fitems)
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)
    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True,
                                       with_probs=True, show_progress=True)
    common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl")

    if tag == 'test':
        # No gold labels for the test split; nothing further to score.
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)

    def _score_at(filter_value):
        """Score doc-only retrieval for one probability threshold.

        NOTE(review): append_subfield_from_list_to_dict is re-invoked per
        threshold exactly as in the original code — confirm it is idempotent
        on repeated calls with the same result list.
        """
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
        results_dict = select_top_k_and_to_results_dict(
            copied_dev_o_dict, score_field_name='prob',
            top_k=5, filter_value=filter_value)
        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, results_dict, 'id', 'predicted_docids')
        strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
            copied_dev_d_list, dev_list, max_evidence=5)
        return {'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1}

    # Same threshold order of computation as the original (0.5 first).
    scores = {name: _score_at(fv) for name, fv in [
        ('score_05', 0.5), ('score_02', 0.2), ('score_01', 0.1),
        ('score_001', 0.01), ('score_0005', 0.005)]}

    # Keep the original key order in the printed report.
    logging_item = {
        'score_0005': scores['score_0005'],
        'score_001': scores['score_001'],
        'score_01': scores['score_01'],
        'score_02': scores['score_02'],
        'score_05': scores['score_05'],
    }
    print(json.dumps(logging_item, indent=2))
def eval_model_for_downstream(model_saved_path):
    """Run a saved HotpotQA paragraph-level BERT selector over the test split
    and save the raw results.

    NOTE(review): this function name duplicates the FEVER
    `eval_model_for_downstream` elsewhere in this file — the later definition
    shadows the earlier one at import time.

    :param model_saved_path: path of the saved model state_dict.
    """
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 32
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    debug_mode = False
    # est_datasize = 900_000
    num_class = 1
    # num_train_optimization_steps
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()
    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_test_p_level_unlabeled.jsonl")
    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')
    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    # example_filter drops items with an empty context.
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=286)
    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    # dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instance = bert_cs_reader.read(train_fitems_list)
    test_instances = bert_cs_reader.read(test_fitems_list)
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)
    # train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))
    # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True)
    # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True)
    cur_test_eval_results_list = eval_model(model, test_iter, device_num,
                                            with_probs=True, show_progress=True)
    common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")
    print("Test write finished.")
    exit(0)
    # -----------------------------------------------------------------
    # NOTE(review): everything below is unreachable because of exit(0)
    # above.  If re-enabled, it would also raise NameError:
    # `cur_dev_eval_results_list` and `cur_train_eval_results_list` are
    # only assigned by the commented-out eval_model calls above.
    # -----------------------------------------------------------------
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list,
                                                          copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3,
        dev_list)
    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5,
        dev_list)
    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10,
        dev_list)
    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)
    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)
    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)
    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }
    print(logging_item)
    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
def model_go():
    """Train the HotpotQA paragraph-level content-selection (retrieval) model.

    Fine-tunes a BERT encoder with a sigmoid classification head
    (``BertMultiLayerSeqClassification``) to score candidate paragraphs for
    each question.  Per epoch the negative forward items are re-downsampled
    (``down_sample_neg``), and every ``eval_frequency`` optimizer updates the
    model is evaluated on the dev fullwiki set (top-5 / top-10 selection plus
    their ground-truth upper bounds) and a checkpoint + log entry is written.

    Side effects: creates a run directory via ``save_tool.gen_file_prefix``,
    copies this script into it, writes ``log.json`` and model checkpoints.
    No return value.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'hotpot_v0_cs'
    lazy = False
    # lazy = True
    forward_size = 16          # per-step (micro) batch size actually sent through the model
    # batch_size = 64
    batch_size = 128           # effective batch size via gradient accumulation
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    pos_ratio = 0.2            # target positive/negative ratio for down-sampling
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1              # single logit + sigmoid (binary relevance score)

    # num_train_optimization_steps
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # Pre-built paragraph-level forward items (upstream document retrieval output).
    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # Down-sample once up-front only to estimate the epoch size; the loop
    # re-samples fresh negatives at the start of every epoch.
    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    # Items with an empty context are filtered out before encoding.
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    # Standard BERT weight-decay grouping: no decay on biases / LayerNorm params.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0   # counts forward/backward micro-steps
    update_step = 0        # counts actual optimizer updates

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # Fresh negative sample each epoch so the model sees varied negatives.
        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            # NOTE(review): s1_span / s2_span are read but never used below.
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    # Attach per-paragraph predictions back onto a copy of the
                    # original dev dict before top-k selection.
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    # Top_5
                    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
                    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top5, dev_list)

                    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
                    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top10, dev_list)

                    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
                    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

                    # top5_doc_f1, top5_UB_sp_f1, top10_doc_f1, top10_Ub_sp_f1
                    # top5_doc_f1 = metrics_top5['doc_f1']
                    # top5_UB_sp_f1 = metrics_top5_UB['sp_f1']
                    # top10_doc_f1 = metrics_top10['doc_f1']
                    # top10_Ub_sp_f1 = metrics_top10_UB['sp_f1']

                    top5_doc_recall = metrics_top5['doc_recall']
                    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
                    top10_doc_recall = metrics_top10['doc_recall']
                    top10_Ub_sp_recall = metrics_top10_UB['sp_recall']

                    logging_item = {
                        'top5': metrics_top5,
                        'top5_UB': metrics_top5_UB,
                        'top10': metrics_top10,
                        'top10_UB': metrics_top10_UB,
                    }
                    # print(logging_item)

                    # NOTE(review): the second 't5_sp_recall' tag below actually
                    # carries the top-10 UB value — looks like a copy/paste slip
                    # in the checkpoint file name (cosmetic only).
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                        f'|t10_doc_recall({top10_doc_recall})|t5_sp_recall({top10_Ub_sp_recall})|seed({seed})'

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
random.shuffle(pos_items) random.shuffle(neg_items) neg_sample_count = int(pos_count / ratio) sampled_neg = neg_items[:neg_sample_count] print(f"After Sampling, we have {pos_count}/{len(sampled_neg)} (pos/neg).") sampled_list = sampled_neg + pos_items random.shuffle(sampled_list) return sampled_list if __name__ == '__main__': d_list = common.load_jsonl(config.FEVER_DEV) doc_results = common.load_jsonl( config.RESULT_PATH / "doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl") doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id') fever_db_cursor = fever_db.get_cursor(config.FEVER_DB) forward_items = build_full_wiki_document_forward_item( doc_results_dict, d_list, is_training=False, db_cursor=fever_db_cursor) # print(forward_items) # for item in forward_items: # if item['s_labels'] == 'true': # print(item['query'], item['context'], item['sid'], item['cid'], item['fid'], item['s_labels']) print(len(forward_items)) # down_sample_neg(forward_items, ratio=0.2)
def model_go(sent_filter_value, sent_top_k=5):
    """Train the HotpotQA answer-span (QA) model on top of upstream sentence retrieval.

    Builds QA forward items from sentence-level retrieval results (filtered by
    ``sent_filter_value`` probability and truncated to ``sent_top_k`` sentences),
    then fine-tunes ``BertSpan`` for span prediction, keeping an EMA shadow copy
    of the weights.  Evaluation (every ``eval_frequency`` updates) is done with
    the EMA model only; the non-EMA evaluation path is kept commented out.

    Args:
        sent_filter_value: minimum upstream sentence probability to keep.
        sent_top_k: maximum number of upstream sentences per question.

    Side effects (when not in debug mode): creates a run directory, copies this
    script into it, writes ``log.json`` and EMA checkpoints.  No return value.
    """
    seed = 12
    torch.manual_seed(seed)

    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 32
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_rate = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 1000
    do_lower_case = True

    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128          # sliding-window stride for long contexts
    qa_num_of_layer = 2       # number of layers in the span-prediction head
    do_ema = True
    ema_device_num = 1        # EMA inference runs on this (secondary) GPU
    # s_filter_value = 0.5
    s_filter_value = sent_filter_value
    # s_top_k = 5
    s_top_k = sent_top_k

    experiment_name = f'hotpot_v0_qa_(s_top_k:{s_top_k},s_fv:{s_filter_value},qa_layer:{qa_num_of_layer})'

    print("Potential total length:", max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load Dataset.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    # Upstream sentence-level retrieval outputs (fixed checkpoint paths).
    dev_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
    )
    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
    )

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list, dev_sentence_level_results, is_training=False,
        tokenizer=tokenizer, max_context_length=max_pre_context_length, max_query_length=max_query_length,
        filter_value=s_filter_value, doc_stride=doc_stride, top_k=s_top_k, debug_mode=debug)

    train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
        train_list, train_sentence_level_results, is_training=True,
        tokenizer=tokenizer, max_context_length=max_pre_context_length, max_query_length=max_query_length,
        filter_value=s_filter_value, doc_stride=doc_stride, top_k=s_top_k, debug_mode=debug)

    # print(len(dev_fitem_list))
    # print(len(dev_fitem_dict))

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    if debug:
        dev_list = dev_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitem_list)

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer, lazy=lazy, example_filter=None)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)

    # EMA shadow weights are registered before the model is wrapped/moved.
    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=ema_device_num)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    iterator = BasicIterator(batch_size=batch_size)

    # Standard BERT weight-decay grouping: no decay on biases / LayerNorm params.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    print("Total train instances:", len(train_fitem_list))

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_rate,
                         t_total=num_train_optimization_steps)

    dev_instances = span_pred_reader.read(dev_fitem_list)

    forbackward_step = 0   # counts forward/backward micro-steps
    update_step = 0        # counts actual optimizer updates

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    # Log directory / script snapshot only when not debugging.
    file_path_prefix = None
    if not debug:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        # Re-sample QA items every epoch (sampling inside the builder varies).
        print("Resampling:")
        train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
            train_list, train_sentence_level_results, is_training=True,
            tokenizer=tokenizer, max_context_length=max_pre_context_length, max_query_length=max_query_length,
            filter_value=s_filter_value, doc_stride=doc_stride, top_k=s_top_k, debug_mode=debug)

        random.shuffle(train_fitem_list)
        train_instances = span_pred_reader.read(train_fitem_list)
        train_iter = iterator(train_instances, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter, desc="Batch Loop"):
            model.train()
            batch = allen_util.move_to_device(batch, device_num)
            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            gt_span = batch['gt_span']

            loss = model(mode=BertSpan.ForwardMode.TRAIN,
                         input_ids=paired_sequence,
                         token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         gt_span=gt_span)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                # Update EMA shadow weights right after each optimizer step.
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    # Non-EMA evaluation path kept for reference, disabled.
                    # print("Non-EMA EVAL:")
                    # eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
                    # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
                    #                                           device_num)
                    # cur_results_dict = dict()
                    # cur_results_dict['p_answer'] = cur_eval_dict
                    # cur_results_dict['sp'] = dev_sp_results_dict
                    #
                    # _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
                    #
                    # print(metrics)
                    #
                    # logging_item = {
                    #     'score': metrics,
                    # }
                    #
                    # joint_f1 = metrics['joint_f1']
                    # joint_em = metrics['joint_em']
                    #
                    # print(logging_item)
                    #
                    # if not debug:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #         f'|j_f1({joint_f1})|j_em({joint_em})|seed({seed})'
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        print("EMA EVAL")
                        # Run inference with the EMA weights on the EMA device.
                        ema_model = ema.get_inference_model()
                        ema_inference_device_ids = get_ema_gpu_id_list(master_device_num=ema_device_num)
                        ema_model = ema_model.to(ema_device_num)
                        ema_model = torch.nn.DataParallel(ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_eitem_list, cur_eval_dict = span_eval(ema_model, dev_iter, do_lower_case,
                                                                  dev_fitem_dict, ema_device_num,
                                                                  show_progress=False)
                        cur_results_dict = dict()
                        cur_results_dict['p_answer'] = cur_eval_dict
                        # Supporting facts come from the (fixed) upstream retrieval.
                        cur_results_dict['sp'] = dev_sp_results_dict

                        _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
                        print(metrics)
                        print("---------------" * 3)

                        logging_item = {
                            'label': 'ema',
                            'score': metrics,
                        }

                        joint_f1 = metrics['joint_f1']
                        joint_em = metrics['joint_em']

                        print(logging_item)

                        if not debug:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|j_f1({joint_f1})|j_em({joint_em})|seed({seed})'

                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name, logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    """FEVER document retrieval: exact keyword matching merged with tf-idf results.

    Builds two flashtext ``KeywordProcessor``s over the Wikipedia title-entity
    set (plain titles at high priority, disambiguation titles at lower
    priority), matches each claim against them, then tops the retrieved set up
    with up to ``term_retrieval_top_k`` tf-idf documents.  Doc ids are
    normalized (spaces -> underscores, bracket re-conversion), per-claim result
    counts are printed, results are saved to a jsonl file in the CWD, and the
    oracle doc-level FEVER score is printed.

    Args:
        term_retrieval_top_k: max number of tf-idf docs to add per claim.
        match_filtering_k: keyword-match filtering parameter forwarded to
            ``get_kw_matching_results``.
        tag: 'dev' | 'train' | 'test' dataset split.

    Raises:
        ValueError: on an unsupported ``tag``.
    """
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 3
    # match_filtering_k = 2
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Pre-computed tf-idf retrieval for this split, keyed by claim id.
    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )
    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []
    ner_set = get_title_entity_set()

    # Global term-score table used to validate query n-grams.
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:   # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

        # Only n-grams that exist in the global score table are considered.
        query_terms = get_query_ngrams(query)
        valid_query_terms = [term for term in query_terms if term in g_score_dict]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']

        # Top up with tf-idf docs; scan a few extra candidates (top_k + 3)
        # because some may be filtered out below.
        added_count = 0
        for score, title in sorted(tf_idf_doc_list, key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # We need to modify the existing retrieved document for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modify finished

    # print(r_list[0:10])

    # Stats: distribution of retrieved-doc counts per claim.
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(r_list, f'fever_term_based_retri_results_'
                              f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')

    # 'check_doc_id_correct' scores document-level oracle accuracy only.
    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
def merge_results_with_haonao_module(term_retrieval_top_k=3, match_filtering_k=2, haonan_topk=10,
                                     tag='dev', save=False):
    """Merge term-based FEVER retrieval results with a collaborator's ("haonan") results.

    Loads the saved term-based results for ``tag`` (produced with the given
    ``term_retrieval_top_k`` / ``match_filtering_k``), normalizes their doc ids,
    unions them per claim with the top-``haonan_topk`` docs from the old result
    file (dropping ``List_of_`` pages), scores the merged list (dev/train only),
    optionally saves it, and prints count statistics.

    Args:
        term_retrieval_top_k: top-k used when the term-based file was produced
            (only selects which file to load).
        match_filtering_k: ditto, part of the loaded file name.
        haonan_topk: top-k cutoff applied to the old results via ``item_resorting``.
        tag: 'dev' | 'train' | 'test' dataset split.
        save: when True, write the merged results to the standard merged path.

    Raises:
        ValueError: on an unsupported ``tag``.
    """
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
        task_name = 'shared_task_dev'
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
        task_name = 'train'
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
        task_name = 'shared_task_test'
    else:
        raise ValueError(f"Tag:{tag} not supported.")
    # NOTE(review): task_name is assigned above but never used in this function.

    # r_list = common.load_jsonl(config.RESULT_PATH / f'doc_retri_results/fever_results/standard_term_based_results/'
    #                            f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')
    r_list = common.load_jsonl(
        config.RESULT_PATH / f'doc_retri_results/fever_results/standard_term_based_results/'
        f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )
    old_result_list = common.load_jsonl(
        config.RESULT_PATH / f"doc_retri_results/fever_results/haonans_results/dr_{tag}.jsonl")
    # Truncate the old results to the requested top-k (in place).
    item_resorting(old_result_list, top_k=haonan_topk)
    old_result_dict = list_dict_data_tool.list_to_dict(old_result_list, 'id')

    # Normalize doc ids for naming consistency with the old results.
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
        # item['predicted_docids'] = []

    # NOTE(review): merged_result_list is never appended to — the merge below
    # mutates r_list items directly; this local is dead.
    merged_result_list = []

    for item in tqdm(r_list):
        cur_id = int(item['id'])
        old_retrieval_doc = old_result_dict[cur_id]['predicted_docids']
        new_retrieval_doc = item['predicted_docids']
        # Union of both retrievers' documents for this claim.
        m_predicted_docids = set.union(set(old_retrieval_doc), set(new_retrieval_doc))
        # print(m_predicted_docids)
        m_predicted_docids = [docid for docid in m_predicted_docids
                              if not docid.startswith('List_of_')]
        item['predicted_docids'] = list(m_predicted_docids)
        # print(item['predicted_docids'])

    # Document-level oracle scoring (labels unavailable for 'test').
    mode = {'standard': False, 'check_doc_id_correct': True}
    if tag != 'test':
        fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)

    if save:
        print("Saved to:")
        common.save_jsonl(
            r_list,
            config.RESULT_PATH / f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl")

    # States information.
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))
def eval_trainset_for_train_nli(model_path):
    """Run a trained FEVER sentence-selection model over a split to produce NLI inputs.

    Loads the saved sentence-level model from ``model_path``, scores every
    sentence forward-item for the hard-coded ``tag`` split (currently 'test'),
    and saves the raw per-sentence results to a jsonl file in the CWD.  In
    debug mode a small sanity-check evaluation against the first 50 train
    claims is performed and printed.

    Args:
        model_path: path to the saved ``state_dict`` of the sentence model.
    """
    # NOTE(review): despite the function name, the evaluated split is driven by
    # this hard-coded tag ('test'); rename or parameterize when convenient.
    tag = 'test'
    is_training = False

    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 192
    batch_size = 128

    do_lower_case = True

    debug_mode = False
    # debug_mode = True

    num_class = 1  # single logit + sigmoid (binary relevance score)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_fitems_list = get_sentences(tag, is_training=is_training, debug=debug_mode)
    est_datasize = len(train_fitems_list)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    # Items with an empty context are filtered out before encoding.
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_path))

    print("Estimated training size", est_datasize)
    print("Estimated forward steps:", est_datasize / forward_size)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_instance = bert_cs_reader.read(train_fitems_list)
    train_iter = biterator(train_instance, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, make_int=True,
                                       show_progress=True)

    if debug_mode:
        # Quick sanity check of the scoring pipeline against 50 train claims.
        train_list = common.load_jsonl(config.FEVER_TRAIN)
        train_list = train_list[:50]
        set_gt_nli_label(train_list)
        train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')

        copied_dev_o_dict = copy.deepcopy(train_o_dict)
        copied_dev_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                              'oid', 'fid', check=True)

        # NOTE(review): the message says 0.5 but the threshold actually applied
        # below is 0.1 — confirm which value was intended.
        print("Threshold 0.5:")
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                                  threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list, cur_results_dict_th0_5,
                                                          'id', 'predicted_evidence')

        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_d_list, train_list,
                                                                        mode=mode, max_evidence=5)
        print(strict_score, acc_score, pr, rec, f1)

    common.save_jsonl(cur_eval_results_list, f'{tag}_sent_results_labeled:{is_training}.jsonl')
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
    """Ablation: evaluate a FEVER sentence-level model while varying upstream doc top-k.

    Builds sentence forward-items from fixed paragraph-level retrieval results
    truncated to ``top_k_doc`` documents, scores them with the saved model, and
    (for the hard-coded ``tag``, currently 'dev') computes sentence-only FEVER
    scores and saves both the raw results and a score summary.  The 'train' and
    'test' branches only save raw per-sentence results (their scoring code is
    commented out or absent).

    Args:
        model_saved_path: path to the saved ``state_dict`` of the sentence model.
        top_k_doc: number of upstream documents kept per claim (the ablated knob).
    """
    bert_model_name = 'bert-base-uncased'
    lazy = True
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 128
    # est_datasize = 900_000

    # NOTE(review): only the 'dev' branches below are active; switch this tag
    # to run the train/test paths.
    tag = 'dev'

    num_class = 1  # single logit + sigmoid (binary relevance score)
    # num_train_optimization_steps
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    # Fixed upstream paragraph-level retrieval results (one checkpoint).
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )
    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )
    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl"
    )

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    # Build sentence forward-items only for the selected split; filter_value=0
    # keeps all sentences from the top_k_doc documents.
    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev', dev_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train', train_upstream_doc_results, is_training=True,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test', test_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)
    # Just to show the information
    if debug_mode:
        dev_list = dev_list[:100]
        # NOTE(review): eval_frequency is set here but never read in this function.
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    # Items with an empty context are filtered out before encoding.
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
                                                element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True, with_probs=True,
                                           show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        # Attach per-sentence predictions onto a copy of the original data,
        # select top-5 sentences with prob >= 0.2, then score sentences only.
        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                              'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                  score_field_name='prob',
                                                                  top_k=5, filter_value=0.2,
                                                                  result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_dev_d_list,
                                                                       cur_results_dict_th0_5,
                                                                       'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_dev_d_list, dev_list,
                                                                 max_evidence=5)

        score_05 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print("Top_k doc:", top_k_doc)
        print(score_05)
        common.save_json(
            score_05,
            f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}")

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, test_iter, device_num, make_int=True, with_probs=True,
                                           show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")

        # Test-set scoring kept for reference, disabled (no labels for test).
        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        #
        # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)

    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, train_iter, device_num, make_int=True, with_probs=True,
                                           show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
                                                              'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
                                                                  score_field_name='prob',
                                                                  top_k=5, filter_value=0.5,
                                                                  result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
                                                                       cur_results_dict_th0_5,
                                                                       'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
                                                                 max_evidence=5)
        score_05 = {
            'ss': strict_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }
        print(score_05)
def multitask_model_go():
    """Jointly train one BERT paragraph-selection model on HotpotQA and FEVER.

    Builds a single `BertMultiLayerSeqClassification` scorer and trains it on
    the union of (negatively down-sampled) HotpotQA paragraph items and FEVER
    paragraph items, evaluating both tasks every `eval_frequency` update steps
    via `eval_fever_procedure` / `eval_hotpot_procedure`.  Saves the final
    model (and, when enabled, the EMA-averaged model) under the experiment
    directory.  All inputs come from paths in `config`; nothing is returned.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 64            # per-forward micro-batch size
    # batch_size = 64
    batch_size = 128             # effective batch size via gradient accumulation
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 1
    eval_frequency = 5000        # evaluate every N optimizer update steps
    hotpot_pos_ratio = 0.2       # positive ratio used when down-sampling hotpot negatives
    do_lower_case = True
    max_l = 264                  # max token length for paired sequences

    experiment_name = f'mtr_p_level_(num_train_epochs:{num_train_epochs})'

    debug_mode = False
    do_ema = True
    # est_datasize = 900_000

    num_class = 1  # single sigmoid output: paragraph relevance score
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Hotpot Dataset
    hotpot_train_list = common.load_json(config.TRAIN_FILE)
    hotpot_dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    hotpot_dev_o_dict = list_dict_data_tool.list_to_dict(hotpot_dev_list, '_id')

    # Load Hotpot upstream paragraph forward item
    hotpot_dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    hotpot_train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    # Rename 'doc_t' -> 'element' so the reader's element_fieldname matches.
    hotpot_train_fitems_list = hotpot_sampler_utils.field_name_convert(
        hotpot_train_fitems_list, 'doc_t', 'element')
    hotpot_dev_fitems_list = hotpot_sampler_utils.field_name_convert(
        hotpot_dev_fitems_list, 'doc_t', 'element')

    # Load FEVER Dataset
    # fever_train_list = common.load_json(config.FEVER_TRAIN)
    fever_dev_list = common.load_jsonl(config.FEVER_DEV)
    fever_dev_o_dict = list_dict_data_tool.list_to_dict(fever_dev_list, 'id')

    train_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl")
    dev_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl")

    # NOTE(review): training drops non-verifiable claims, dev keeps them —
    # presumably intentional so dev matches the official FEVER eval; confirm.
    fever_train_fitems_list = fever_p_level_sampler.get_paragraph_forward_pair(
        'train', train_ruleterm_doc_results, is_training=True, debug=debug_mode,
        ignore_non_verifiable=True)
    fever_dev_fitems_list = fever_p_level_sampler.get_paragraph_forward_pair(
        'dev', dev_ruleterm_doc_results, is_training=False, debug=debug_mode,
        ignore_non_verifiable=False)

    if debug_mode:
        hotpot_dev_list = hotpot_dev_list[:10]
        hotpot_dev_fitems_list = hotpot_dev_fitems_list[:296]
        hotpot_train_fitems_list = hotpot_train_fitems_list[:300]
        fever_dev_list = fever_dev_list[:100]
        eval_frequency = 2

    # Down_sample for hotpot.  (This first pass only sizes the epoch; the
    # per-epoch loop below re-samples negatives each epoch.)
    hotpot_sampled_train_list = down_sample_neg(hotpot_train_fitems_list,
                                                ratio=hotpot_pos_ratio)
    hotpot_est_datasize = len(hotpot_sampled_train_list)
    fever_est_datasize = len(fever_train_fitems_list)
    print("Hotpot Train Size:", hotpot_est_datasize)
    print("Fever Train Size:", fever_est_datasize)
    est_datasize = hotpot_est_datasize + fever_est_datasize

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    # Drop items whose retrieved context is empty — nothing to score.
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)
    ema = None
    if do_ema:
        # NOTE(review): EMA shadow weights are pinned to GPU 1 — requires a
        # multi-GPU machine; confirm before running on a single-GPU box.
        ema = EMA(model, model.named_parameters(), device_num=1)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Standard BERT fine-tuning: no weight decay on biases / LayerNorm params.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    hotpot_dev_instances = bert_cs_reader.read(hotpot_dev_fitems_list)
    fever_dev_instances = bert_cs_reader.read(fever_dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0  # counts every backward pass (micro-batches)
    update_step = 0       # counts optimizer steps (after accumulation)

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
                open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        # Fresh negative sample each epoch; FEVER items are used in full.
        hotpot_sampled_train_list = down_sample_neg(hotpot_train_fitems_list,
                                                    ratio=hotpot_pos_ratio)
        all_train_data = hotpot_sampled_train_list + fever_train_fitems_list
        random.shuffle(all_train_data)
        train_instance = bert_cs_reader.read(all_train_data)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # Eval FEVER
                    eval_fever_procedure(biterator, fever_dev_instances, model,
                                         device_num, 1, fever_dev_list,
                                         fever_dev_o_dict, debug_mode,
                                         logging_agent, update_step, epoch_i,
                                         file_path_prefix, do_ema, ema, seed)
                    eval_hotpot_procedure(biterator, hotpot_dev_instances, model,
                                          device_num, 1, hotpot_dev_list,
                                          hotpot_dev_o_dict, debug_mode,
                                          logging_agent, update_step, epoch_i,
                                          file_path_prefix, do_ema, ema, seed)

    if not debug_mode:
        print("Final Saving.")
        save_file_name = f'i({update_step})|e({num_train_epochs})_final_model'
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))

        if do_ema and ema is not None:
            print("Final EMA Saving")
            ema_model = ema.get_inference_model()
            save_file_name = f'i({update_step})|e({num_train_epochs})_final_ema_model'
            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
            output_model_file = Path(file_path_prefix) / save_file_name
            torch.save(model_to_save.state_dict(), str(output_model_file))
def model_go():
    """Train the HotpotQA sentence-level retrieval model.

    Builds sentence-pair forward items from cached paragraph-level (p-level)
    BERT results (top-`document_top_k` paragraphs per question), fine-tunes a
    single-logit `BertMultiLayerSeqClassification` scorer, and periodically
    evaluates supporting-fact selection on dev at probability thresholds 0.5
    and 0.2, checkpointing the raw and EMA models after each eval.

    NOTE(review): another `model_go` is defined later in this file and will
    shadow this one at import time — callers get whichever is defined last.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 2000
    pos_ratio = 0.2  # NOTE(review): declared but unused here (no down-sampling below)
    do_lower_case = True

    document_top_k = 2  # how many upstream paragraphs feed the sentence pairs

    experiment_name = f'hotpot_v0_slevel_retri_(doc_top_k:{document_top_k})'

    debug_mode = False
    do_ema = True
    # est_datasize = 900_000

    num_class = 1  # single sigmoid output: sentence relevance score
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # train_fitems = sentence_level_sampler.get_train_sentence_pair(document_top_k, True, debug_mode)
    # dev_fitems = sentence_level_sampler.get_dev_sentence_pair(document_top_k, False, debug_mode)

    # Load train eval results list
    # (cached p-level scores from a specific earlier experiment run)
    cur_train_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")

    train_fitems = get_sentence_pair(document_top_k, train_list,
                                     cur_train_eval_results_list,
                                     is_training=True, debug_mode=debug_mode)
    dev_fitems = get_sentence_pair(document_top_k, dev_list,
                                   cur_dev_eval_results_list,
                                   is_training=False, debug_mode=debug_mode)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    # Drop items whose retrieved context is empty — nothing to score.
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    ema = None
    if do_ema:
        # NOTE(review): EMA shadow weights pinned to GPU 1 — multi-GPU assumed.
        ema = EMA(model, model.named_parameters(), device_num=1)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Standard BERT fine-tuning: no weight decay on biases / LayerNorm params.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0  # counts every backward pass (micro-batches)
    update_step = 0       # counts optimizer steps (after accumulation)

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
            open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter,
                                                       device_num,
                                                       with_probs=True)

                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list, copied_dev_o_dict,
                        'qid', 'fid', check=True)
                    # 0.5
                    cur_results_dict_v05 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, top_k=5, score_field_name='prob',
                        filter_value=0.5, result_field='sp')
                    cur_results_dict_v02 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, top_k=5, score_field_name='prob',
                        filter_value=0.2, result_field='sp')

                    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                                         dev_list, verbose=False)
                    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                                         dev_list, verbose=False)

                    v02_sp_f1 = metrics_v2['sp_f1']
                    v02_sp_recall = metrics_v2['sp_recall']
                    v02_sp_prec = metrics_v2['sp_prec']

                    v05_sp_f1 = metrics_v5['sp_f1']
                    v05_sp_recall = metrics_v5['sp_recall']
                    v05_sp_prec = metrics_v5['sp_prec']

                    logging_item = {
                        'v02': metrics_v2,
                        'v05': metrics_v5,
                    }

                    print(logging_item)
                    # print(logging_item)

                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                                         f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                                         f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'
                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name,
                                                          logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = Path(file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(),
                                   str(output_model_file))

                    if do_ema and ema is not None:
                        # Repeat the same dev eval with the EMA-averaged weights,
                        # running inference on GPU 1 (DataParallel over the rest).
                        ema_model = ema.get_inference_model()
                        master_device_num = 1
                        ema_inference_device_ids = get_ema_gpu_id_list(
                            master_device_num=master_device_num)
                        ema_model = ema_model.to(master_device_num)
                        ema_model = torch.nn.DataParallel(
                            ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = biterator(dev_instances, num_epochs=1,
                                             shuffle=False)
                        cur_eval_results_list = eval_model(ema_model, dev_iter,
                                                           master_device_num,
                                                           with_probs=True)

                        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                        list_dict_data_tool.append_subfield_from_list_to_dict(
                            cur_eval_results_list, copied_dev_o_dict,
                            'qid', 'fid', check=True)
                        # 0.5
                        cur_results_dict_v05 = select_top_k_and_to_results_dict(
                            copied_dev_o_dict, top_k=5, score_field_name='prob',
                            filter_value=0.5, result_field='sp')
                        cur_results_dict_v02 = select_top_k_and_to_results_dict(
                            copied_dev_o_dict, top_k=5, score_field_name='prob',
                            filter_value=0.2, result_field='sp')

                        _, metrics_v5 = ext_hotpot_eval.eval(
                            cur_results_dict_v05, dev_list, verbose=False)
                        _, metrics_v2 = ext_hotpot_eval.eval(
                            cur_results_dict_v02, dev_list, verbose=False)

                        v02_sp_f1 = metrics_v2['sp_f1']
                        v02_sp_recall = metrics_v2['sp_recall']
                        v02_sp_prec = metrics_v2['sp_prec']

                        v05_sp_f1 = metrics_v5['sp_f1']
                        v05_sp_recall = metrics_v5['sp_recall']
                        v05_sp_prec = metrics_v5['sp_prec']

                        logging_item = {
                            'label': 'ema',
                            'v02': metrics_v2,
                            'v05': metrics_v5,
                        }

                        print(logging_item)

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                             f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                                             f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'
                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name,
                                                              logging_item)
                            logging_agent.logging_to_file(
                                Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(),
                                       str(output_model_file))
def model_go():
    """Train the FEVER paragraph-level (document retrieval) model.

    Builds paragraph forward pairs from rule/term-based document retrieval
    results, fine-tunes a single-logit `BertMultiLayerSeqClassification`
    scorer, and every `eval_frequency` update steps evaluates document
    retrieval on dev at probability thresholds 0.5 and 0.2 (oracle-FEVER
    score via `fever_scorer.fever_doc_only`), checkpointing after each eval.

    Fixes vs. previous revision:
    - `ignore_non_verifiable` was declared (and baked into the experiment
      name) but the call site hard-coded ``True``; the variable is now
      passed through so name and behavior cannot drift apart.
    - `append_subfield_from_list_to_dict` was called a second time on the
      same already-populated `copied_dev_o_dict` before the 0.2-threshold
      selection, appending every scored paragraph twice and polluting the
      top-k ranking with duplicates.  The redundant call is removed; the
      dict already holds the eval results from the first call.

    NOTE(review): another `model_go` is defined earlier in this file; this
    definition shadows it at import time.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 64            # per-forward micro-batch size
    # batch_size = 64
    batch_size = 128             # effective batch size via gradient accumulation
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    do_lower_case = True

    ignore_non_verifiable = True  # drop NOT-ENOUGH-INFO claims from training
    experiment_name = f'fever_v0_plevel_retri_(ignore_non_verifiable:{ignore_non_verifiable})'
    debug_mode = False
    max_l = 264
    # est_datasize = 900_000

    num_class = 1  # single sigmoid output: paragraph relevance score
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl")
    dev_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl")
    # train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_jsonl(config.FEVER_DEV)

    # Dev keeps non-verifiable claims so evaluation matches the official set.
    train_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'train', train_ruleterm_doc_results, is_training=True,
        debug=debug_mode, ignore_non_verifiable=ignore_non_verifiable)
    dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'dev', dev_ruleterm_doc_results, is_training=False,
        debug=debug_mode, ignore_non_verifiable=False)

    # Just to show the information
    fever_p_level_sampler.down_sample_neg(train_fitems, None)
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    # Drop items whose retrieved context is empty — nothing to score.
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Standard BERT fine-tuning: no weight decay on biases / LayerNorm params.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0  # counts every backward pass (micro-batches)
    update_step = 0       # counts optimizer steps (after accumulation)

    logging_agent = save_tool.ScoreLogger({})

    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
                open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter,
                                                       device_num,
                                                       make_int=True,
                                                       with_probs=True)

                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    copied_dev_d_list = copy.deepcopy(dev_list)
                    # Attach the scored paragraphs to each dev question once;
                    # both threshold selections below read from this dict.
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list, copied_dev_o_dict,
                        'qid', 'fid', check=True)

                    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, score_field_name='prob',
                        top_k=5, filter_value=0.5)
                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_5,
                        'id', 'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)

                    score_05 = {
                        'ss': strict_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    # Re-select with the looser 0.2 threshold; 'predicted_docids'
                    # is simply overwritten on the same copied list.
                    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, score_field_name='prob',
                        top_k=5, filter_value=0.2)
                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_2,
                        'id', 'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)

                    score_02 = {
                        'ss': strict_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    logging_item = {
                        'score_02': score_02,
                        'score_05': score_05,
                    }

                    print(logging_item)

                    s02_ss_score = score_02['ss']
                    s05_ss_score = score_05['ss']

                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                                         f'|v02_ofever({s02_ss_score})' \
                                         f'|v05_ofever({s05_ss_score})|seed({seed})'
                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name,
                                                          logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = Path(file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(),
                                   str(output_model_file))
def eval_model_for_downstream_ablation(model_saved_path, doc_top_k=2,
                                       tag='dev'):
    """Run a saved HotpotQA sentence-level model over a split and score dev.

    Loads the checkpoint at `model_saved_path`, builds sentence pairs from the
    top-`doc_top_k` paragraphs of cached p-level results, runs inference,
    saves the raw scores to a jsonl file, and (for ``tag='dev'`` only)
    evaluates supporting-fact metrics at thresholds 0.5 and 0.2 and writes a
    summary json named after the scores.

    WARNING(review): only ``tag='dev'`` currently works.  The construction of
    `train_fitems` / `test_fitems` (and their cached p-level inputs) is
    commented out below, so ``tag='train'`` or ``tag='test'`` will hit a
    NameError in the reader / iterator branches before reaching the
    ``exit(0)`` escape hatch.  Re-enable the commented blocks to use them.

    :param model_saved_path: path to a state_dict produced by the trainer.
    :param doc_top_k: how many upstream paragraphs feed the sentence pairs.
    :param tag: data split; effectively only 'dev' is functional (see above).
    """
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    # forward_size = 256
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1  # single sigmoid output: sentence relevance score
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load train eval results list
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                          "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")

    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list, is_training=True,
    #                                      debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k, dev_list,
                                       cur_dev_eval_results_list,
                                       is_training=False,
                                       debug_mode=debug_mode)
    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list, is_training=False,
    #                                     debug_mode=debug_mode)

    if debug_mode:
        eval_frequency = 2
        # dev_list = dev_list[:10]
        # dev_fitems_list = dev_fitems_list[:296]
        # train_fitems_list = train_fitems_list[:300]

    # print(dev_list[-1]['_id'])
    # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    # Drop items whose retrieved context is empty — nothing to score.
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    #
    if tag == 'train':
        # NOTE(review): `train_fitems` is never built above — this raises NameError.
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        # NOTE(review): `test_fitems` is never built above — this raises NameError.
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model, train_iter,
                                                     device_num,
                                                     with_probs=True,
                                                     show_progress=True)
        common.save_jsonl(cur_train_eval_results_list_out, config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model, dev_iter, device_num,
                                                   with_probs=True,
                                                   show_progress=True)
        common.save_jsonl(cur_dev_eval_results_list_out,
                          f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")
    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model, test_iter,
                                                    device_num,
                                                    with_probs=True,
                                                    show_progress=True)
        common.save_jsonl(cur_test_eval_results_list_out, config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl")

    # Only dev has gold supporting facts to score against; train/test stop here.
    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out, copied_dev_o_dict,
        'qid', 'fid', check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, top_k=5, score_field_name='prob',
        filter_value=0.5, result_field='sp')
    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, top_k=5, score_field_name='prob',
        filter_value=0.2, result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list,
                                         verbose=False)
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list,
                                         verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)

    # Summary file is named after the 0.5-threshold supporting-fact metrics.
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    common.save_json(
        logging_item,
        f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
def model_go_with_old_data():
    """Train a BERT NLI (SUPPORTS/REFUTES/NEI) classifier for FEVER using
    old-format sentence-retrieval results produced by NSMN.

    Side effects: loads sentence results from disk, trains the model,
    periodically evaluates on dev, and writes dev predictions, a score log,
    model checkpoints, and a copy of this script under a generated run
    directory (unless ``debug_mode`` is on).
    """
    # Fix the RNG seed for reproducibility of training.
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v1_nli'
    lazy = False
    # lazy = True
    # `forward_size` is the physical per-step batch; `batch_size` is the
    # effective batch achieved via gradient accumulation.
    forward_size = 16
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 3
    eval_frequency = 2000  # evaluate on dev every this many optimizer updates
    do_lower_case = True
    pair_order = 'cq'  # context-query pair ordering fed to the reader
    # debug_mode = True
    debug_mode = False
    # est_datasize = 900_000
    num_class = 3
    # num_train_optimization_steps
    # Upstream sentence-retrieval filtering thresholds (train is looser).
    train_sent_filtering_prob = 0.35
    dev_sent_filtering_prob = 0.1

    # dev_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json"
    # train_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl"

    # Convert the legacy NSMN sentence-score files to the current format.
    from utest.utest_format_converter_for_old_sent.tool import format_convert
    dev_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/4-15-dev_sent_pred_scores_old_format.jsonl"
    )
    train_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/train_sent_scores_old_format.jsonl"
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    # "hidden" is a placeholder label mapped to index -2 (used for unlabeled
    # examples, e.g. at test time).
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    # train_fitems_list = get_inference_pair('train', True, train_sent_results_file, debug_mode, train_sent_filtering_prob)
    dev_debug_num = 2481 if debug_mode else None
    dev_fitems_list, dev_list = get_inference_pair('dev', False, dev_sent_results_file, dev_debug_num,
                                                   dev_sent_filtering_prob)
    # = common.load_jsonl(config.FEVER_DEV)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    # Build the train pairs once up front only to estimate the dataset size;
    # the loop below rebuilds them each epoch (re-sampling).
    train_debug_num = 2971 if debug_mode else None
    train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file, train_debug_num,
                                              train_sent_filtering_prob)
    est_datasize = len(train_fitems_list)

    # dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=364, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    #
    # Standard BERT fine-tuning weight-decay split: no decay for biases and
    # LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
        num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0  # counts forward/backward passes (micro-batches)
    update_step = 0       # counts actual optimizer updates
    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # Rebuild (re-sample) the training pairs every epoch.
        train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file, train_debug_num,
                                                  train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            # NOTE(review): s1_span / s2_span are read but not used by this
            # model's forward call.
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                # Scale the loss so accumulated gradients average correctly.
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    # Periodic dev evaluation + checkpointing.
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True)
                    results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    copied_dev_list = copy.deepcopy(dev_list)
                    # Attach predicted labels back onto the dev items by id.
                    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, results_dict,
                                                                      'id', 'predicted_label')

                    mode = {'standard': True}
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_fitems_list,
                                                                                    mode=mode, max_evidence=5)
                    logging_item = {
                        'ss': strict_score, 'ac': acc_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    # Checkpoint name encodes step, epoch, all scores and seed.
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                        f'|seed({seed})'

                    common.save_jsonl(copied_dev_list,
                                      Path(file_path_prefix) / f"{save_file_name}_dev_nli_results.json")

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    # Unwrap DataParallel before saving the state dict.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    """Run an ablation evaluation of a trained FEVER NLI model checkpoint.

    Loads upstream sentence-level retrieval results, builds NLI pairs filtered
    by ``filter_value`` (dev threshold) and truncated to ``top_k_sent``
    sentences, restores the model from ``model_path``, and evaluates on the
    split selected by the local ``tag`` variable ('dev' here).

    Side effects: writes raw label results, copied prediction lists, and (for
    dev) a score summary json to the current working directory.

    :param model_path: path to the saved model ``state_dict`` to evaluate.
    :param filter_value: dev sentence-filtering probability threshold.
    :param top_k_sent: number of top retrieved sentences kept per claim.
    """
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False
    maxout_model = False  # if True, use BertPairMaxOutMatcher instead
    num_class = 3

    tag = 'dev'  # which split to evaluate ('dev' or 'test')
    exp = 'no_re_train'  # experiment label embedded in output file names
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Data dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    # Build claim/evidence NLI pairs from the sentence-level results.
    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    # "hidden" is a placeholder label mapped to index -2 (for unlabeled items).
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        # NOTE(review): eval_frequency is assigned here but not read anywhere
        # in this function.
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    # Restore the checkpoint under evaluation.
    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        # Raw per-item label predictions.
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        # Attach predicted labels back onto the dev items by id.
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')
        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        # Official FEVER scoring (strict score, label accuracy, evidence P/R/F1).
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }
        print(logging_item)
        common.save_json(
            logging_item,
            f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl")
    elif tag == 'test':
        # Test split has no gold labels; only predictions are saved (no scoring).
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")
        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')
        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")