def eval_ensemble():
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    dev_sent_filtering_prob = 0.01
    tag = 'dev'
    top_k = 5

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)
    dev_fitems, dev_list = get_nli_pair(tag, is_training=False,
                                        sent_level_results_list=dev_sent_results_list,
                                        debug=False, sent_top_k=top_k,
                                        sent_filter_value=dev_sent_filtering_prob)

    pred_file_list = [
        config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]
    pred_d_list = [common.load_jsonl(file) for file in pred_file_list]
    final_list = ensemble_nli_results(pred_d_list)
    pred_list = final_list

    ema_results_dict = list_dict_data_tool.list_to_dict(pred_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                      'id', 'predicted_label')

    dev_list = common.load_jsonl(config.FEVER_DEV)
    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                    mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score, 'ac': acc_score,
        'pr': pr, 'rec': rec, 'f1': f1,
    }
    print(logging_item)
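# `ensemble_nli_results` is defined elsewhere in this repo; for reference, a
# minimal sketch of one plausible implementation is given below. It assumes
# each prediction item carries an 'oid' key and a per-label probability list
# under 'prob' (both assumptions; the real function may aggregate differently,
# e.g. by majority vote over predicted labels).
def ensemble_nli_results_sketch(pred_d_list, labels=('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO')):
    summed_probs = {}   # oid -> summed probability per label
    base_items = {}     # oid -> one representative prediction item
    for pred_list in pred_d_list:
        for item in pred_list:
            oid = item['oid']
            base_items[oid] = item
            acc = summed_probs.setdefault(oid, [0.0] * len(labels))
            for i, p in enumerate(item['prob']):
                acc[i] += p
    final_list = []
    for oid, probs in summed_probs.items():
        out_item = dict(base_items[oid])
        out_item['predicted_label'] = labels[probs.index(max(probs))]
        final_list.append(out_item)
    return final_list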
def eval_nli():
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # prediction_file = config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_cp_results_th0.2.jsonl"
    # prediction_file = config.PRO_ROOT / "saved_models/04-15-00:15:59_fever_v1_nli/i(18000)|e(2)|ss(0.6154615461546155)|ac(0.6701170117011701)|pr(0.26657540754071885)|rec(0.8852385238523852)|f1(0.40975857963668794)|seed(12)_dev_nli_results.json"
    prediction_file = config.PRO_ROOT / "data/p_fever/non_sent_level/ema_i(32000)|e(4)|ss(0.5592059205920592)|ac(0.6104110411041104)|pr(0.2638851385138135)|rec(0.8928142814281428)|f1(0.4073667130110584)|seed(12)_dev_nli_results.json"
    pred_list = common.load_jsonl(prediction_file)

    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(pred_list, dev_list,
                                                                    mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score, 'ac': acc_score,
        'pr': pr, 'rec': rec, 'f1': f1,
    }
    print(logging_item)

    fever_scorer.fever_confusion_matrix(pred_list, dev_list)
def evidence_adjustment(tag, sent_file, label_file, filter_prob=0.2, top_k=5):
    dev_sent_filtering_prob = filter_prob

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)
    dev_fitems, dev_list = get_nli_pair(tag, is_training=False,
                                        sent_level_results_list=dev_sent_results_list,
                                        debug=False, sent_top_k=top_k,
                                        sent_filter_value=dev_sent_filtering_prob)

    cur_eval_results_list = common.load_jsonl(label_file)
    ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                      'id', 'predicted_label')

    mode = {'standard': True}
    # delete_unused_evidence(copied_dev_list)
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                    mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score, 'ac': acc_score,
        'pr': pr, 'rec': rec, 'f1': f1,
    }
    print(logging_item)
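# Example driver for `evidence_adjustment` (the paths are placeholders, not
# real result files): sweep the sentence filtering threshold to trade evidence
# precision against recall and watch the effect on the strict score.
# for prob in [0.005, 0.01, 0.05, 0.1, 0.2]:
#     evidence_adjustment('dev',
#                         sent_file=config.PRO_ROOT / "data/<sent_level_results>.jsonl",
#                         label_file=config.PRO_ROOT / "data/<nli_label_results>.jsonl",
#                         filter_prob=prob)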
def model_go_with_old_data():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v1_nli'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 3
    eval_frequency = 2000
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    debug_mode = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps

    train_sent_filtering_prob = 0.35
    dev_sent_filtering_prob = 0.1

    # dev_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json"
    # train_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl"
    from utest.utest_format_converter_for_old_sent.tool import format_convert
    dev_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/4-15-dev_sent_pred_scores_old_format.jsonl")
    train_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/train_sent_scores_old_format.jsonl")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    # train_fitems_list = get_inference_pair('train', True, train_sent_results_file, debug_mode, train_sent_filtering_prob)
    dev_debug_num = 2481 if debug_mode else None
    dev_fitems_list, dev_list = get_inference_pair('dev', False, dev_sent_results_file,
                                                   dev_debug_num, dev_sent_filtering_prob)
    # dev_list = common.load_jsonl(config.FEVER_DEV)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    train_debug_num = 2971 if debug_mode else None
    train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file,
                                              train_debug_num, train_sent_filtering_prob)
    est_datasize = len(train_fitems_list)

    # dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=364, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=False)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file,
                                                  train_debug_num, train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num,
                                                       with_probs=True, make_int=True)
                    results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    copied_dev_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, results_dict,
                                                                      'id', 'predicted_label')

                    mode = {'standard': True}
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                        copied_dev_list, dev_list, mode=mode, max_evidence=5)
                    logging_item = {
                        'ss': strict_score, 'ac': acc_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    save_file_name = f'i({update_step})|e({epoch_i})' \
                                     f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                                     f'|seed({seed})'

                    common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                                      f"{save_file_name}_dev_nli_results.json")

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
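# Note on the accumulation arithmetic above: with batch_size = 32 and
# forward_size = 16, gradient_accumulate_step = 2, so gradients from two
# forward/backward passes are summed before each optimizer.step(). The
# effective batch size stays at 32 while peak GPU memory is bounded by a
# 16-example forward pass; the loss is divided by gradient_accumulate_step so
# the accumulated gradient matches a single full-batch update.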
def model_go_pure_aug():
    # for some_params in [0.25, 0.25, 0.25]:
    for some_params in [0.25, 0.25, 0.25]:
        # bert_model_name = 'bert-large-uncased'
        seed = 6
        bert_model_name = 'bert-base-uncased'
        lazy = False
        forward_size = 16
        batch_size = 32
        gradient_accumulate_step = int(batch_size / forward_size)
        warmup_proportion = 0.1
        learning_rate = 5e-5
        num_train_epochs = 3
        do_ema = False
        dev_prob_threshold = 0.1
        train_prob_threshold = 0.35
        debug_mode = False
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata"
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_the_same_gt_mrate({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_on_10p_aug_ratio({some_params})"
        experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_ratio({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_pure_aug"

        data_aug = True
        # data_aug_file = config.FEVER_DATA_ROOT / "qa_aug/squad_train_turker_groundtruth.json"
        # data_aug_size = int(21_015 * some_params)  # 10p
        # data_aug_size = int(208_346 * some_params)

        # training_file = config.FEVER_DATA_ROOT / "fever_1.0/train_10.jsonl"
        training_file = config.FEVER_DATA_ROOT / "fever_1.0/train.jsonl"

        train_sample_top_k = 8

        # est_datasize = 208_346  # full
        # est_datasize = 14_544
        # est_datasize = 21_015 + data_aug_size  # 10p
        aug_size = int(208_346 * some_params)
        est_datasize = 208_346 + aug_size
        # est_datasize = 208_346 + data_aug_size

        num_class = 3
        # num_train_optimization_steps

        torch.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()

        unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
        vocab = ExVocabulary(unk_token_num=unk_token_num)
        vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
        vocab.add_token_to_namespace('REFUTES', namespace='labels')
        vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
        vocab.add_token_to_namespace("hidden", namespace="labels")
        vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
        # Finished build vocabulary.

        # Load standardized sentence file
        dev_upstream_sent_list = common.load_jsonl(
            config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")
        dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_upstream_sent_list,
            prob_threshold=dev_prob_threshold, top_n=5)
        dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_sent_after_threshold_filter,
            None, tokenized=True)
        # print(dev_data_list[0])
        # exit(0)

        train_upstream_sent_list = common.load_jsonl(
            config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/train_sent_scores.jsonl")
        # Finished loading standardized sentence file.
        bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
        bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

        dev_instances = bert_fever_reader.read(dev_data_list)

        biterator = BasicIterator(batch_size=forward_size)
        biterator.index_with(vocab)
        # print(list(mnli_dev_instances))

        # Load training model
        model_clf = BertForSequenceClassification.from_pretrained(bert_model_name,
                                                                  num_labels=num_class)

        ema_tracker = None
        ema_model_copy = None
        if do_ema and ema_tracker is None:
            ema_tracker = EMA(model_clf.named_parameters(), on_cpu=True)
            ema_model_copy = copy.deepcopy(model_clf)

        model_clf.to(device)

        param_optimizer = list(model_clf.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                       num_train_epochs
        print(num_train_optimization_steps)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)
        # optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate)

        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

        model_clf.train()

        if n_gpu > 1:
            model_clf = nn.DataParallel(model_clf)

        forbackward_step = 0
        update_step = 0
        eval_iter_num = 2_000  # Change this to real evaluation.
        best_fever_score = -1

        for n_epoch in range(num_train_epochs):
            print("Resampling...")
            train_sent_after_threshold_filter = \
                fever_ss_sampler.threshold_sampler_insure_unique(training_file,
                                                                 train_upstream_sent_list,
                                                                 train_prob_threshold,
                                                                 top_n=train_sample_top_k)
            train_data_list = fever_nli_sampler.adv_simi_sample_with_prob_v1_1(
                training_file,
                train_sent_after_threshold_filter,
                None,
                tokenized=True)

            aug_d_list = []
            if data_aug:
                aug_d_list = get_sample_data(-1)
                random.shuffle(aug_d_list)
                aug_d_list = aug_d_list[:aug_size]

            train_data_list = train_data_list + aug_d_list
            random.shuffle(train_data_list)
            # train_data_list = get_sample_data(-1)
            print("Sample data length:", len(train_data_list))
            sampled_train_instances = bert_fever_reader.read(train_data_list)

            train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1)

            for i, batch in enumerate(tqdm(train_iter)):
                paired_sequence = batch['paired_sequence']
                paired_segments_ids = batch['paired_segments_ids']
                labels_ids = batch['label']
                att_mask, _ = torch_util.get_length_and_mask(paired_sequence)

                paired_sequence = paired_sequence.to(device)
                paired_segments_ids = paired_segments_ids.to(device)
                labels_ids = labels_ids.to(device)
                att_mask = att_mask.to(device)

                loss = model_clf(paired_sequence, token_type_ids=paired_segments_ids,
                                 attention_mask=att_mask, labels=labels_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if gradient_accumulate_step > 1:
                    loss = loss / gradient_accumulate_step

                loss.backward()
                forbackward_step += 1

                if forbackward_step % gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    update_step += 1

                    if do_ema and ema_tracker is not None:
                        # if model_clf is DataParallel, then we use model_clf.module
                        model_to_track = model_clf.module if hasattr(model_clf, 'module') else model_clf
                        ema_tracker(model_to_track.named_parameters())  # Whenever we do an update, then do the EMA update.

                    if update_step % eval_iter_num == 0:
                        print("Update steps:", update_step)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                        if do_ema and ema_model_copy is not None and ema_tracker is not None:
                            print("EMA evaluation.")
                            EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                            ema_model_copy.to(device)
                            if n_gpu > 1:
                                ema_model_copy = nn.DataParallel(ema_model_copy)
                            dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
                        else:
                            dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

                        eval_mode = {'check_sent_id_correct': True, 'standard': True}
                        fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
                            dev_data_list,
                            common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                            mode=eval_mode,
                            verbose=False)
                        print("Fever Score(FScore/LScore:/Precision/Recall/F1):",
                              fever_score, label_score, pr, rec, f1)
                        print(f"Dev:{fever_score}/{label_score}")

                        if best_fever_score < fever_score:
                            print("New Best FScore")
                            best_fever_score = fever_score

                            save_path = os.path.join(
                                file_path_prefix,
                                f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})')
                            model_to_save = model_clf.module if hasattr(model_clf, 'module') else model_clf
                            output_model_file = os.path.join(file_path_prefix, save_path)
                            torch.save(model_to_save.state_dict(), output_model_file)

            # Epoch-end evaluation: same procedure as the in-loop evaluation above.
            print("Update steps:", update_step)
            dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

            if do_ema and ema_model_copy is not None and ema_tracker is not None:
                print("EMA evaluation.")
                EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                ema_model_copy.to(device)
                if n_gpu > 1:
                    ema_model_copy = nn.DataParallel(ema_model_copy)
                dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
            else:
                dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

            eval_mode = {'check_sent_id_correct': True, 'standard': True}
            fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
                dev_data_list,
                common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                mode=eval_mode,
                verbose=False)
            print("Fever Score(FScore/LScore:/Precision/Recall/F1):",
                  fever_score, label_score, pr, rec, f1)
            print(f"Dev:{fever_score}/{label_score}")

            if best_fever_score < fever_score:
                print("New Best FScore")
                best_fever_score = fever_score

                save_path = os.path.join(
                    file_path_prefix,
                    f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})')
                model_to_save = model_clf.module if hasattr(model_clf, 'module') else model_clf
                output_model_file = os.path.join(file_path_prefix, save_path)
                torch.save(model_to_save.state_dict(), output_model_file)
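# The EMA tracker used above keeps shadow copies of the weights and evaluates
# with them instead of the raw weights. A minimal sketch of the usual
# shadow-weight update rule, assuming a decay hyperparameter (the repo's EMA
# class may differ in signature and in its on_cpu handling):
class SimpleEMASketch:
    def __init__(self, named_parameters, decay=0.9999):
        self.decay = decay
        self.shadow = {n: p.data.clone() for n, p in named_parameters if p.requires_grad}

    def __call__(self, named_parameters):
        # shadow <- decay * shadow + (1 - decay) * current weights
        for n, p in named_parameters:
            if n in self.shadow:
                self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)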
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 3
    # match_filtering_k = 2
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl")
    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []
    ner_set = get_title_entity_set()

    g_score_dict = dict()
    load_from_file(g_score_dict,
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:
                # If not, we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

        query_terms = get_query_ngrams(query)
        valid_query_terms = [term for term in query_terms if term in g_score_dict]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']

        added_count = 0
        for score, title in sorted(tf_idf_doc_list, key=lambda x: x[0], reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')
    # We need to modify the existing retrieved documents for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modify finished

    # print(r_list[0:10])
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(r_list, f'fever_term_based_retri_results_'
                              f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
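# FEVER wiki page ids encode punctuation with special tokens (e.g. '(' as
# -LRB-), which is why the retrieved titles are normalized above. A sketch of
# what `reverse_convert_brc` is assumed to do (the real mapping lives in the
# repo's wiki utilities and may cover more cases):
def reverse_convert_brc_sketch(docid):
    return (docid.replace('(', '-LRB-')
                 .replace(')', '-RRB-')
                 .replace(':', '-COLON-'))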
def model_eval(model_save_path):
    seed = 6
    bert_model_name = 'bert-base-uncased'
    lazy = False
    forward_size = 16
    batch_size = 32
    # dev_prob_threshold = 0.05
    dev_prob_threshold = 0.1

    num_class = 3
    # num_train_optimization_steps

    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
    # Finished build vocabulary.

    # Load standardized sentence file
    # dev_upstream_sent_list = common.load_jsonl(config.RESULT_PATH /
    #     "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json")
    # dev_upstream_sent_list = common.load_jsonl(config.DATA_ROOT /
    #     "utest_data/dev_sent_score_2_shared_task_dev.jsonl")
    #     "utest_data/dev_sent_score_1_shared_task_dev_docnum(10)_ensembled.jsonl")
    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #     "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")
    dev_upstream_sent_list = common.load_jsonl(
        config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/4-15-dev_sent_pred_scores.jsonl")
    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #     "upstream_sentence_selection_Feb16/4-15-test_sent_pred_scores.jsonl")
    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #     "upstream_sentence_selection_Feb16/n_dev_sent_pred_scores.jsonl")

    # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique_new_format(
    dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
        config.FEVER_DEV,
        dev_upstream_sent_list,
        prob_threshold=dev_prob_threshold, top_n=5)

    dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
        config.FEVER_DEV, dev_sent_after_threshold_filter,
        None, tokenized=True)

    # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
    #     config.FEVER_TEST,
    #     dev_upstream_sent_list,
    #     prob_threshold=dev_prob_threshold, top_n=5)
    #
    # dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
    #     config.FEVER_TEST, dev_sent_after_threshold_filter,
    #     None, tokenized=True, pipeline=True)

    for item in dev_data_list:
        item['label'] = 'hidden'

    # Upper-bound check: copy the gold labels in as predictions, so the score
    # below reflects only the quality of the selected evidence.
    dev_list = common.load_jsonl(config.FEVER_DEV)
    for a, b in zip(dev_list, dev_data_list):
        del b['label']
        b['predicted_label'] = a['label']

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list, dev_list,
                                                                     mode=eval_mode,
                                                                     verbose=False)
    print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)
    print(f"Dev:{fever_score}/{label_score}")

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
    bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

    dev_instances = bert_fever_reader.read(dev_data_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)
    # print(list(mnli_dev_instances))

    # Load training model
    model_clf = BertForSequenceClassification.from_pretrained(bert_model_name,
                                                              num_labels=num_class)
    model_clf.load_state_dict(torch.load(model_save_path))
    model_clf.to(device)
    model_clf.eval()

    if n_gpu > 1:
        model_clf = nn.DataParallel(model_clf)

    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    # for item in dev_data_list:
    dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

    common.save_jsonl(
        dev_data_list,
        config.PRO_ROOT / "data/fever/upstream_sentence_selection_Feb16/4-15-dev_nli_results.jsonl")

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list,
                                                                     common.load_jsonl(config.FEVER_DEV),
                                                                     mode=eval_mode,
                                                                     verbose=False)
    print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)
    print(f"Dev:{fever_score}/{label_score}")
def eval_trainset_for_train_nli(model_path):
    tag = 'test'
    is_training = False

    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 192
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    # debug_mode = True

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_fitems_list = get_sentences(tag, is_training=is_training, debug=debug_mode)
    est_datasize = len(train_fitems_list)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_path))

    print("Estimated training size", est_datasize)
    print("Estimated forward steps:", est_datasize / forward_size)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_instance = bert_cs_reader.read(train_fitems_list)
    train_iter = biterator(train_instance, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True,
                                       make_int=True, show_progress=True)

    if debug_mode:
        train_list = common.load_jsonl(config.FEVER_TRAIN)
        train_list = train_list[:50]
        set_gt_nli_label(train_list)
        train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')

        copied_dev_o_dict = copy.deepcopy(train_o_dict)
        copied_dev_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                              copied_dev_o_dict,
                                                              'oid', 'fid', check=True)

        print("Threshold 0.1:")
        cur_results_dict = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                            top_k=5, threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list,
                                                          cur_results_dict,
                                                          'id', 'predicted_evidence')

        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_d_list,
                                                                        train_list,
                                                                        mode=mode,
                                                                        max_evidence=5)
        print(strict_score, acc_score, pr, rec, f1)

    common.save_jsonl(cur_eval_results_list, f'{tag}_sent_results_labeled:{is_training}.jsonl')
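# Sketch of the per-claim selection that `select_top_k_and_to_results_dict`
# is assumed to perform (the field names 'prob' and 'sid' are assumptions):
# keep sentences scoring at or above the threshold, then truncate to the top k.
def select_top_k_sketch(sent_items, top_k=5, threshold=0.1):
    kept = [it for it in sent_items if it['prob'] >= threshold]
    kept.sort(key=lambda it: it['prob'], reverse=True)
    return [it['sid'] for it in kept[:top_k]]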
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v0_cs_ratio_001'
    # lazy = False
    lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 192
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 20000
    pos_ratio = 0.01
    do_lower_case = True

    # debug_mode = True
    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    # train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    set_gt_nli_label(dev_list)

    # dev_fitems_list = common.load_jsonl(
    #     config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    # train_fitems_list = common.load_jsonl(
    #     config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    dev_fitems_list = get_sentences('dev', is_training=False, debug=debug_mode)
    train_fitems_list = get_sentences('train', is_training=True, debug=debug_mode)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1
        # print(dev_list[-1]['_id'])
        # exit(0)

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0,
                                                max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num,
                                                       with_probs=True, make_int=True)

                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    copied_dev_d_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                                          copied_dev_o_dict,
                                                                          'oid', 'fid', check=True)

                    print("Threshold 0.5:")
                    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                              top_k=5, threshold=0.5)
                    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list,
                                                                      cur_results_dict_th0_5,
                                                                      'id', 'predicted_evidence')

                    mode = {'standard': True, 'check_sent_id_correct': True}
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                        copied_dev_d_list, dev_list, mode=mode, max_evidence=5)
                    score_05 = {
                        'ss': strict_score, 'as': acc_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    print("Threshold 0.1:")
                    cur_results_dict_th0_1 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                              top_k=5, threshold=0.1)
                    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list,
                                                                      cur_results_dict_th0_1,
                                                                      'id', 'predicted_evidence')

                    mode = {'standard': True, 'check_sent_id_correct': True}
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                        copied_dev_d_list, dev_list, mode=mode, max_evidence=5)
                    score_01 = {
                        'ss': strict_score, 'as': acc_score,
                        'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    logging_item = {
                        'score_01': score_01,
                        'score_05': score_05,
                    }
                    print(logging_item)

                    s01_ss_score = score_01['ss']
                    s05_ss_score = score_05['ss']
                    # exit(0)
                    # print(logging_item)

                    save_file_name = f'i({update_step})|e({epoch_i})' \
                                     f'|s01({s01_ss_score})|s05({s05_ss_score})' \
                                     f'|seed({seed})'

                    common.save_jsonl(cur_eval_results_list, Path(file_path_prefix) /
                                      f"{save_file_name}_dev_sent_results.json")

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
                    # print(logging_agent.logging_item_list)

        # Epoch eval:
        print("Update steps:", update_step)
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num,
                                           with_probs=True, make_int=True)

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                              copied_dev_o_dict,
                                                              'oid', 'fid', check=True)

        print("Threshold 0.5:")
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                  top_k=5, threshold=0.5)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list,
                                                          cur_results_dict_th0_5,
                                                          'id', 'predicted_evidence')

        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_d_list,
                                                                        dev_list,
                                                                        mode=mode, max_evidence=5)
        score_05 = {
            'ss': strict_score, 'as': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print("Threshold 0.1:")
        cur_results_dict_th0_1 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                  top_k=5, threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_d_list,
                                                          cur_results_dict_th0_1,
                                                          'id', 'predicted_evidence')

        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_d_list,
                                                                        dev_list,
                                                                        mode=mode, max_evidence=5)
        score_01 = {
            'ss': strict_score, 'as': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        logging_item = {
            'score_01': score_01,
            'score_05': score_05,
        }
        print(logging_item)

        s01_ss_score = score_01['ss']
        s05_ss_score = score_05['ss']
        # exit(0)
        # print(logging_item)

        save_file_name = f'i({update_step})|e({epoch_i})' \
                         f'|s01({s01_ss_score})|s05({s05_ss_score})' \
                         f'|seed({seed})'

        common.save_jsonl(cur_eval_results_list, Path(file_path_prefix) /
                          f"{save_file_name}_dev_sent_results.jsonl")

        # print(save_file_name)
        logging_agent.incorporate_results({}, save_file_name, logging_item)
        logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))
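# `down_sample_neg` re-samples the mostly-negative sentence pool once per
# epoch. One plausible reading of ratio=pos_ratio (an assumption; the repo's
# implementation and field names may differ): keep all positives and enough
# random negatives that positives make up roughly `ratio` of the sample.
def down_sample_neg_sketch(fitems, ratio, label_field='s_labels'):
    import random
    pos = [it for it in fitems if it[label_field] == 'true']
    neg = [it for it in fitems if it[label_field] == 'false']
    random.shuffle(neg)
    if ratio > 0:
        neg = neg[:int(len(pos) * (1 - ratio) / ratio)]
    sampled = pos + neg
    random.shuffle(sampled)
    return sampled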
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False
    maxout_model = False

    num_class = 3
    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)

    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list,
                                        debug=debug_mode, sent_top_k=top_k_sent,
                                        sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list,
                                          debug=debug_mode, sent_top_k=top_k_sent,
                                          sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                                num_of_pooling_layer=1, act_type='tanh',
                                                use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class,
                                      act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True,
                                           make_int=True, feed_input_span=maxout_model,
                                           show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')
        common.save_jsonl(copied_dev_list,
                          f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }
        print(logging_item)
        common.save_json(
            logging_item,
            f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True,
                                           make_int=True, feed_input_span=maxout_model,
                                           show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')
        common.save_jsonl(copied_test_list,
                          f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")
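# Hypothetical driver for the ablation (the checkpoint path is a placeholder):
# sweep the dev-side filtering threshold while keeping the trained model fixed,
# which is what the 'no_re_train' tag above refers to.
# for fv in [0.01, 0.05, 0.1, 0.2, 0.35]:
#     model_eval_ablation(config.PRO_ROOT / "saved_models/<nli_checkpoint>", filter_value=fv)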
def model_go(th_filter_prob=0.2, top_k_sent=5):
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    # lazy = True
    forward_size = 32
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1

    # schedule_type = 'warmup_constant'
    # 'warmup_cosine': warmup_cosine,
    # 'warmup_constant': warmup_constant,
    # 'warmup_linear': warmup_linear,
    schedule_type = 'warmup_linear'

    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 4000
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    debug_mode = False
    do_ema = True

    maxout_model = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps

    top_k = top_k_sent

    train_sent_filtering_prob = th_filter_prob
    dev_sent_filtering_prob = th_filter_prob

    experiment_name = f'fever_v2_nli_th{train_sent_filtering_prob}_tk{top_k}'

    # Dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    train_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list,
                                        debug=debug_mode, sent_top_k=top_k_sent,
                                        sent_filter_value=dev_sent_filtering_prob)
    train_fitems, train_list = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list,
                                            debug=debug_mode, sent_top_k=top_k_sent,
                                            sent_filter_value=train_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        train_list = train_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                                num_of_pooling_layer=1, act_type='tanh',
                                                use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class,
                                      act_type="gelu", num_of_out_layers=2)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs
    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)
    print("Do EMA:", do_ema)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps,
                         schedule=schedule_type)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list,
                                            debug=debug_mode, sent_top_k=5,
                                            sent_filter_value=train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            if not maxout_model:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                             attention_mask=att_mask,
                             mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                             labels=labels_ids)
            else:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids,
                             attention_mask=att_mask,
                             s1_span=s1_span, s2_span=s2_span,
                             mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                             labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                    #
                    # cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                    #                                    feed_input_span=maxout_model)
                    #
                    # ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    # copied_dev_list = copy.deepcopy(dev_list)
                    # list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                    #                                                   'id', 'predicted_label')
                    #
                    # mode = {'standard': True}
                    # strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                    #                                                                 mode=mode, max_evidence=5)
                    # logging_item = {
                    #     'ss': strict_score, 'ac': acc_score,
                    #     'pr': pr, 'rec': rec, 'f1': f1,
                    # }
                    #
                    # if not debug_mode:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #                      f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                    #                      f'|seed({seed})'
                    #
                    #     common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                    #                       f"{save_file_name}_dev_nli_results.json")
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        ema_device_num = 0
                        ema_model = ema_model.to(device)
                        ema_model = torch.nn.DataParallel(ema_model)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_ema_eval_results_list = eval_model(ema_model, dev_iter, ema_device_num,
                                                               with_probs=True, make_int=True,
                                                               feed_input_span=maxout_model)

                        ema_results_dict = list_dict_data_tool.list_to_dict(cur_ema_eval_results_list, 'oid')
                        copied_dev_list = copy.deepcopy(dev_list)
                        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                                          'id', 'predicted_label')

                        mode = {'standard': True}
                        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                                        mode=mode, max_evidence=5)
                        ema_logging_item = {
                            'label': 'ema',
                            'ss': strict_score, 'ac': acc_score,
                            'pr': pr, 'rec': rec, 'f1': f1,
                        }

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                             f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                                             f'|seed({seed})'

                            common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                                              f"{save_file_name}_dev_nli_results.json")

                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name, ema_logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
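# Hypothetical entry point (the original module's __main__ block is not shown
# here): train with the defaults that match the saved-run directory names above.
# if __name__ == '__main__':
#     model_go(th_filter_prob=0.2, top_k_sent=5)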