def prune_heads(args, model, eval_dataloader, head_mask):
    """Physically prune attention heads according to ``head_mask`` and compare
    score, parameter count and wall-clock speed against plain masking.

    Follows Michel et al., "Are Sixteen Heads Really Better than One?"
    (http://arxiv.org/abs/1905.10650). Pruning is like masking but the masked
    weights are actually removed from the model.
    """
    # Baseline: evaluate with the mask applied (weights still present).
    t0 = datetime.now()
    _, _, preds, labels = compute_heads_importance(
        args, model, eval_dataloader,
        compute_entropy=False, compute_importance=False, head_mask=head_mask)
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    else:
        preds = np.squeeze(preds)
    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - t0

    original_num_params = sum(p.numel() for p in model.parameters())
    # Map each layer index to the indices of its zeroed-out heads.
    heads_to_prune = {
        layer: (1 - head_mask[layer].long()).nonzero().tolist()
        for layer in range(len(head_mask))
    }
    # Sanity check: number of heads selected for pruning matches the mask.
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    # Re-evaluate on the physically pruned model (no mask needed any more).
    t0 = datetime.now()
    _, _, preds, labels = compute_heads_importance(
        args, model, eval_dataloader,
        compute_entropy=False, compute_importance=False, head_mask=None)
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    else:
        preds = np.squeeze(preds)
    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - t0

    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
                original_num_params, pruned_num_params,
                pruned_num_params / original_num_params * 100)
    logger.info("Pruning: score with masking: %f score with pruning: %f",
                score_masking, score_pruning)
    logger.info("Pruning: speed ratio (new timing / original timing): %f percents",
                original_time / new_time * 100)
def mask_heads(args, model, eval_dataloader):
    """Iteratively zero out (mask) the least-important attention heads until
    the evaluation score falls below ``args.masking_threshold`` times the
    original score, as described in Michel et al.
    (http://arxiv.org/abs/1905.10650).

    Returns the last head mask that still met the threshold; the mask is also
    printed and saved to ``head_mask.npy`` in ``args.output_dir``.
    """
    _, head_importance, preds, labels = compute_heads_importance(
        args, model, eval_dataloader, compute_entropy=False)
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    else:
        preds = np.squeeze(preds)
    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f",
                original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
    # Number of heads masked per iteration, at least one.
    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        head_mask = new_head_mask.clone()  # save current head mask
        # Heads already masked get infinite importance so they sort last and
        # cannot be selected again; remaining heads sort least-important first.
        head_importance[head_mask == 0.0] = float('Inf')
        current_heads_to_mask = head_importance.view(-1).sort()[1]
        if len(current_heads_to_mask) <= num_to_mask:
            break
        # Mask the num_to_mask least-important remaining heads.
        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
        print_2d_tensor(new_head_mask)

        # Recompute importance scores and the metric under the new mask.
        _, head_importance, preds, labels = compute_heads_importance(
            args, model, eval_dataloader, compute_entropy=False,
            head_mask=new_head_mask)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        else:
            preds = np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)",
                    current_score, new_head_mask.sum(),
                    new_head_mask.sum() / new_head_mask.numel() * 100)

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
    np.save(os.path.join(args.output_dir, 'head_mask.npy'),
            head_mask.detach().cpu().numpy())
    return head_mask
def predict(model, eval_datasets, step, args):
    """Run prediction over each eval dataset, log metrics, and append them to
    a per-task results file. Returns the accumulated metrics dict."""
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    eval_output_dir = args.output_dir
    results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("Predicting...")
        logger.info("***** Running predictions *****")
        logger.info(" task name = %s", eval_task)
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.predict_batch_size)
        # DistributedSampler only when running distributed.
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        model.eval()
        all_logits = []
        all_labels = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)
            with torch.no_grad():
                logits = model(input_ids, input_mask, segment_ids)
            cpu_logits = logits.detach().cpu()
            for row, cpu_row in enumerate(cpu_logits):
                all_logits.append(cpu_row.numpy())
                all_labels.append(labels[row])
        pred_logits = np.array(all_logits)
        label_ids = np.array(all_labels)
        if args.output_mode == "classification":
            preds = np.argmax(pred_logits, axis=1)
        else:  # args.output_mode == "regression":
            preds = np.squeeze(pred_logits)
        result = compute_metrics(eval_task, preds, label_ids)
        logger.info(f"task:,{eval_task}")
        logger.info(f"result: {result}")
        results.update(result)
        output_eval_file = os.path.join(eval_output_dir,
                                        "eval_results-%s.txt" % eval_task)
        # Append (not overwrite) so results accumulate across training steps.
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} task {} *****".format(step, eval_task))
            writer.write("step: %d ****\n " % step)
            for key in sorted(results.keys()):
                logger.info("%s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))
    model.train()
    return results
def predict_ens(models, eval_datasets, step, args):
    """Ensemble prediction: average the logits of every model in ``models``
    over each eval dataset, compute metrics per task, and write them via
    ``write_results``. Returns a dict mapping task name -> metrics dict."""
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    eval_output_dir = args.output_dir
    task_results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("Predicting...")
        logger.info("***** Running predictions *****")
        logger.info(" task name = %s", eval_task)
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.predict_batch_size)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        for member in models:
            member.eval()
        logit_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)
            with torch.no_grad():
                # Plain average of every ensemble member's logits.
                member_logits = [m(input_ids, input_mask, segment_ids) for m in models]
                logits = sum(member_logits) / len(member_logits)
            logit_chunks.append(logits.detach().cpu())
            label_chunks.append(labels)
        pred_logits = np.array(torch.cat(logit_chunks), dtype=np.float32)
        label_ids = np.array(torch.cat(label_chunks), dtype=np.int64)
        preds = np.argmax(pred_logits, axis=1)
        results = compute_metrics(eval_task, preds, label_ids)
        logger.info("***** Eval results {} task {} *****".format(step, eval_task))
        for key in sorted(results.keys()):
            logger.info(f"{eval_task} {key} = {results[key]:.5f}")
        task_results[eval_task] = results
    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    write_results(output_eval_file, step, task_results, eval_task_names)
    for member in models:
        member.train()
    return task_results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate the model on every eval task (MNLI gets matched and
    mis-matched) and write per-task metrics to ``eval_results.txt``.
    Returns the accumulated metrics dict."""
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss, nb_eval_steps = 0.0, 0
        preds, out_label_ids = None, None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          # XLM and RoBERTa don't use segment_ids
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def predict(model, eval_datasets, step, args, examples=None, label_list=None):
    """Run prediction on the given datasets and report metrics.

    :param model: model to evaluate (switched to eval mode, restored to train
        mode before returning).
    :param eval_datasets: datasets, one per eval task.
    :param step: training step number, written to the results file.
    :param args: run configuration.
    :param examples: optional raw examples; when given together with
        ``label_list``, a random sample of predictions is printed and all
        predictions are dumped to an Excel file.
    :param label_list: optional list mapping label ids to label names.
    :return: dict of accumulated metric results.
    """
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    eval_output_dir = args.output_dir
    results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("开始预测...")
        logger.info("***** Running predictions *****")
        logger.info(" 任务名称 = %s", eval_task)
        logger.info(" 样本数 = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.predict_batch_size)
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        model.eval()
        # Start timing the whole prediction pass.
        start_time = time.time()
        pred_logits = []
        label_ids = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)
            with torch.no_grad():
                logits = model(input_ids, input_mask, segment_ids)
            cpu_logits = logits.detach().cpu()
            for i in range(len(cpu_logits)):
                pred_logits.append(cpu_logits[i].numpy())
                label_ids.append(labels[i])
        pred_logits = np.array(pred_logits)
        # All ground-truth label ids.
        label_ids = np.array(label_ids)
        if args.output_mode == "classification":
            # All predicted label ids.
            preds = np.argmax(pred_logits, axis=1)
        else:  # args.output_mode == "regression":
            preds = np.squeeze(pred_logits)
        result = compute_metrics(eval_task, preds, label_ids)
        # Print a random sample of predictions for manual inspection.
        if examples and label_list:
            total_examples = len(label_ids)
            # FIX: cap the sample size at the population size — random.sample
            # raises ValueError when asked for more items than exist, which
            # crashed every run with fewer than 100 eval examples.
            num_example = min(100, total_examples)
            print(f"随机打印{num_example}个预测结果")
            display_examples = random.sample(range(total_examples), num_example)
            print("样本 关键字 真实标签 预测标签")
            for exp_idx in display_examples:
                print('%30s %10s %8s %8s' % (examples[exp_idx].text_a,
                                             examples[exp_idx].text_b,
                                             examples[exp_idx].label,
                                             label_list[preds[exp_idx]]))
        if examples and label_list:
            # Dump every prediction to an Excel file.
            data_dict = []
            for idx in range(total_examples):
                data_dict.append({
                    '样本': examples[idx].text_a,
                    '关键字': examples[idx].text_b,
                    '真实标签': examples[idx].label,
                    '预测标签': label_list[preds[idx]]
                })
            import pandas as pd
            df = pd.DataFrame(data_dict)
            writer = pd.ExcelWriter("eval_result.xlsx")
            df.to_excel(writer)
            writer.save()
        logger.info(f"task:,{eval_task}")
        logger.info(f"result: {result}")
        results.update(result)
        cost_time = time.time() - start_time
        logger.info(
            f"--- 评估{len(eval_dataset)}条数据的总耗时是 {cost_time} seconds, 每条耗时 {cost_time/len(eval_dataset)} seconds ---"
        )
        output_eval_file = os.path.join(eval_output_dir,
                                        "eval_results-%s.txt" % eval_task)
        # Append so results accumulate across training steps.
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} task {} *****".format(step, eval_task))
            writer.write("step: %d ****\n " % step)
            for key in sorted(results.keys()):
                logger.info("%s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))
    model.train()
    return results
def evaluate(args, model, tokenizer, data_type="dev", prefix=""):
    """Evaluate ``model`` on ``data_type``.

    For ``data_type == "dev"`` metrics are computed and written to
    ``eval_dev_results.txt``; for any other split the raw predictions are
    written in GLUE submission format (one TSV per task).
    Loops to handle MNLI double evaluation (matched, mis-matched).

    :return: dict of metric results (empty for non-dev runs).
    """
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type=data_type)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running {} evaluation {} *****".format(data_type, prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        # epoch_iterator = tqdm(eval_dataloader, desc="Evaluating")
        epoch_iterator = eval_dataloader
        for batch in epoch_iterator:
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3]
            inputs = {'input_ids': input_ids,
                      'attention_mask': attention_mask,
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': token_type_ids if args.model_type in ['bert', 'xlnet']
                                        and not args.no_segment else None,
                      'labels': labels}
            with torch.no_grad():
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        if data_type == "dev":
            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)
            output_eval_file = os.path.join(eval_output_dir, "eval_{}_results.txt".format(data_type))
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval {} results {} *****".format(data_type, prefix))
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        else:
            def get_glue_task_name(task_name):
                # Map internal task names to the file names the GLUE server expects.
                if task_name == "cola":
                    return "CoLA"
                elif task_name == "sst-2":
                    return "SST-2"
                elif task_name == "mrpc":
                    return "MRPC"
                elif task_name == "sts-b":
                    return "STS-B"
                elif task_name == "qqp":
                    return "QQP"
                elif task_name == "mnli":
                    return "MNLI-m"
                elif task_name == "mnli-mm":
                    return "MNLI-mm"
                elif task_name == "qnli":
                    return "QNLI"
                elif task_name == "rte":
                    return "RTE"
                elif task_name == "wnli":
                    return "WNLI"
                else:
                    raise KeyError(task_name)

            label_list = processors[args.task_name]().get_labels()
            if args.task_name in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            # FIX: header was misspelled "predictioin"; the GLUE submission
            # format requires the column to be named "prediction".
            headers = ["index", "prediction"]
            file_saved = "{}.tsv".format(get_glue_task_name(eval_task))
            output_test_file = os.path.join(eval_output_dir, file_saved)
            with open(output_test_file, "w", encoding='utf-8') as f:
                logger.info("Save {} as GLUE data format".format(file_saved))
                writer = csv.writer(f, delimiter="\t")
                writer.writerow(headers)
                for index, pred in enumerate(preds):
                    if label_list[0] is None:
                        writer.writerow([index, str(pred)])
                    else:
                        writer.writerow([index, label_list[pred]])

        # print evaluation logs
        log_string = "Job_{}:".format(args.job_id)
        log_string += " {}_{}:".format(eval_task, data_type)
        if prefix != "":
            log_string += " step={:<8d}".format(prefix)
        log_string += " {}_loss={:<8.5f}".format(data_type, eval_loss)
        for key in sorted(results.keys()):
            log_string += " {}_{}={:<8.5f}".format(data_type, key, results[key])
        logger.info(log_string)

    return results
def evaluate(args, model, tokenizer, prefix="", test=False):
    """Evaluate the model; with ``test=True`` also write a submission CSV and
    the raw probabilities, and with ``prefix == 'save_eval_result'`` dump the
    per-sentence dev predictions to ``eval_result.csv``."""
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        # test_data selects the test split; behaviourally identical to the
        # original if/else over `test`.
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               evaluate=True, test_data=test)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss, nb_eval_steps = 0.0, 0
        preds, out_label_ids = None, None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    # XLM and RoBERTa don't use segment_ids
                    'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                    'labels': batch[3]
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            # Keep the raw logits around — they are saved as probabilities below.
            preds_prob = copy.deepcopy(preds)
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        logger.info('dev length ' + str(len(preds)) + ' 1 nums ' + str(preds.sum()))

        if test:
            if not args.do_entity:
                test_o = pd.read_csv(
                    '/userhome/project/data_final/Test_Data_Title_processed_anzhaochusai.csv'
                )
                test_save = pd.DataFrame({
                    'id': test_o['id'],
                    'negative': preds,
                    'key_entity': np.nan
                })
            else:
                test_o = pd.read_csv(os.path.join(args.data_dir, 'test.tsv'), sep='\t')
                test_save = pd.DataFrame({
                    'id': test_o['index'],
                    'negative': preds
                })
            test_save.to_csv(os.path.join(eval_output_dir, 'result.csv'), index=False)
            np.save(os.path.join(eval_output_dir, 'test_prob.npy'), preds_prob)

        if prefix == 'save_eval_result':
            eval_o = pd.read_csv(os.path.join(args.data_dir, 'dev.tsv'), sep='\t')
            eval_save = pd.DataFrame({
                'sentence': eval_o['sentence'],
                'negative': preds,
                '0prob': preds_prob[:, 0],
                '1prob': preds_prob[:, 1]
            })
            eval_save.to_csv(os.path.join(eval_output_dir, 'eval_result.csv'), index=False)

        results.update(result)
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s\n", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, prefix="", TEST=False):
    """Evaluate the model; with ``TEST=True`` additionally run the full
    ranking evaluation (Max F1, PR-AUC, MRR, P@1, ...) via ``my_evaluate``
    on the test split and print its report."""
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        if TEST:
            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True, TEST=TEST)
        else:
            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss, nb_eval_steps = 0.0, 0
        preds, out_label_ids = None, None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          # XLM and RoBERTa don't use segment_ids
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            if TEST:
                # Positive-class logits, used as ranking scores by my_evaluate.
                scores = preds[:, 1]
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        if TEST:
            # do the full evaluation of Max F1, PR-AUC and P@1
            DATA_FOLDER = os.path.join("..", "..", "train_data")
            test_file = os.path.join(DATA_FOLDER, "test_shortest_count2_squad_final_train_data_features_with_new_info.tsv")
            test_data = pd.read_csv(test_file, sep='\t', header=None)
            test_bucket_indices = verify_and_generate_bucket_indices(test_data, last_column_index=104)
            (acc, cm, roc_auc, pr_auc, ap, f1_max, p_max, r_max, precision, recall,
             thresholds, MRR, precision_at_1, counter_all_pos, classification_report,
             classification_report_str) = my_evaluate(scores, preds, out_label_ids, test_bucket_indices)
            print("Accuracy:{}".format(acc))
            print("ROC_AUC_SCORE:{}".format(roc_auc))
            print("PR_AUC_score:{}".format(pr_auc))
            print("Average Precision Score:{}".format(ap))
            print("Max F1:{}".format(f1_max))
            print("Precision for max F1:{}".format(p_max))
            print("Recall for max F1:{}".format(r_max))
            print("MRR:{}".format(MRR))
            print("Precision@1:{}".format(precision_at_1))
            print("All Pos. Counter:\n{}".format(counter_all_pos))
            print("CM:\n{}".format(cm))
            print("Classification report:\n{}".format(classification_report_str))
            print("\n\n\n\n")

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def train(args, train_dataset, model, tokenizer):
    """Train the model.

    Checkpoints every ``args.save_steps`` optimizer steps, logs metrics every
    ``args.logging_steps`` steps, and (optionally) saves the best checkpoint
    after training when ``args.evaluate_after_training`` is set.

    :return: (global_step, average training loss per optimizer step).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # warmup_steps >= 1 is an absolute step count; a value < 1 is interpreted
    # as a fraction of t_total.
    warmup_steps = args.warmup_steps if args.warmup_steps >= 1 else int(t_total * args.warmup_steps)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    # train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    first_time = time.time()
    best_result = 0.0
    for idx_epoch in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        epoch_iterator = train_dataloader
        preds = None
        out_label_ids = None
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3]
            inputs = {'input_ids': input_ids,
                      'attention_mask': attention_mask,
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': token_type_ids if args.model_type in ['bert', 'xlnet']
                                        and not args.no_segment else None,
                      'labels': labels}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            # Accumulate train predictions/labels for periodic train-metric logging.
            if preds is None:
                preds = outputs[1].detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, outputs[1].detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    # current loss
                    cur_loss = (tr_loss - logging_loss) / args.logging_steps
                    tb_writer.add_scalar('loss', cur_loss, global_step)
                    logging_loss = tr_loss
                    # print log
                    log_string = "Job_{}:".format(args.job_id)
                    log_string += " epoch={:<3d}".format(idx_epoch)
                    log_string += " step={:<8d}".format(global_step)
                    log_string += " batch={:<4d}".format(labels.shape[0])
                    log_string += " lr={:<10.7f}".format(scheduler.get_lr()[0])
                    log_string += " train_loss={:<8.5f}".format(cur_loss)
                    log_string += " |g|={:<10.7f}".format(total_norm)
                    # calculate accuracy
                    if args.output_mode == "classification":
                        preds = np.argmax(preds, axis=1)
                    elif args.output_mode == "regression":
                        preds = np.squeeze(preds)
                    result = compute_metrics(args.task_name, preds, out_label_ids)
                    for key in sorted(result.keys()):
                        log_string += " {}_{}={:<8.5f}".format("train", key, result[key])
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    logger.info(log_string)
                    preds = None
                    out_label_ids = None

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                # FIX: the original called epoch_iterator.close(), but with the
                # tqdm wrapper commented out epoch_iterator is a plain
                # DataLoader, which has no close() — that line raised
                # AttributeError the moment max_steps was reached.
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            # FIX: likewise train_iterator is a plain range (trange commented
            # out) and has no close(); a bare break is sufficient.
            break

    if args.local_rank == -1 and not args.evaluate_during_training and args.evaluate_after_training:
        # Only evaluate when single GPU otherwise metrics may not average well
        results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step)
        metrics = result_for_sorting(args.task_name, results)
        if metrics >= best_result:
            best_result = metrics
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, 'best')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer):
    """Train `model` on `train_dataset`.

    Sets up an AdamW optimizer with linear warmup, optional fp16 (apex) and
    multi-GPU DataParallel, then runs the epoch/step loop with gradient
    accumulation, TensorBoard logging, periodic and per-epoch checkpointing,
    and per-epoch training metrics.

    Returns:
        (global_step, average training loss per step, dict of per-epoch
        training metrics).
    """
    tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs'))
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    if args.max_steps > 0:
        # Fixed optimization-step budget: derive the epoch count needed.
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Bias and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    #train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    #set_seed(args)  # Added here for reproductibility keeping the seed the same...
    # TODO(robinjia): does calling set_seed a second time matter?
    train_results = {}
    for epoch in range(int(args.num_train_epochs)):
        preds = None
        out_label_ids = None
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
                if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]  # model outputs are always tuple in pytorch-transformers (see doc)
            # Accumulate logits/labels so per-epoch training metrics can be
            # computed after the epoch loop.
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        raise NotImplementedError
                        # TODO: make evaluation happen below
                        #results = evaluate(args, model, tokenizer)
                        #for key, value in results.items():
                        #    tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.save_every_epoch:
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-epoch{}'.format(epoch))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
        # Per-epoch training metrics over everything accumulated this epoch.
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        results = compute_metrics(args.task_name, preds, out_label_ids)
        train_results[epoch] = results
        print("Train results: ", train_results)
        if args.max_steps > 0 and global_step > args.max_steps:
            # BUG FIX: the original called train_iterator.close() here, but
            # train_iterator (the trange epoch iterator) is commented out
            # above, so reaching max_steps raised a NameError. Breaking out
            # of the plain range() epoch loop needs no close().
            break
    tb_writer.close()
    #TODO, hacky but saves more significant restructuring...
    args.train_results = train_results
    return global_step, tr_loss / global_step, train_results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate `model` on the task's eval set and write results to disk.

    Handles MNLI's double evaluation (matched / mismatched) by looping over
    two task names. Supports the "classification", "regression" and
    "multi-classification" output modes; the special task name "multilabel"
    additionally accumulates per-class TP/TN/FP/FN counts via
    metrics_with_thresh and derives per-class precision/recall/F1.

    Returns:
        dict of metric name -> value, merged over all evaluated tasks.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        eval_accuracy = 0  # only accumulated on the "multilabel" path below
        nb_eval_examples = 0
        preds = None
        all_out_label_ids = None
        n_classes = GLUE_TASKS_NUM_LABELS[args.task_name]
        print(args.task_name)
        print(n_classes)
        # Per-class confusion-matrix accumulators (filled only for "multilabel").
        all_true_pos = [0.0 for c in range(n_classes)]
        all_true_neg = [0.0 for c in range(n_classes)]
        all_false_pos = [0.0 for c in range(n_classes)]
        all_false_neg = [0.0 for c in range(n_classes)]
        all_precision = [0.0 for c in range(n_classes)]
        all_recall = [0.0 for c in range(n_classes)]
        all_f1 = [0.0 for c in range(n_classes)]
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            nb_eval_examples += inputs['input_ids'].size(0)
            print(nb_eval_examples)
            out_label_ids = inputs['labels']
            print("out_label_ids")
            print(out_label_ids)
            if(args.task_name == "multilabel"):
                # metrics_with_thresh returns batch accuracy plus per-class
                # TP/TN/FP/FN lists — presumably by thresholding the logits;
                # verify against its definition.
                metric_results = metrics_with_thresh(logits, out_label_ids)
                eval_accuracy += metric_results['accuracy']
                all_true_pos = [all_true_pos[c] + metric_results['true_pos'][c] for c in range(n_classes)]
                all_true_neg = [all_true_neg[c] + metric_results['true_neg'][c] for c in range(n_classes)]
                all_false_pos = [all_false_pos[c] + metric_results['false_pos'][c] for c in range(n_classes)]
                all_false_neg = [all_false_neg[c] + metric_results['false_neg'][c] for c in range(n_classes)]
            if all_out_label_ids is None:
                all_out_label_ids = out_label_ids.detach().cpu().numpy()
            else:
                all_out_label_ids = np.append(all_out_label_ids, out_label_ids.detach().cpu().numpy(), axis=0)
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        if(args.task_name == "multilabel"):
            print(all_true_pos)
            print(all_true_neg)
            print(all_false_pos)
            print(all_false_neg)
            # Per-class precision / recall / F1 from the accumulated counts.
            # NOTE(review): undefined ratios (0/0) are reported as Inf rather
            # than the more common 0 — confirm downstream consumers expect this.
            for c in range(n_classes):
                try:
                    all_precision[c] = all_true_pos[c] / (all_true_pos[c] + all_false_pos[c])
                except ZeroDivisionError:
                    all_precision[c] = float("Inf")
                try:
                    all_recall[c] = all_true_pos[c] / (all_true_pos[c] + all_false_neg[c])
                except ZeroDivisionError:
                    all_recall[c] = float("Inf")
                try:
                    all_f1[c] = 2 * all_precision[c] * all_recall[c] / (all_precision[c] + all_recall[c])
                except ZeroDivisionError:
                    all_f1[c] = float("Inf")
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps  # if it was divided by nb_eval_examples, calculated accuracy would be 1/batch_size of real accuracy
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(eval_task, preds, all_out_label_ids)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
            result = compute_metrics(eval_task, preds, all_out_label_ids)
        elif args.output_mode == "multi-classification":
            # ROC-AUC calcualation
            # Compute ROC curve and ROC area for each class
            print(len(all_out_label_ids[0]))
            if len(all_out_label_ids[0]) > 1:
                fpr = dict()
                tpr = dict()
                roc_auc = dict()
                # NOTE(review): this iterates over the label width of the
                # *last* batch's inputs — assumes every batch has the same
                # label width as all_out_label_ids; confirm.
                for i in range(len(inputs['labels'][0])):
                    print(len(inputs['labels']))
                    print(i)
                    fpr[i], tpr[i], _ = roc_curve(all_out_label_ids[:, i], preds[:, i])
                    roc_auc[i] = auc(fpr[i], tpr[i])
                # Compute micro-average ROC curve and ROC area
                fpr["micro"], tpr["micro"], _ = roc_curve(all_out_label_ids.ravel(), preds.ravel())
                roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
                result = {'eval_loss': eval_loss,
                          'eval_accuracy': eval_accuracy,
                          'roc_auc': roc_auc,
                          'precision': all_precision,
                          'recall': all_recall,
                          'f1': all_f1}
            else:
                # Single-column labels: ROC-AUC is skipped.
                result = {'eval_loss': eval_loss,
                          'eval_accuracy': eval_accuracy,
                          'precision': all_precision,
                          'recall': all_recall,
                          'f1': all_f1}
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def query(self, examples, batch_size, do_evaluate=True, return_logits=False, do_recover=True, use_tqdm=True):
    """Run the model over `examples` and return predictions.

    Args:
        examples: list of example objects to score.
        batch_size: evaluation batch size.
        do_evaluate: if True, also compute metrics and write an eval file.
        return_logits: if True, return raw prediction scores instead of labels.
        do_recover: if True, pass each example through self.recoverer first.
        use_tqdm: wrap the dataloader in a tqdm progress bar.

    Returns:
        The raw prediction array when `return_logits` is True; otherwise
        decoded label strings (classification) or the squeezed prediction
        array (regression).
    """
    if do_recover:
        examples = [self.recoverer.recover_example(x) for x in examples]
    dataset = self._prep_examples(examples)
    eval_sampler = SequentialSampler(dataset)  # Makes sure order is correct
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    # Eval!
    logger.info("***** Querying model *****")
    logger.info(" Num examples = %d", len(examples))
    logger.info(" Batch size = %d", batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    example_idxs = None
    self.model.eval()
    if use_tqdm:
        eval_dataloader = tqdm(eval_dataloader, desc="Querying")
    for batch in eval_dataloader:
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
                if self.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                'labels': batch[3]
            }
            outputs = self.model(**inputs)
            inputs['example_idxs'] = batch[4]
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
            example_idxs = inputs['example_idxs'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids,
                inputs['labels'].detach().cpu().numpy(),
                axis=0)
            example_idxs = np.append(
                example_idxs,
                inputs['example_idxs'].detach().cpu().numpy(),
                axis=0)
    eval_loss = eval_loss / nb_eval_steps
    logger.info(' eval_loss = %.6f', eval_loss)
    incorrect_example_indices = None
    if self.output_mode == "classification":
        pred_argmax = np.argmax(preds, axis=1)
        pred_labels = [
            self.label_list[pred_argmax[i]] for i in range(len(examples))
        ]
        incorrect_example_indices = set(example_idxs[np.not_equal(
            pred_argmax, out_label_ids)])
        metric_preds = pred_argmax
    elif self.output_mode == "regression":
        # BUG FIX: the original referenced pred_argmax / pred_labels on this
        # path even though they are only bound under "classification",
        # raising a NameError for regression tasks. Use the squeezed
        # predictions directly for both metrics and the returned labels.
        preds = np.squeeze(preds)
        pred_labels = preds
        metric_preds = preds
    if do_evaluate:
        result = compute_metrics(self.task_name, metric_preds, out_label_ids)
        output_eval_file = os.path.join(
            self.output_dir, "eval-{}.txt".format(self.task_name))
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if return_logits:
        return preds
    else:
        return pred_labels
def evaluate(args, model, tokenizer, prefix="", dev_evaluate=False):
    """Evaluate `model` on every task in args.eval_task_names.

    Builds the model inputs according to the active debiasing configuration
    (hypothesis-only / focal loss / product-of-experts, or HANS with its
    heuristic features), computes accuracy per task — optionally binarized
    for two-class datasets — and can dump predictions in Kaggle or numpy
    format.

    Returns:
        (results, preds): dict of per-task accuracies (plus per-class
        accuracies for HANS tasks) and the raw logits of the last
        evaluated task.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    # NOTE(review): the guard tests args.task_name but the append mutates
    # args.eval_task_names, so calling evaluate() more than once could
    # append "mnli-mm" repeatedly — confirm callers invoke this only once.
    if "mnli" in args.task_name and "mnli-mm" not in args.task_name:
        args.eval_task_names.append("mnli-mm")
    results = {}
    for eval_task in args.eval_task_names:  # eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, eval_labels, num_classes = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True,
                                                                         dev_evaluate=dev_evaluate)
        print("num_classes ", num_classes, "eval_labels ", eval_labels)
        print(eval_dataset)
        args.eval_batch_size = args.per_gpu_eval_batch_size
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                # Input layout depends on the debiasing configuration:
                # hypothesis-only / focal / PoE carry hypothesis ids; HANS
                # additionally carries premise ids and heuristic features.
                if args.hypothesis_only or args.focal_loss or args.poe_loss:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                        'labels': batch[3],
                        'h_ids': batch[4],
                        'h_attention_mask': batch[5]
                    }
                elif args.hans_only:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                        'labels': batch[3],
                        'h_ids': batch[4],
                        'h_attention_mask': batch[5],
                        'p_ids': batch[6],
                        'p_attention_mask': batch[7],
                        'have_overlap': batch[8],
                        'overlap_rate': batch[9],
                        'subsequence': batch[10],
                        'constituent': batch[11]
                    }
                else:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                        'labels': batch[3]
                    }
                # The model returns a dict; the "bert" entry's first two
                # elements are unpacked as (loss, logits).
                outputs = model(**inputs)["bert"]
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        max_preds = np.argmax(preds, axis=1)
        # convert 1,2 labels to 1 in case of binary dataset.
        if num_classes == 2 and args.binerize_eval:
            max_preds = binarize_preds(max_preds)
            out_label_ids = binarize_preds(out_label_ids)
        # Map the task to the metric family used by compute_metrics.
        if eval_task in nli_task_names:
            eval_task_metric = "nli"
        elif eval_task.startswith("fever"):
            eval_task_metric = "fever"
        elif eval_task.startswith("HANS"):
            eval_task_metric = "hans"
        else:
            eval_task_metric = eval_task
        result = compute_metrics(eval_task_metric, max_preds, out_label_ids)
        if args.save_labels_file is not None:
            save_labels_file = args.save_labels_file + "_" + eval_task
            if args.output_label_format == "kaggle":
                write_in_kaggle_format(args, max_preds, eval_labels, save_labels_file, eval_task)
            elif args.output_label_format == "numpy":
                write_in_numpy_format(args, preds, save_labels_file)
        results[eval_task] = result["acc"]
        # HANS tasks additionally report per-class accuracy.
        if eval_task.startswith("HANS"):
            results[eval_task + "_not-entailment"] = result["acc_0"]
            results[eval_task + "_entailment"] = result["acc_1"]
        print("results is ", result, " eval_task ", eval_task)
    return results, preds
def evaluate(args, model, tokenizer, epoch=0, is_test=False):
    """Evaluate a multi-task model on every task in args.task_name.

    For each task, runs the dev (or test) split sequentially, computes
    metrics, appends them to a per-task results file, and dumps
    correct/wrong example indices plus logits to a JSON file for error
    analysis.

    Args:
        args: experiment configuration namespace.
        model: multi-task model accepting a `task_id` keyword argument.
        tokenizer: tokenizer passed through to load_and_cache_examples.
        epoch: current epoch number, used only in log and file text.
        is_test: evaluate on the test split instead of dev.

    Returns:
        dict of metric name -> value, merged across all evaluated tasks.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = args.task_name
    eval_output_dir = args.output_dir
    set_type = 'test' if is_test else 'dev'

    results = {}
    for task_id, eval_task in enumerate(eval_task_names):
        # Skip tasks that do not provide a test split when testing.
        if is_test and not hasattr(processors[eval_task], 'get_test_examples'):
            continue
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               set_type)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info(
            "***** Running evaluation for {} on {} for epoch {} *****".format(
                eval_task, set_type, epoch))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logits_all = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],  # XLM don't use segment_ids
                    'labels': batch[3],
                    'task_id': task_id  # selects the task-specific head
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if logits_all is None:
                logits_all = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                logits_all = np.append(logits_all,
                                       logits.detach().cpu().numpy(),
                                       axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        output_mode = output_modes[eval_task]
        if output_mode in ["classification", "multi-choice"]:
            preds = np.argmax(logits_all, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(logits_all)
        result = compute_metrics(eval_task, preds, out_label_ids.reshape(-1))
        results.update(result)

        output_eval_file = os.path.join(
            eval_output_dir,
            "eval_results_{}_{}.txt".format(eval_task, set_type))
        # Append (not overwrite) so results accumulate across epochs.
        with open(output_eval_file, "a") as writer:
            logger.info(
                "***** Eval results for {} on {} for epoch {} *****".format(
                    eval_task, set_type, epoch))
            writer.write(
                "***** Eval results for epoch {} *****\n".format(epoch))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            logger.info("\n")

        # get error idx for offline error analysis
        correct_idx = np.argwhere(preds == out_label_ids).tolist()
        wrong_idx = np.argwhere(preds != out_label_ids).tolist()
        wrong_idx_dict = {
            'correct': correct_idx,
            'wrong': wrong_idx,
            'preds': preds.tolist(),
            'logits': logits_all.tolist(),
            'labels': out_label_ids.tolist()
        }
        # BUG FIX: the original passed a bare open(..., 'w') to json.dump,
        # leaking the file handle; a context manager guarantees it is closed.
        error_idx_file = os.path.join(
            eval_output_dir,
            "error_idx_{}_{}.json".format(eval_task, set_type))
        with open(error_idx_file, 'w') as f:
            json.dump(wrong_idx_dict, f)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate `model` on the eval set of args.task_name.

    Runs the dataloader sequentially, averages the eval loss, takes the
    argmax over the collected logits, computes the task metrics, logs them,
    and writes them to eval_results.txt under args.output_dir.

    Returns:
        dict of metric name -> value from compute_metrics.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task = args.task_name
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=SequentialSampler(eval_dataset),
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    total_loss = 0.0
    num_batches = 0
    logit_chunks = []
    label_chunks = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            batch_loss, batch_logits = model(**inputs)[:2]
            total_loss += batch_loss.mean().item()
        num_batches += 1
        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(inputs['labels'].detach().cpu().numpy())

    eval_loss = total_loss / num_batches
    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)
    result = compute_metrics(eval_task, preds, out_label_ids)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
def evaluate(args, model, tokenizer, prefix="", eval_set='dev', save_aps=False):
    """Evaluate `model` on `eval_set` for args.task_name.

    Handles MNLI's matched/mismatched double evaluation. For the ranking
    tasks (ms_v2 / udc / mantis_10 / mantis_50) the positive-class softmax
    probability is used as the prediction score. When `save_aps` is set,
    per-query average precisions, per-batch losses, raw predictions and
    per-query mean predictions are written to files for later analysis.

    Returns:
        dict of metric name -> value, merged over all evaluated tasks.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if args.task_name == "mnli" else (
                             args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               eval_set)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        if eval_set == 'train' and save_aps:
            #for getting the correct losses for each query
            args.eval_batch_size = 2
        else:
            args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        all_losses = []  # per-batch mean losses, dumped to file when save_aps
        for batch in eval_dataloader:
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2]
                    if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                    'labels': batch[3]
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                all_losses.append(tmp_eval_loss.mean().item())
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)
            if args.debug_mode:
                # Stop after one batch for quick debugging runs.
                break
        eval_loss = eval_loss / nb_eval_steps
        if args.task_name == "ms_v2" or args.task_name == "udc" or \
           args.task_name == "mantis_10" or args.task_name == "mantis_50":
            # Ranking tasks: score = softmax probability of the positive class.
            preds = softmax(preds, axis=1)
            preds = preds[:, 1]
        elif args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        if save_aps:
            assert args.local_rank == -1
            aps = compute_aps(preds, out_label_ids)
            output_eval_file = os.path.join(eval_output_dir,
                                            "aps_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for ap in aps:
                    f.write(str(ap) + "\n")
            output_eval_file = os.path.join(eval_output_dir,
                                            "losses_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for loss in all_losses:
                    f.write(str(loss) + "\n")
            output_eval_file = os.path.join(eval_output_dir,
                                            "preds_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for pred in preds:
                    f.write(str(pred) + "\n")
            # Per-query mean score: assumes each query contributes
            # `negative_sampled_size` consecutive rows — TODO confirm against
            # the dataset construction.
            negative_sampled_size = 2
            preds_q_docs_avg = []
            for i in range(0, len(preds), negative_sampled_size):
                preds_q_docs_avg.append(
                    sum(preds[i:i + negative_sampled_size]) /
                    negative_sampled_size)
            output_eval_file = os.path.join(eval_output_dir,
                                            "avg_preds_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for avg in preds_q_docs_avg:
                    f.write(str(avg) + "\n")
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return results