def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev') if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") for step, batch in enumerate(eval_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} inputs['token_type_ids'] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) pbar(step) print(' ') if 'cuda' in str(args.device): torch.cuda.empty_cache() eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) elif args.output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(eval_task, preds, out_label_ids) results.update(result) logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) return results
def test(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) test_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else ( args.task_name, ) test_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else ( args.output_dir, ) results = {} for test_task, test_output_dir in zip(test_task_names, test_outputs_dirs): test_dataset = load_and_cache_examples(args, test_task, tokenizer, data_type='test') if not os.path.exists(test_output_dir) and args.local_rank in [-1, 0]: os.makedirs(test_output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max( 1, args.n_gpu) # Note that DistributedSampler samples randomly test_sampler = SequentialSampler( test_dataset) if args.local_rank == -1 else DistributedSampler( test_dataset) test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn) # Test! logger.info("***** Running test {} *****".format(prefix)) logger.info(" Num examples = %d", len(test_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None pbar = ProgressBar(n_total=len(test_dataloader), desc="Testing") for step, batch in enumerate(test_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3], 'token_type_ids': batch[2] } outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) pbar(step) print(' ') if 'cuda' in str(args.device): torch.cuda.empty_cache() eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) elif args.output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(test_task, preds, out_label_ids) results.update(result) logger.info("***** Test results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) classreport = ClassReport([ 'Joint', 'Sequence', 'Progression', "Contrast", "Supplement", "Cause-Result", "Result-Cause", "Background", "Behavior-Purpose", "Purpose-Behavior", "Elaboration", "Summary", "Evaluation", "Statement-Illustration", "Illustration-Statement" ]) classreport(preds, out_label_ids) logger.info("%s : %s", classreport.name(), classreport.value()) return results