def glue_compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif task_name == "sst-2":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "sst-2-orig":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "sst-2-glue":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mrpc":
        return acc_and_f1(preds, labels)
    elif task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    elif task_name == "qqp":
        return acc_and_f1(preds, labels)
    elif task_name == "snli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mnli-mm":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "qnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "rte":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "hans":
        return {"acc": simple_accuracy(preds, labels)}
    else:
        raise KeyError(task_name)
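# Sketch of the metric helpers glue_compute_metrics assumes are in scope. These follow
# the conventional transformers-style GLUE definitions; the local simple_accuracy,
# acc_and_f1 and pearson_and_spearman may differ in detail.
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef

def simple_accuracy(preds, labels):
    # Fraction of predictions that exactly match the labels.
    return (preds == labels).mean()

def acc_and_f1(preds, labels):
    # Accuracy, binary F1, and their average (as reported for MRPC/QQP).
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {"acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2}

def pearson_and_spearman(preds, labels):
    # Pearson/Spearman correlations and their average (as reported for STS-B).
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {"pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2}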
def _compute_metrics(self, p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    result = acc_and_f1(preds, p.label_ids)
    log.info(p)
    log.info("preds.size=" + str(len(preds)) + ", preds.sum=" + str(preds.sum())
             + ", label.sum=" + str(p.label_ids.sum()))
    # Accuracy restricted to examples whose gold label is 0: correct label-0
    # predictions divided by the number of label-0 examples (binary 0/1 labels).
    correct = ((preds == 0) * (p.label_ids == 0)).sum()
    a1 = correct / (len(p.label_ids) - p.label_ids.sum())
    result['segment_acc'] = a1
    return result
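# Cross-check for segment_acc above, assuming binary 0/1 labels: it equals the recall
# of the 0 class, so scikit-learn should give the same number.
from sklearn.metrics import recall_score

def segment_acc_check(preds, label_ids):
    # Recall with pos_label=0: correct label-0 predictions / number of label-0 examples.
    return recall_score(label_ids, preds, pos_label=0)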
def _compute_metrics(self, p: TaskEvalPrediction) -> Dict:
    # Classification heads emit per-class logits (take the argmax); otherwise the
    # predictions are a single score per example and only need squeezing.
    if self._for_cls:
        preds = np.argmax(p.predictions, axis=1)
    else:
        preds = np.squeeze(p.predictions)
    return acc_and_f1(preds, p.label_ids)
def compute_metrics_fn(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return acc_and_f1(preds, p.label_ids)
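# Usage sketch: compute_metrics_fn has the callback shape expected by a transformers
# Trainer, which calls it with an EvalPrediction(predictions, label_ids) during
# evaluate(). The names model, training_args, train_dataset, eval_dataset are placeholders.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_fn,
)
metrics = trainer.evaluate()  # dict of metric values, keys prefixed with "eval_"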
def evaluate(args, model, tokenizer, embedding, unk_token, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    # eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    # eval_task_names = ("mrpc",)
    # eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    eval_output_dir = args.output_dir
    results = {}
    # for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
    #     eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
    eval_dataset = load_and_cache_examples(args, tokenizer, embedding, unk_token, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3],
                'terma_vec': batch[5],
                'termb_vec': batch[6],
            }
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif args.output_mode == "regression":
        preds = np.squeeze(preds)
    result = acc_and_f1(preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results