Example #1

from sklearn.metrics import matthews_corrcoef


def glue_compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif task_name in ("sst-2", "sst-2-orig", "sst-2-glue", "snli", "mnli",
                       "mnli-mm", "qnli", "rte", "wnli", "hans"):
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name in ("mrpc", "qqp"):
        return acc_and_f1(preds, labels)
    elif task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    else:
        raise KeyError(task_name)
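The helpers simple_accuracy, acc_and_f1, and pearson_and_spearman are called above but not shown; a minimal sketch consistent with those call sites (these follow the stock transformers GLUE utilities):

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score


def simple_accuracy(preds, labels):
    # Fraction of predictions that exactly match the gold labels.
    return (preds == labels).mean()


def acc_and_f1(preds, labels):
    # Accuracy plus binary F1, with their average as a combined score.
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {"acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2}


def pearson_and_spearman(preds, labels):
    # Correlation metrics for the STS-B regression task.
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {"pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2}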
Example #2

def _compute_metrics(self, p: EvalPrediction) -> Dict:
    # Standard argmax decoding over the class logits.
    preds = np.argmax(p.predictions, axis=1)
    result = acc_and_f1(preds, p.label_ids)
    log.info(p)
    log.info(f"preds.size={len(preds)}, preds.sum={preds.sum()}, "
             f"label.sum={p.label_ids.sum()}")

    # Accuracy restricted to examples whose gold label is 0 (assumes binary
    # 0/1 labels): correct class-0 predictions over the count of 0-labels.
    correct = ((preds == 0) * (p.label_ids == 0)).sum()
    a1 = correct / (len(p.label_ids) - p.label_ids.sum())

    result['segment_acc'] = a1
    return result
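A quick numeric check of the segment_acc term, using hypothetical toy arrays with binary 0/1 labels:

import numpy as np

preds = np.array([0, 0, 1, 1])
label_ids = np.array([0, 1, 1, 0])

correct = ((preds == 0) * (label_ids == 0)).sum()   # 1 (only index 0)
a1 = correct / (len(label_ids) - label_ids.sum())   # 1 / 2 = 0.5
print(a1)  # accuracy over the two gold-label-0 examples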
Example #3

def _compute_metrics(self, p: TaskEvalPrediction) -> Dict:
    if self._for_cls:
        # Classification: pick the highest-scoring class per example.
        preds = np.argmax(p.predictions, axis=1)
    else:
        # Regression: collapse the trailing singleton dimension.
        preds = np.squeeze(p.predictions)
    return acc_and_f1(preds, p.label_ids)
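The two branches assume different prediction shapes; a toy illustration with hypothetical arrays:

import numpy as np

cls_logits = np.array([[0.1, 0.9], [0.8, 0.2]])   # shape (batch, num_labels)
print(np.argmax(cls_logits, axis=1))              # [1 0]

reg_preds = np.array([[0.37], [0.82]])            # shape (batch, 1)
print(np.squeeze(reg_preds))                      # [0.37 0.82]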
Example #4
def compute_metrics_fn(p: EvalPrediction):
    # Argmax over the logits, then accuracy + F1 via the shared helper.
    preds = np.argmax(p.predictions, axis=1)
    return acc_and_f1(preds, p.label_ids)
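A free function of this shape is what transformers.Trainer expects for its compute_metrics argument; a minimal wiring sketch, where model, training_args, and the datasets are assumed to exist in the surrounding script:

from transformers import Trainer

trainer = Trainer(
    model=model,                        # assumed: a fine-tuning model
    args=training_args,                 # assumed: a TrainingArguments instance
    train_dataset=train_dataset,        # assumed datasets
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_fn,
)
metrics = trainer.evaluate()            # invokes compute_metrics_fn internally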
Example #5
import logging
import os

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm

logger = logging.getLogger(__name__)


def evaluate(args, model, tokenizer, embedding, unk_token, prefix=""):
    # The MNLI double-evaluation loop (matched / mismatched) from the stock
    # script has been collapsed here to a single pass over one eval set.
    eval_output_dir = args.output_dir

    results = {}
    eval_dataset = load_and_cache_examples(args,
                                           tokenizer,
                                           embedding,
                                           unk_token,
                                           evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3],
                # Model-specific extra inputs (precomputed term vectors).
                'terma_vec': batch[5],
                'termb_vec': batch[6]
            }
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids.
                inputs['token_type_ids'] = (batch[2] if args.model_type
                                            in ['bert', 'xlnet'] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Accumulate logits and gold labels across batches.
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids,
                                      inputs['labels'].detach().cpu().numpy(),
                                      axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif args.output_mode == "regression":
        preds = np.squeeze(preds)

    result = acc_and_f1(preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    # Make sure the (possibly prefix-nested) output directory exists.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results
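A minimal sketch of a call site for this function; args, model, tokenizer, embedding, and unk_token are assumed to come from the surrounding training script:

# Evaluate the final model once and print the accuracy/F1 dictionary.
result = evaluate(args, model, tokenizer, embedding, unk_token)
print(result)  # e.g. {'acc': ..., 'f1': ..., 'acc_and_f1': ...}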