Example #1
    def eval_dev(self, dev_data, eval_config, n_gpu):
        self.model.eval()
        results = self.eval(
            dev_data,
            per_gpu_eval_batch_size=eval_config.per_gpu_eval_batch_size,
            n_gpu=n_gpu)
        predictions = np.argmax(results['logits'], axis=1)
        scores = {}
        metrics = eval_config.metrics if eval_config.metrics else ['acc']
        for metric in metrics:
            if metric == 'acc':
                scores[metric] = simple_accuracy(predictions,
                                                 results['labels'])
            elif metric == 'f1':
                scores[metric] = f1_score(results['labels'], predictions)
            elif metric == 'f1-macro':
                scores[metric] = f1_score(results['labels'],
                                          predictions,
                                          average='macro')
            elif metric == 'em':
                scores[metric] = exact_match(predictions, results['labels'],
                                             results['question_ids'])
            else:
                raise ValueError(f"Metric '{metric}' not implemented")
        return scores
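
All of the examples below call a simple_accuracy helper that is never shown in the snippets. A minimal sketch of what such a helper typically looks like (an assumption; the implementation the examples actually import may differ):

import numpy as np

def simple_accuracy(predictions: np.ndarray, labels: np.ndarray) -> float:
    """Plain accuracy: the fraction of predictions that equal the gold labels."""
    return float((np.asarray(predictions) == np.asarray(labels)).mean())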
Example #2
def evaluate(model: TransformerModelWrapper,
             eval_data: List[InputExample],
             config: EvalConfig) -> Dict:

    metrics = config.metrics if config.metrics else ['acc']
    results = model.eval(eval_data=eval_data,
                         per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu)
    # print("results['logits'].shape=", results['logits'].shape)
    predictions = np.argmax(results['logits'], axis=1)
    scores = {}
    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")
    results['scores'] = scores
    results['predictions'] = predictions
    return results
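
The 'em' metric relies on an exact_match helper keyed by question_ids, the usual setup for multi-answer tasks where a question only counts as correct if every one of its candidate answers is predicted correctly. A plausible sketch (not the snippets' actual implementation):

from collections import defaultdict
import numpy as np

def exact_match(predictions: np.ndarray, actuals: np.ndarray, question_ids: np.ndarray) -> float:
    """Fraction of questions for which every prediction matches the gold label."""
    preds_per_q, golds_per_q = defaultdict(list), defaultdict(list)
    for qid, pred, gold in zip(question_ids, predictions, actuals):
        preds_per_q[qid].append(int(pred))
        golds_per_q[qid].append(int(gold))
    questions = set(preds_per_q)
    return sum(preds_per_q[qid] == golds_per_q[qid] for qid in questions) / len(questions)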
Example #3
def evaluate(
    model: TransformerModelWrapper,
    eval_data: List[InputExample],
    config: EvalConfig,
    priming_data: List[InputExample] = None,
    local_rank=-1,
) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :param local_rank: the local rank for distributed evaluation (-1 means no distributed evaluation)
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """

    if config.priming:
        for example in eval_data:
            example.meta["priming_data"] = priming_data

    metrics = config.metrics if config.metrics else ["acc"]
    device = torch.device(
        config.device if config.device
        else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(
        eval_data,
        device,
        per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
        n_gpu=config.n_gpu,
        decoding_strategy=config.decoding_strategy,
        priming=config.priming,
        local_rank=local_rank,
    )

    predictions = np.argmax(results["logits"], axis=1)
    scores = {}

    for metric in metrics:
        if metric == "acc":
            scores[metric] = simple_accuracy(predictions, results["labels"])
        elif metric == "f1":
            scores[metric] = f1_score(results["labels"], predictions)
        elif metric == "f1-macro":
            scores[metric] = f1_score(results["labels"],
                                      predictions,
                                      average="macro")
        elif metric == "em":
            scores[metric] = exact_match(predictions, results["labels"],
                                         results["question_ids"])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results["scores"] = scores
    results["predictions"] = predictions
    return results
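
Every example repeats the same if/elif metric dispatch. As a design note, the chain can be collapsed into a lookup table; the sketch below reuses the helper names from the snippets but is not code taken from any of them:

METRIC_FNS = {
    'acc': lambda results, preds: simple_accuracy(preds, results['labels']),
    'f1': lambda results, preds: f1_score(results['labels'], preds),
    'f1-macro': lambda results, preds: f1_score(results['labels'], preds, average='macro'),
    'em': lambda results, preds: exact_match(preds, results['labels'], results['question_ids']),
}

def compute_scores(results, predictions, metrics):
    """Dispatch each requested metric through the lookup table above."""
    scores = {}
    for metric in metrics:
        if metric not in METRIC_FNS:
            raise ValueError(f"Metric '{metric}' not implemented")
        scores[metric] = METRIC_FNS[metric](results, predictions)
    return scores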
Example #4
    def in_training_eval(self, eval_kwargs):
        eval_results = self.eval(**eval_kwargs)
        predictions = np.argmax(eval_results["logits"], axis=1)

        if eval_kwargs["metrics"]:
            if "f1" in eval_kwargs["metrics"]:
                score = f1_score(eval_results["labels"], predictions)
            elif "f1-macro" in eval_kwargs["metrics"]:
                score = f1_score(eval_results["labels"],
                                 predictions,
                                 average="macro")
            elif "em" in eval_kwargs["metrics"]:
                score = exact_match(predictions, eval_results["labels"],
                                    eval_results["question_ids"])
            else:
                score = simple_accuracy(predictions, eval_results["labels"])
        else:
            score = simple_accuracy(predictions, eval_results["labels"])

        return score
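
The snippets expose both 'f1' and 'f1-macro'. Assuming f1_score here is sklearn.metrics.f1_score (the snippets do not say), the plain variant scores only the positive class of a binary task, while the macro variant averages the per-class F1 scores:

from sklearn.metrics import f1_score

labels      = [0, 0, 0, 1, 1]
predictions = [0, 0, 1, 1, 0]

# 'f1': F1 of the positive class only (here 0.5).
print(f1_score(labels, predictions))
# 'f1-macro': unweighted mean of the per-class F1 scores (here ~0.58).
print(f1_score(labels, predictions, average='macro'))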
Example #5
def evaluate(model: TransformerModelWrapper,
             eval_data: List[InputExample],
             config: EvalConfig,
             priming_data: List[InputExample] = None) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """

    if config.priming:
        for example in eval_data:
            example.meta['priming_data'] = priming_data

    metrics = config.metrics if config.metrics else ['acc']
    device = torch.device(
        config.device if config.device
        else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(
        eval_data,
        device,
        per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
        n_gpu=config.n_gpu,
        decoding_strategy=config.decoding_strategy,
        priming=config.priming)

    predictions = np.argmax(results['logits'], axis=1)
    scores = {}

    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'],
                                      predictions,
                                      average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'],
                                         results['question_ids'])
        elif metric == 'dist-loss':
            if eval_data[0].logits is not None:
                scores[metric] = distillation_loss(
                    torch.tensor(results['logits']),
                    torch.stack([
                        torch.tensor(ex.logits, dtype=torch.float32)
                        for ex in eval_data
                    ]), config.temperature)
            else:
                scores[metric] = 0.
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results
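
Example #5 additionally supports a 'dist-loss' metric via a distillation_loss(student_logits, teacher_logits, temperature) helper that is not shown. A sketch of one common formulation (temperature-scaled KL divergence, averaged over the batch); the implementation the example imports may differ:

import torch
import torch.nn.functional as F

def distillation_loss(predictions: torch.Tensor, targets: torch.Tensor, temperature: float) -> torch.Tensor:
    """KL divergence between temperature-softened student and teacher distributions,
    rescaled by temperature**2 and averaged over the batch."""
    log_p = F.log_softmax(predictions / temperature, dim=1)  # student log-probabilities
    q = F.softmax(targets / temperature, dim=1)              # teacher probabilities
    return F.kl_div(log_p, q, reduction='sum') * (temperature ** 2) / predictions.shape[0]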