Example #1
    def evaluate(self, task_name):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
        eval_outputs_dirs = (self.output_dir, self.output_dir + '-MM') if task_name == 'mnli' else (self.output_dir,)

        results = {}
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            eval_dataset = self.load_and_cache_examples(eval_task, evaluate=True)
            if not os.path.exists(eval_output_dir):
                os.makedirs(eval_output_dir)
            eval_sampler = SequentialSampler(eval_dataset)
            eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=self.eval_batch_size)

            if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
                self.model = torch.nn.DataParallel(self.model)

            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None

            for batch in tqdm(eval_dataloader, desc='Evaluating'):
                self.model.eval()
                batch = tuple(t.to(self.device) for t in batch)

                with torch.no_grad():
                    input_ids = batch[0]
                    attention_mask = batch[1]
                    token_type_ids = batch[2]
                    labels = batch[3]

                    # Forward pass; the model is expected to return raw logits
                    logits = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                    tmp_eval_loss = self.loss_fn(logits, labels)
                    eval_loss += tmp_eval_loss.mean().item()

                nb_eval_steps += 1
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    out_label_ids = labels.detach().cpu().numpy()
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            if self.output_mode == 'classification':
                # Logits were accumulated above; convert them to predicted class ids here
                preds = np.argmax(preds, axis=1)
            elif self.output_mode == 'regression':
                preds = np.squeeze(preds)
            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)

            output_eval_file = os.path.join(eval_output_dir, 'eval_results.txt')
            with open(output_eval_file, 'w') as writer:
                for key in sorted(result.keys()):
                    writer.write(f'{key} = {result[key]}\n')
        return results
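
Both examples delegate scoring to a compute_metrics(task, preds, labels) call that is not shown. As a rough, hypothetical stand-in (the task names and metric keys below are assumptions, not taken from the snippets), it could look like:

import numpy as np

def compute_metrics(task_name, preds, labels):
    """Hypothetical sketch of the metrics helper both examples call."""
    if task_name == "sts-b":
        # Regression-style task: report Pearson correlation of predictions vs. gold scores
        return {"pearson": float(np.corrcoef(preds, labels)[0, 1])}
    # Classification tasks (e.g. mnli, mnli-mm): report simple accuracy
    return {"acc": float((preds == labels).mean())}

Example #1 then writes each key/value pair of the returned dict to eval_results.txt in the task's output directory.
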
Example #2
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
        if model_type != "distilbert":
            inputs["token_type_ids"] = (
                batch[2] if model_type in ["bert", "xlnet", "albert"] else None
            )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
        outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]

        eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs["labels"].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
if output_mode == "classification":
    # Convert the accumulated logits to predicted class ids
    preds = np.argmax(preds, axis=1)
elif output_mode == "regression":
    preds = np.squeeze(preds)
result = compute_metrics(task_name, preds, out_label_ids)
results.update(result)

print(results)
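
A detail both loops rely on: np.append with axis=0 stacks the per-batch logit arrays along the batch dimension, so preds keeps its (num_examples, num_labels) shape for the later np.argmax(preds, axis=1); without axis, NumPy flattens everything to 1-D. A small standalone illustration:

import numpy as np

# Two batches of logits, each shaped (batch_size, num_labels)
batch1 = np.array([[0.1, 0.9], [0.8, 0.2]])
batch2 = np.array([[0.3, 0.7]])

stacked = np.append(batch1, batch2, axis=0)  # shape (3, 2): rows stay intact
flattened = np.append(batch1, batch2)        # shape (6,): per-example structure is lost

print(stacked.shape, flattened.shape)        # (3, 2) (6,)
print(np.argmax(stacked, axis=1))            # per-example predictions: [1 0 1]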