import os

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm


def evaluate(self, task_name):
    # Loop to handle MNLI double evaluation (matched, mismatched)
    eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
    eval_outputs_dirs = (
        (self.output_dir, self.output_dir + "-MM") if task_name == "mnli" else (self.output_dir,)
    )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = self.load_and_cache_examples(eval_task, evaluate=True)
        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(
            eval_dataset, sampler=eval_sampler, batch_size=self.eval_batch_size
        )

        # Wrap in DataParallel only once, not again on the second MNLI pass
        if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        self.model.eval()  # set once, outside the batch loop
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                input_ids, attention_mask, token_type_ids, labels = batch
                # Forward pass returns logits; compute the loss on the raw
                # logits rather than on their argmax, which throws away the
                # class scores the loss function needs
                logits = self.model(
                    input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                )
                tmp_eval_loss = self.loss_fn(logits, labels)
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
                out_label_ids = labels.detach().cpu().numpy()
            else:
                preds = np.append(
                    preds, torch.argmax(logits, dim=-1).detach().cpu().numpy(), axis=0
                )
                out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if self.output_mode == "classification":
            pass  # preds are already class indices from the argmax above
        elif self.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write(f"{key} = {result[key]}\n")

    return results
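# For reference: both evaluation loops index batches as batch[0..3], so
# load_and_cache_examples is assumed to return a TensorDataset ordered as
# (input_ids, attention_mask, token_type_ids, labels). A minimal sketch of
# that layout; make_eval_dataset and the feature field names here are
# hypothetical, not part of the original code.
import torch
from torch.utils.data import TensorDataset


def make_eval_dataset(features):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)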
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
        if model_type != "distilbert":
            inputs["token_type_ids"] = (
                batch[2] if model_type in ["bert", "xlnet", "albert"] else None
            )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
        outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs["labels"].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
if output_mode == "classification":
    preds = np.argmax(preds, axis=1)
elif output_mode == "regression":
    preds = np.squeeze(preds)

result = compute_metrics(task_name, preds, out_label_ids)
results.update(result)
print(results)
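# Both loops delegate scoring to compute_metrics(task, preds, labels), which
# is not shown above. A minimal sketch following the GLUE convention
# (Matthews correlation for CoLA, Pearson/Spearman for STS-B, accuracy
# otherwise); this task-to-metric mapping is an assumption, not the original
# helper.
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef


def compute_metrics(task_name, preds, labels):
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    if task_name == "sts-b":
        return {
            "pearson": pearsonr(preds, labels)[0],
            "spearmanr": spearmanr(preds, labels)[0],
        }
    # Accuracy covers mnli, mnli-mm, sst-2, and similar classification tasks
    return {"acc": (preds == labels).mean()}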