def get_train_dataloader(self) -> DataLoader:
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")
    if not self.alternate:
        if is_tpu_available():
            train_sampler = get_tpu_sampler(self.train_dataset)
        else:
            train_sampler = (
                RandomSampler(self.train_dataset)
                if self.args.local_rank == -1
                else DistributedSampler(self.train_dataset)
            )
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        )
    else:
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=False,
            sampler=None,
            collate_fn=self.data_collator.collate_batch,
        )
    if is_tpu_available():
        data_loader = pl.ParallelLoader(
            data_loader, [self.args.device]
        ).per_device_loader(self.args.device)
    return data_loader
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
    # We use the same batch_size as for eval.
    if is_tpu_available():
        sampler = SequentialDistributedSampler(
            test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
        )
    elif self.args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
    else:
        sampler = SequentialSampler(test_dataset)

    data_loader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=self.args.eval_batch_size,
        collate_fn=self.data_collator.collate_batch,
    )
    if is_tpu_available():
        data_loader = pl.ParallelLoader(
            data_loader, [self.args.device]
        ).per_device_loader(self.args.device)
    return data_loader
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None ) -> DataLoader: if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset sampler = get_tpu_sampler(eval_dataset) if is_tpu_available() else None batch_size = 1 if self.alternate else self.args.eval_batch_size data_loader = DataLoader( eval_dataset, sampler=sampler, batch_size=batch_size, shuffle=False, collate_fn=self.data_collator.collate_batch, ) if is_tpu_available(): data_loader = pl.ParallelLoader( data_loader, [self.args.device]).per_device_loader(self.args.device) return data_loader
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None ) -> DataLoader: if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset if self.args.use_bucket_iterator: bucket_boundaries = [0, 20, 30, 40, 50, 60, 70, 80, 90, 101] eval_sampler = BySequenceLengthSampler( eval_dataset, bucket_boundaries, batch_size=self.args.eval_batch_size, drop_last=False) data_loader = DataLoader( eval_dataset, batch_size=1, batch_sampler=eval_sampler, collate_fn=self.data_collator.collate_batch, num_workers=0, pin_memory=False) else: if is_tpu_available(): sampler = SequentialDistributedSampler( eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(eval_dataset) else: sampler = SequentialSampler(eval_dataset) data_loader = DataLoader( eval_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, collate_fn=self.data_collator.collate_batch, ) if is_tpu_available(): data_loader = pl.ParallelLoader( data_loader, [self.args.device]).per_device_loader(self.args.device) return data_loader
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None ) -> DataLoader: if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset if is_tpu_available(): sampler = SequentialDistributedSampler( eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(eval_dataset) else: sampler = SequentialSampler(eval_dataset) data_loader = DataLoader( eval_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, collate_fn=self.data_collator.collate_batch, ) return data_loader
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    model_args: ModelArguments,
    data_args: DataTrainingArguments,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    tb_writer: Optional["SummaryWriter"] = None,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) In evaluation and prediction, only return the loss.
    """
    self.model = model.to(args.device)
    self.args = args
    self.model_args = model_args
    self.data_args = data_args
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.optimizers = optimizers
    self.best_model_path = None

    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    else:
        # Make sure the attribute always exists, even when TensorBoard is unavailable.
        self.tb_writer = None
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. "
            "You should consider installing it."
        )
    if is_wandb_available():
        self._setup_wandb()
    else:
        logger.info(
            "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
            "run `pip install wandb; wandb login` and see https://docs.wandb.com/huggingface."
        )
    set_seed(self.args.seed)
    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)
    if is_tpu_available():
        # Set an xla_device flag on the model's config.
        # We'll find a more elegant way and not need to do this in the future.
        self.model.config.xla_device = True
def is_world_master(self) -> bool:
    """
    This will be True only in one process, even in distributed mode,
    even when training on multiple machines.
    """
    if is_tpu_available():
        return xm.is_master_ordinal(local=False)
    else:
        return self.args.local_rank == -1 or torch.distributed.get_rank() == 0
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
    # We use the same batch_size as for eval.
    sampler = get_tpu_sampler(test_dataset) if is_tpu_available() else None

    data_loader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=self.args.eval_batch_size,
        shuffle=False,
        collate_fn=self.data_collator.collate_batch,
    )
    if is_tpu_available():
        data_loader = pl.ParallelLoader(
            data_loader, [self.args.device]
        ).per_device_loader(self.args.device)
    return data_loader
def num_examples(self, dataloader: Union[DataLoader, "pl.PerDeviceLoader"]) -> int:
    """
    Helper to get the number of examples from a DataLoader, by accessing its Dataset.
    """
    if is_tpu_available():
        assert isinstance(dataloader, pl.PerDeviceLoader)
        return len(dataloader._loader._loader.dataset)
    else:
        return len(dataloader.dataset)
def get_train_dataloader(self) -> DataLoader:
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")
    if self.args.use_bucket_iterator:
        logger.info("Using bucket iterator for training.")
        bucket_boundaries = [0, 20, 30, 40, 50, 60, 70, 80, 90, 101]
        train_sampler = BySequenceLengthSampler(
            self.train_dataset,
            bucket_boundaries,
            batch_size=self.args.train_batch_size,
            drop_last=False,
        )
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=1,
            batch_sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
            num_workers=0,
            pin_memory=False,
        )
    else:
        if is_tpu_available():
            train_sampler = get_tpu_sampler(self.train_dataset)
        else:
            train_sampler = (
                RandomSampler(self.train_dataset)
                if self.args.local_rank == -1
                else DistributedSampler(self.train_dataset)
            )
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        )
    if is_tpu_available():
        data_loader = pl.ParallelLoader(
            data_loader, [self.args.device]
        ).per_device_loader(self.args.device)
    return data_loader
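# `BySequenceLengthSampler`, used above, is not defined in this section. Below is a
# minimal sketch of such a length-bucketing batch sampler, for illustration only.
# The class name and the assumption that each dataset item exposes its tokens under
# an "input_ids" key are hypothetical, not taken from this codebase; examples longer
# than the last boundary are simply skipped in this sketch.
import random

from torch.utils.data import Sampler


class BucketBySequenceLengthSampler(Sampler):
    """Yields batches of indices whose examples fall in the same length bucket."""

    def __init__(self, dataset, bucket_boundaries, batch_size, drop_last=False):
        self.batch_size = batch_size
        self.drop_last = drop_last
        # Place every example index into the first bucket whose upper bound
        # exceeds its sequence length.
        self.buckets = {upper: [] for upper in bucket_boundaries[1:]}
        for idx in range(len(dataset)):
            length = len(dataset[idx]["input_ids"])  # assumed length accessor
            for upper in bucket_boundaries[1:]:
                if length < upper:
                    self.buckets[upper].append(idx)
                    break

    def __iter__(self):
        for bucket in self.buckets.values():
            random.shuffle(bucket)
            for start in range(0, len(bucket), self.batch_size):
                batch = bucket[start:start + self.batch_size]
                if self.drop_last and len(batch) < self.batch_size:
                    continue
                yield batch

    def __len__(self):
        full, partial = 0, 0
        for bucket in self.buckets.values():
            full += len(bucket) // self.batch_size
            partial += int(not self.drop_last and len(bucket) % self.batch_size != 0)
        return full + partial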
def save_model(self, output_dir: Optional[str] = None):
    """
    Saving best-practices: if you use default names for the model,
    you can reload it using from_pretrained().

    Will only save from the world_master process (unless in TPUs).
    """
    if is_tpu_available():
        self._save_tpu(output_dir)
    elif self.is_world_master():
        self._save(output_dir)
def get_train_dataloader(self) -> DataLoader:
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")
    if is_tpu_available():
        train_sampler = get_tpu_sampler(self.train_dataset)
    else:
        train_sampler = (
            RandomSampler(self.train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(self.train_dataset)
        )
    data_loader = DataLoader(
        self.train_dataset,
        batch_size=self.args.train_batch_size,
        sampler=train_sampler,
        collate_fn=self.data_collator.collate_batch,
    )
    return data_loader
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput
from transformers.training_args import TrainingArguments, is_tpu_available

try:
    from apex import amp

    _has_apex = True
except ImportError:
    _has_apex = False


def is_apex_available():
    return _has_apex


if is_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    import torch_xla.distributed.parallel_loader as pl

try:
    from torch.utils.tensorboard import SummaryWriter

    _has_tensorboard = True
except ImportError:
    try:
        from tensorboardX import SummaryWriter

        _has_tensorboard = True
    except ImportError:
        _has_tensorboard = False
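# The dataloader methods below call a `get_tpu_sampler` helper that is not shown in
# this section. A minimal sketch, assuming it mirrors the samplers used elsewhere in
# this file (random sampling on a single TPU core, distributed sampling across cores
# otherwise); it relies on Dataset, RandomSampler and DistributedSampler being
# imported alongside the other torch.utils.data symbols used here.
def get_tpu_sampler(dataset: Dataset):
    if xm.xrt_world_size() <= 1:
        return RandomSampler(dataset)
    return DistributedSampler(
        dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
    )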
def _prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
    )
    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    if is_tpu_available():
        batch_size = dataloader._loader._loader.batch_size
    else:
        batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds_t1: np.ndarray = None
    preds_t2: np.ndarray = None
    label_ids_t1: np.ndarray = None
    label_ids_t2: np.ndarray = None
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(
            inputs.get(k) is not None
            for k in ["labels", "labels_t1", "labels_t2", "lm_labels", "masked_lm_labels"]
        )
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                if self.alternate:
                    step_eval_loss, logits, task = outputs[:3]
                else:
                    step_eval_loss, logits_t1, logits_t2 = outputs[:3]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if self.alternate:
            if not prediction_loss_only:
                if task == 0:
                    if preds_t1 is None:
                        preds_t1 = logits.detach().cpu().numpy()
                    else:
                        preds_t1 = np.append(preds_t1, logits.detach().cpu().numpy(), axis=0)
                    if inputs.get("labels") is not None:
                        if label_ids_t1 is None:
                            label_ids_t1 = inputs["labels"].detach().cpu().numpy()
                        else:
                            label_ids_t1 = np.append(
                                label_ids_t1, inputs["labels"].detach().cpu().numpy(), axis=0
                            )
                elif task == 1:
                    if preds_t2 is None:
                        preds_t2 = logits.detach().cpu().numpy()
                    else:
                        preds_t2 = np.append(preds_t2, logits.detach().cpu().numpy(), axis=0)
                    if inputs.get("labels") is not None:
                        if label_ids_t2 is None:
                            label_ids_t2 = inputs["labels"].detach().cpu().numpy()
                        else:
                            label_ids_t2 = np.append(
                                label_ids_t2, inputs["labels"].detach().cpu().numpy(), axis=0
                            )
        else:
            if not prediction_loss_only:
                if preds_t1 is None or preds_t2 is None:
                    preds_t1 = logits_t1.detach().cpu().numpy()
                    preds_t2 = logits_t2.detach().cpu().numpy()
                else:
                    preds_t1 = np.append(preds_t1, logits_t1.detach().cpu().numpy(), axis=0)
                    preds_t2 = np.append(preds_t2, logits_t2.detach().cpu().numpy(), axis=0)
                if inputs.get("labels_t1") is not None:
                    if label_ids_t1 is None or label_ids_t2 is None:
                        label_ids_t1 = inputs["labels_t1"].detach().cpu().numpy()
                        label_ids_t2 = inputs["labels_t2"].detach().cpu().numpy()
                    else:
                        label_ids_t1 = np.append(
                            label_ids_t1, inputs["labels_t1"].detach().cpu().numpy(), axis=0
                        )
                        label_ids_t2 = np.append(
                            label_ids_t2, inputs["labels_t2"].detach().cpu().numpy(), axis=0
                        )

    # if is_tpu_available() and preds is not None and label_ids is not None:
    #     # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
    #     preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
    #     label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

    metrics = {}
    if self.compute_metrics is not None:
        if preds_t1 is not None and label_ids_t1 is not None:
            metrics["task 1"] = self.compute_metrics(
                EvalPrediction(predictions=preds_t1, label_ids=label_ids_t1)
            )
        if preds_t2 is not None and label_ids_t2 is not None:
            metrics["task 2"] = self.compute_metrics(
                EvalPrediction(predictions=preds_t2, label_ids=label_ids_t2)
            )
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return (
        PredictionOutput(predictions=preds_t1, label_ids=label_ids_t1, metrics=metrics),
        PredictionOutput(predictions=preds_t2, label_ids=label_ids_t2, metrics=metrics),
    )
def _prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
    )
    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    task_probs: Dict[str, torch.Tensor] = {}
    preds: Dict[str, torch.Tensor] = {}
    label_ids: Dict[str, torch.Tensor] = {}
    model.eval()

    if is_tpu_available():
        dataloader = pl.ParallelLoader(
            dataloader, [self.args.device]
        ).per_device_loader(self.args.device)

    for inputs in tqdm(dataloader, desc=description):
        if description == "Prediction":
            # Drop gold labels when doing pure prediction (use a default so the key may be absent).
            inputs.pop("labels", None)
        has_labels = any(
            inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"]
        )
        task_id = inputs.pop("task_id")
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)
        inputs["task_id"] = task_id

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if not prediction_loss_only:
            probs = nn.functional.softmax(logits.detach(), dim=-1)
            pred_labels = logits.detach().argmax(dim=-1)
            if task_id not in preds:
                preds[task_id] = pred_labels
                task_probs[task_id] = probs
            else:
                task_probs[task_id] = torch.cat((task_probs[task_id], probs), dim=0)
                preds[task_id] = torch.cat((preds[task_id], pred_labels), dim=0)
            if inputs.get("labels") is not None:
                labels = inputs["labels"].detach()
                if task_id not in label_ids:
                    label_ids[task_id] = labels
                else:
                    label_ids[task_id] = torch.cat((label_ids[task_id], labels), dim=0)

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds:
            preds = self.distributed_concat(
                preds, num_total_examples=self.num_examples(dataloader)
            )
        if label_ids:
            label_ids = self.distributed_concat(
                label_ids, num_total_examples=self.num_examples(dataloader)
            )
    elif is_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds:
            preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
        if label_ids:
            label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

    metrics = {}
    if self.compute_metrics is not None and preds and label_ids:
        for task_id, task_preds in preds.items():
            task_preds = task_preds.cpu().numpy()
            task_label_ids = label_ids[task_id].cpu().numpy()
            metrics[task_id] = self.compute_metrics(
                EvalPrediction(predictions=task_preds, label_ids=task_label_ids)
            )
        metrics["eval_avg"] = sum(metrics.values()) / 3
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{task_id_to_name[key]}"] = metrics.pop(key)

    return PredictionOutput(
        predictions=preds, task_probs=task_probs, label_ids=label_ids, metrics=metrics
    )
def train(self, model_path: Optional[str] = None):
    """
    Main training entry point.

    Args:
        model_path:
            (Optional) Local path to the model, if the model to train has been
            instantiated from a local path. If present, we will try reloading
            the optimizer/scheduler states from there.
    """
    train_dataloader = self.get_train_dataloader()
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (
            self.args.max_steps
            // (len(train_dataloader) // self.args.gradient_accumulation_steps)
            + 1
        )
    else:
        t_total = int(
            (len(self.train_dataset) // self.args.train_batch_size)
            // self.args.gradient_accumulation_steps
            * self.args.num_train_epochs
        )
        num_train_epochs = self.args.num_train_epochs

    optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
        )
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    if self.args.fp16:
        if not is_apex_available():
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    # Train!
    if is_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        total_train_batch_size,
    )
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = self.global_step // (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )
            steps_trained_in_current_epoch = self.global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info(
                "  Will skip the first %d steps in the first epoch",
                steps_trained_in_current_epoch,
            )
        except ValueError:
            self.global_step = 0
            logger.info("  Starting fine-tuning.")

    if self.args.evaluate_step_zero:
        self.evaluate()

    tr_loss = 0.0
    logging_loss = 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()
    )
    for epoch in train_iterator:
        if isinstance(train_dataloader, DataLoader) and isinstance(
            train_dataloader.sampler, DistributedSampler
        ):
            train_dataloader.sampler.set_epoch(epoch)
        epoch_iterator = tqdm(
            train_dataloader, desc="Iteration", disable=not self.is_local_master()
        )
        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            temp_loss = tr_loss
            tr_loss += self._training_step(model, inputs, optimizer)
            if self.global_step % 100 == 0:
                step_loss = tr_loss - temp_loss
                logger.info("Training loss at global step %d: %f", self.global_step, step_loss)
                if self.tb_writer is not None:
                    self.tb_writer.add_scalar("train loss", step_loss, self.global_step)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), self.args.max_grad_norm
                    )
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_tpu_available():
                    xm.optimizer_step(optimizer)
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad()
                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (
                    self.args.logging_steps > 0
                    and self.global_step % self.args.logging_steps == 0
                ) or (self.global_step == 1 and self.args.logging_first_step):
                    logs: Dict[str, float] = {}
                    logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
                    # backward compatibility for pytorch schedulers
                    logs["learning_rate"] = (
                        scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else scheduler.get_lr()[0]
                    )
                    logging_loss = tr_loss

                    self._log(logs)

                    if self.args.evaluate_during_training:
                        self.evaluate()

                if self.is_world_master():
                    if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a
                        # reference to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(
                            self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"
                        )
                        self.save_model(output_dir)
                        self._rotate_checkpoints()
                        torch.save(
                            optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")
                        )
                        torch.save(
                            scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")
                        )
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
            train_iterator.close()
            break
        if self.args.tpu_metrics_debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

    if self.tb_writer:
        self.tb_writer.close()

    logger.info(
        "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
    )
    return TrainOutput(self.global_step, tr_loss / self.global_step)
def is_local_master(self) -> bool:
    if is_tpu_available():
        return xm.is_master_ordinal(local=True)
    else:
        return self.args.local_rank in [-1, 0]
def _prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
    )
    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    if is_tpu_available():
        batch_size = dataloader._loader._loader.batch_size
    else:
        batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    eval_tag_losses = []
    eval_gen_losses = []
    eval_cov_losses = []
    preds = []
    label_ids = []
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            step_eval_loss, logits = outputs[:2]
            other_loss = outputs[-1]
            eval_losses += [step_eval_loss.mean().item()]
            eval_tag_losses += [other_loss["tag_loss"].mean().item()]
            eval_gen_losses += [other_loss["gen_loss"].mean().item()]
            eval_cov_losses += [other_loss["cov_loss"].mean().item()]

        if not prediction_loss_only:
            preds.append(logits.detach().cpu().numpy().argmax(-1))
            if inputs.get("tgt_token") is not None:
                label_ids.append(inputs["tgt_token"][:, 1:].detach().cpu().numpy())

    if is_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
        label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)
    if len(eval_tag_losses) > 0:
        metrics["eval_tag_loss"] = np.mean(eval_tag_losses)
    if len(eval_gen_losses) > 0:
        metrics["eval_gen_loss"] = np.mean(eval_gen_losses)
    if len(eval_cov_losses) > 0:
        metrics["eval_cov_loss"] = np.mean(eval_cov_losses)
        if metrics["eval_cov_loss"] != 0:
            metrics["eval_loss"] = metrics["eval_tag_loss"] + metrics["eval_gen_loss"]

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def _prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
    )
    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    # if is_tpu_available():
    #     batch_size = dataloader._loader._loader.batch_size
    # else:
    #     batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    # logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(
            inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"]
        )
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if not prediction_loss_only:
            if preds is None:
                preds = logits.detach()
            else:
                preds = torch.cat((preds, logits.detach()), dim=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach()
                else:
                    label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0)

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = self.distributed_concat(
                preds, num_total_examples=self.num_examples(dataloader)
            )
        if label_ids is not None:
            label_ids = self.distributed_concat(
                label_ids, num_total_examples=self.num_examples(dataloader)
            )
    elif is_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds is not None:
            preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
        if label_ids is not None:
            label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        m = np.mean(eval_losses)
        metrics["eval_loss"] = m
        metrics["eval_perplexity"] = torch.exp(torch.tensor(m)).item()
        print("Perplexity: {}".format(metrics["eval_perplexity"]))

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def _prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
    )
    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    logger.info("  Decode mode = %s", self.args.decode_mode)
    eval_losses: List[float] = []
    model.eval()

    metric = ParsingMetric()

    if is_tpu_available():
        dataloader = pl.ParallelLoader(
            dataloader, [self.args.device]
        ).per_device_loader(self.args.device)

    for inputs in tqdm(dataloader, desc=description):
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            step_eval_loss, rel_preds, arc_preds = model(
                **inputs, adapter_names=self.adapter_names
            )

        eval_losses += [step_eval_loss.mean().item()]

        mask = inputs["labels_arcs"].ne(self.model.config.pad_token_id)
        predictions_arcs = torch.argmax(arc_preds, dim=-1)[mask]
        labels_arcs = inputs["labels_arcs"][mask]

        predictions_rels, labels_rels = rel_preds[mask], inputs["labels_rels"][mask]
        predictions_rels = predictions_rels[torch.arange(len(labels_arcs)), labels_arcs]
        predictions_rels = torch.argmax(predictions_rels, dim=-1)

        metric.add(labels_arcs, labels_rels, predictions_arcs, predictions_rels)

    results = metric.get_metric()
    results[f"{description}_loss"] = np.mean(eval_losses)

    # Add predictions_rels to the output, even though we are only interested in the metrics
    return PredictionOutput(predictions=predictions_rels, label_ids=None, metrics=results)