def _setup_wandb(self): """ Setup the optional Weights & Biases (`wandb`) integration. One can override this method to customize the setup if needed. Find more information at https://docs.wandb.com/huggingface You can also override the following environment variables: Environment: WANDB_WATCH: (Optional, ["gradients", "all", "false"]) "gradients" by default, set to "false" to disable gradient logging or "all" to log gradients and parameters WANDB_PROJECT: (Optional): str - "huggingface" by default, set this to a custom string to store results in a different project WANDB_DISABLED: (Optional): boolean - defaults to false, set to "true" to disable wandb entirely """ if self.is_world_master(): logger.info( 'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"' ) wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args)) # keep track of model topology and gradients, unsupported on TPU if not is_torch_tpu_available() and os.getenv("WANDB_WATCH") != "false": wandb.watch( self.model, log=os.getenv("WANDB_WATCH", "gradients"), log_freq=max(100, self.args.logging_steps) )
def __init__( self, model: PreTrainedModel, args: TrainingArguments, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Dataset] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, tb_writer: Optional["SummaryWriter"] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None, ): """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers. Args: prediction_loss_only: (Optional) in evaluation and prediction, only return the loss """ self.model = model.to(args.device) self.args = args self.data_collator = data_collator if data_collator is not None else default_data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only self.optimizers = optimizers if tb_writer is not None: self.tb_writer = tb_writer elif is_tensorboard_available() and self.is_world_master(): self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir) if not is_tensorboard_available(): logger.warning( "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it." ) if is_wandb_available(): self._setup_wandb() else: logger.info( "You are instantiating a Trainer but W&B is not installed. To use wandb logging, " "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface." ) set_seed(self.args.seed) # Create output directory if needed if self.is_world_master(): os.makedirs(self.args.output_dir, exist_ok=True) if is_torch_tpu_available(): # Set an xla_device flag on the model's config. # We'll find a more elegant and not need to do this in the future. self.model.config.xla_device = True if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)): self.data_collator = self.data_collator.collate_batch warnings.warn( ( "The `data_collator` should now be a simple callable (function, class with `__call__`), classes " + "with a `collate_batch` are deprecated and won't be supported in a future version." ), FutureWarning, )
def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_torch_tpu_available(): device = xm.xla_device() n_gpu = 0 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will # trigger an error that a device index is missing. Index 0 takes into account the # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` # will use the first GPU in that env, i.e. GPU#1 device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) n_gpu = 1 if device.type == "cuda": torch.cuda.set_device(device) return device, n_gpu
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]: if isinstance(self.train_dataset, torch.utils.data.IterableDataset): return None elif is_torch_tpu_available(): return get_tpu_sampler(self.train_dataset) else: return (RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset))
def is_world_master(self) -> bool: """ This will be True only in one process, even in distributed mode, even when training on multiple machines. """ if is_torch_tpu_available(): return xm.is_master_ordinal(local=False) else: return self.args.local_rank == -1 or torch.distributed.get_rank() == 0
def save_model(self, output_dir: Optional[str] = None): """ Will save the model, so you can reload it using :obj:`from_pretrained()`. Will only save from the world_master process (unless in TPUs). """ if is_torch_tpu_available(): self._save_tpu(output_dir) elif self.is_world_master(): self._save(output_dir)
def save_model(self, output_dir: Optional[str] = None): """ Saving best-practices: if you use default names for the model, you can reload it using from_pretrained(). Will only save from the world_master process (unless in TPUs). """ if is_torch_tpu_available(): self._save_tpu(output_dir) elif self.is_world_master(): self._save(output_dir)
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]: if is_torch_tpu_available() and xm.xrt_world_size() > 1: num_replicas = xm.xrt_world_size() rank = xm.get_ordinal() elif self.args.local_rank != -1: num_replicas = torch.distributed.get_world_size() rank = self.args.local_rank else: num_replicas = 1 rank = 0 return MultiTaskBatchSampler(self.dataset_sizes, self.args.train_batch_size, self.args.temperature, rank=rank, num_replicas=num_replicas)
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]: if isinstance(self.train_dataset, torch.utils.data.IterableDataset): return None elif is_torch_tpu_available(): return get_tpu_sampler(self.train_dataset) else: if self.args.sortish_sampler: self.train_dataset.make_sortish_sampler( self.args.per_device_train_batch_size, distributed=self.args.n_gpu > 1) return (RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset))
def _get_eval_sampler( self, eval_dataset: Dataset ) -> Optional[torch.utils.data.sampler.Sampler]: if isinstance(eval_dataset, torch.utils.data.IterableDataset): return None elif is_torch_tpu_available(): return SequentialDistributedSampler( eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) elif self.args.local_rank != -1: return SequentialDistributedSampler(eval_dataset) else: return SequentialSampler(eval_dataset)
def _setup_wandb(self): if self.is_world_master() and self.wandb_args is not None: wandb.init(project=self.wandb_args["project_name"], name=self.wandb_args["run_name"], id=self.wandb_args["run_name"], dir=tune.get_trial_dir(), config=vars(self.args), reinit=True, allow_val_change=True, resume=self.wandb_args["run_name"]) # keep track of model topology and gradients, unsupported on TPU if not is_torch_tpu_available( ) and self.wandb_args["watch"] != "false": wandb.watch(self.model, log=self.wandb_args["watch"], log_freq=max(100, self.args.logging_steps))
def setup(self, args, state, model, reinit, **kwargs): if self._wandb is None: return self._initialized = True if state.is_world_process_zero: # Edit run name if doing # hyperparameter search trial_name = state.trial_name init_args = {} if trial_name is not None: run_name = trial_name init_args["group"] = args.run_name else: run_name = args.run_name # Get current Trainer config combined_dict = {**args.to_sanitized_dict()} if hasattr(model, "config") and model.config is not None: model_config = model.config.to_dict() combined_dict = {**model_config, **combined_dict} # Initialize the wandb run self._wandb.init( project=os.getenv("WANDB_PROJECT", "squad-qa"), group=os.getenv("WANDB_RUN_GROUP", None), config=combined_dict, name=run_name, reinit=True, **init_args, ) # Keep track of model topology and gradients (unsupported on TPU) if not is_torch_tpu_available() and os.getenv( "WANDB_WATCH") != "false": self._wandb.watch( model, log=os.getenv("WANDB_WATCH", "gradients"), log_freq=max(100, args.logging_steps), ) # Log outputs self._log_model = os.getenv( "WANDB_LOG_MODEL", "FALSE").upper() in ENV_VARS_TRUE_VALUES.union({"TRUE"})
def _setup_devices(self) -> "torch.device": logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") self._n_gpu = 0 elif is_torch_tpu_available(): device = xm.xla_device() self._n_gpu = 0 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will # trigger an error that a device index is missing. Index 0 takes into account the # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` # will use the first GPU in that env, i.e. GPU#1 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at # the default value. self._n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of synchronizing nodes/GPUs # # deepspeed performs its own DDP internally, and requires the program to be started with: # deepspeed ./program.py # rather than: # python -m torch.distributed.launch --nproc_per_node=2 ./program.py if self.deepspeed: from .integrations import is_deepspeed_available if not is_deepspeed_available(): raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") import deepspeed deepspeed.init_distributed() else: torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) self._n_gpu = 1 if device.type == "cuda": torch.cuda.set_device(device) return device
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: # We use the same batch_size as for eval. if is_torch_tpu_available(): sampler = SequentialDistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal() ) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(test_dataset) else: sampler = SequentialSampler(test_dataset) data_loader = DataLoader( test_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, collate_fn=self.data_collator, drop_last=self.args.dataloader_drop_last, ) return data_loader
def get_train_dataloader(self) -> DataLoader: if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") if is_torch_tpu_available(): train_sampler = get_tpu_sampler(self.train_dataset) else: train_sampler = ( RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset) ) data_loader = DataLoader( self.train_dataset, batch_size=self.args.train_batch_size, sampler=train_sampler, collate_fn=self.data_collator, drop_last=self.args.dataloader_drop_last, ) return data_loader
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: if eval_dataset is None and self.eval_dataset is None: raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset if is_torch_tpu_available(): sampler = SequentialDistributedSampler( eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal() ) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(eval_dataset) else: sampler = SequentialSampler(eval_dataset) data_loader = DataLoader( eval_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, collate_fn=self.data_collator, drop_last=self.args.dataloader_drop_last, ) return data_loader
def prediction_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: """ Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. Works both with or without labels. """ if hasattr(self, "_prediction_loop"): warnings.warn( "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.", FutureWarning, ) return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only) prediction_loss_only = ( prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) ''' assert not getattr( self.model.config, "output_attentions", False ), "The prediction loop does not work with `output_attentions=True`." assert not getattr( self.model.config, "output_hidden_states", False ), "The prediction loop does not work with `output_hidden_states=True`." ''' model = self.model # multi-gpu eval if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) else: model = self.model # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. ''' batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", self.num_examples(dataloader)) logger.info(" Batch size = %d", batch_size) ''' eval_losses: List[float] = [] preds: torch.Tensor = None label_ids: torch.Tensor = None model.eval() if is_torch_tpu_available(): dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) if self.args.past_index >= 0: self._past = None disable_tqdm = not self.is_local_process_zero() or self.args.disable_tqdm for inputs in tqdm(dataloader, desc=description, disable=disable_tqdm): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only) batch_size = inputs[list(inputs.keys())[0]].shape[0] if loss is not None: eval_losses.extend([loss] * batch_size) if logits is not None: preds = logits if preds is None else nested_concat(preds, logits, dim=0) if labels is not None: label_ids = labels if label_ids is None else nested_concat(label_ids, labels, dim=0) if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of the evaluation loop delattr(self, "_past") if self.args.local_rank != -1: # In distributed mode, concatenate all results from all nodes: if preds is not None: preds = distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) if label_ids is not None: label_ids = distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) elif is_torch_tpu_available(): # tpu-comment: Get all predictions and labels from all worker shards of eval dataset if preds is not None: preds = nested_xla_mesh_reduce(preds, "eval_preds") if label_ids is not None: label_ids = nested_xla_mesh_reduce(label_ids, "eval_label_ids") if eval_losses is not None: eval_losses = xm.mesh_reduce("eval_losses", torch.tensor(eval_losses), torch.cat).tolist() # Finally, turn the aggregated tensors into numpy arrays. if preds is not None: preds = nested_numpify(preds) if label_ids is not None: label_ids = nested_numpify(label_ids) if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} if len(eval_losses) > 0: if self.args.local_rank != -1: metrics["eval_loss"] = ( distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader)) .mean() .item() ) else: metrics["eval_loss"] = np.mean(eval_losses) # Prefix all keys with eval_ for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None): """ Main training entry point. Args: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): The trial run or the hyperparameter dictionary for hyperparameter search. """ # This might change the seed so needs to run first. self._hp_search_setup(trial) # Model re-init if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. set_seed(self.args.seed) model = self.model_init() self.model = model.to(self.args.device) # Reinitializes optimizer and scheduler self.optimizer, self.lr_scheduler = None, None # Data loader and number of training steps train_dataloader = self.get_train_dataloader() num_update_steps_per_epoch = len(train_dataloader) // self.args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int( self.args.max_steps % num_update_steps_per_epoch > 0 ) else: t_total = int(num_update_steps_per_epoch * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs self.args.max_steps = t_total self.create_optimizer_and_scheduler(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states self.optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) self.lr_scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16 and _use_apex: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split(os.path.sep)[0]) epochs_trained = self.global_step // num_update_steps_per_epoch steps_trained_in_current_epoch = self.global_step % (num_update_steps_per_epoch) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss_sum = 0.0 loss_sum = defaultdict(float) best = {self.best_metric: None} model.zero_grad() disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero() train_pbar = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm) for epoch in range(epochs_trained, int(np.ceil(num_train_epochs))): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) if is_torch_tpu_available(): parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) epoch_iterator = parallel_loader else: epoch_iterator = train_dataloader # Reset the past mems state at the beginning of each epoch if necessary. if self.args.past_index >= 0: self._past = None epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm) for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 epoch_pbar.update(1) continue model.train() inputs = self._prepare_inputs(inputs) inputs["output_attentions"] = self.length_drop_args.length_config is not None layer_config = sample_layer_configuration( model.config.num_hidden_layers, layer_dropout_prob=self.length_drop_args.layer_dropout_prob, layer_dropout=0, ) inputs["layer_config"] = layer_config inputs["length_config"] = self.length_drop_args.length_config outputs = model(**inputs) # Save past state if it exists if self.args.past_index >= 0: self._past = outputs[self.args.past_index] task_loss = self.div_loss(outputs[0]) if self.length_drop_args.length_adaptive: loss_sum["full"] += task_loss.item() loss = task_loss if self.length_drop_args.length_adaptive: loss = loss / (self.length_drop_args.num_sandwich + 2) tr_loss_sum += loss.item() if self.args.fp16 and _use_native_amp: self.scaler.scale(loss).backward() elif self.args.fp16 and _use_apex: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # inplace distillation if self.length_drop_args.length_adaptive: logits = outputs[1].detach() for i in range(self.length_drop_args.num_sandwich + 1): inputs["output_attentions"] = True layer_config = sample_layer_configuration( model.config.num_hidden_layers, layer_dropout_prob=self.length_drop_args.layer_dropout_prob, layer_dropout=(self.length_drop_args.layer_dropout_bound if i == 0 else None), layer_dropout_bound=self.length_drop_args.layer_dropout_bound, ) inputs["layer_config"] = layer_config length_config = sample_length_configuration( self.args.max_seq_length, model.config.num_hidden_layers, layer_config, length_drop_ratio=(self.length_drop_args.length_drop_ratio_bound if i == 0 else None), length_drop_ratio_bound=self.length_drop_args.length_drop_ratio_bound, ) inputs["length_config"] = length_config outputs_sub = model(**inputs) task_loss_sub = self.div_loss(outputs_sub[0]) if i == 0: loss_sum["smallest"] += task_loss_sub.item() loss_sum["sub"] += 0 else: loss_sum["sub"] += task_loss_sub.item() / self.length_drop_args.num_sandwich logits_sub = outputs_sub[1] loss_fct = KLDivLoss(reduction="batchmean") kl_loss = loss_fct(F.log_softmax(logits, -1), F.softmax(logits_sub, -1)) loss = self.div_loss(kl_loss) loss_sum["kl"] += loss.item() / (self.length_drop_args.num_sandwich + 1) loss = loss / (self.length_drop_args.num_sandwich + 2) tr_loss_sum += loss.item() if self.args.fp16 and _use_native_amp: self.scaler.scale(loss).backward() elif self.args.fp16 and _use_apex: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps (step + 1) == len(epoch_iterator) <= self.args.gradient_accumulation_steps ): if self.args.fp16 and _use_native_amp: self.scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) elif self.args.fp16 and _use_apex: torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_torch_tpu_available(): xm.optimizer_step(self.optimizer) elif self.args.fp16 and _use_native_amp: self.scaler.step(self.optimizer) self.scaler.update() else: self.optimizer.step() self.lr_scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): # backward compatibility for pytorch schedulers lr = ( self.lr_scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else self.lr_scheduler.get_lr()[0] ) loss = tr_loss_sum / self.args.logging_steps tr_loss_sum = 0.0 logs = {"lr": lr, "loss": loss} log_str = f"[{self.global_step:5d}] lr {lr:g} | loss {loss:2.3f}" for key, value in loss_sum.items(): value /= self.args.logging_steps loss_sum[key] = 0.0 logs[f"{key}_loss"] = value log_str += f" | {key}_loss {value:2.3f}" self.log(logs, "train") logger.info(log_str) ''' if ( self.args.evaluation_strategy == EvaluationStrategy.STEPS and self.global_step % self.args.eval_steps == 0 ): results = self.evaluate() self._report_to_hp_search(trial, epoch, results) ''' if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. if hasattr(model, "module"): assert ( model.module is self.model ), f"Module {model.module} should be a reference to self.model" else: assert model is self.model, f"Model {model} should be a reference to self.model" if self.args.evaluate_during_training: results = self.evaluate() results = {k[5:]: v for k, v in results.items() if k.startswith("eval_")} self.log(results, "dev") msg = " | ".join([f"{k} {v:.3f}" for k, v in results.items()]) logger.info(f" [{self.global_step:5d}] {msg}") # Save model checkpoint if self.args.save_only_best: output_dirs = [] else: checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}" if self.hp_search_backend is not None and trial is not None: run_id = ( trial.number if self.hp_search_backend == HPSearchBackend.OPTUNA else tune.get_trial_id() ) checkpoint_folder += f"-run-{run_id}" output_dirs = [os.path.join(self.args.output_dir, checkpoint_folder)] if self.args.evaluate_during_training: if best[self.best_metric] is None or results[self.best_metric] > best[self.best_metric]: logger.info("Congratulations, best model so far!") output_dirs.append(os.path.join(self.args.output_dir, "checkpoint-best")) best = results for output_dir in output_dirs: self.save_model(output_dir) if self.is_world_master() and self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) if self.is_world_process_zero(): self._rotate_checkpoints(use_mtime=True) ''' if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) elif self.is_world_process_zero(): torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) ''' epoch_pbar.update(1) if 0 < self.args.max_steps <= self.global_step: break epoch_pbar.close() train_pbar.update(1) ''' if self.args.evaluation_strategy == EvaluationStrategy.EPOCH: results = self.evaluate() self._report_to_hp_search(trial, epoch, results) ''' if self.args.tpu_metrics_debug or self.args.debug: if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) else: logger.warning( "You enabled PyTorch/XLA debug metrics but you don't have a TPU " "configured. Check your training configuration if this is unexpected." ) if 0 < self.args.max_steps <= self.global_step: break train_pbar.close() if self.tb_writer: self.tb_writer.close() if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return self.global_step, best
_use_native_amp = False _use_apex = False # Check if Pytorch version >= 1.6 to switch between Native AMP and Apex if version.parse(torch.__version__) < version.parse("1.6"): from transformers.file_utils import is_apex_available if is_apex_available(): from apex import amp _use_apex = True else: _use_native_amp = True if is_torch_tpu_available(): import torch_xla.core.xla_model as xm import torch_xla.debug.metrics as met import torch_xla.distributed.parallel_loader as pl if is_tensorboard_available(): try: from torch.utils.tensorboard import SummaryWriter except ImportError: from tensorboardX import SummaryWriter if is_wandb_available(): import wandb if is_comet_available(): import comet_ml
def _save_checkpoint(self, model, trial, metrics=None): """ Compared to original implementation, we change the saving policy to only save the best-validation checkpoints. """ # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we # want to save. assert _model_unwrap(model) is self.model, "internal model should be a reference to self.model" # Determine the new best metric / best model checkpoint if metrics is not None and self.args.metric_for_best_model is not None: metric_to_check = self.args.metric_for_best_model if not metric_to_check.startswith("eval_"): metric_to_check = f"eval_{metric_to_check}" metric_value = metrics[metric_to_check] operator = np.greater if self.args.greater_is_better else np.less if ( self.state.best_metric is None or self.state.best_model_checkpoint is None or operator(metric_value, self.state.best_metric) ): output_dir = self.args.output_dir self.state.best_metric = metric_value self.state.best_model_checkpoint = output_dir # Only save model when it is the best one self.save_model(output_dir) if self.deepspeed: self.deepspeed.save_checkpoint(output_dir) # Save optimizer and scheduler if self.sharded_dpp: self.optimizer.consolidate_state_dict() if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) with warnings.catch_warnings(record=True) as caught_warnings: xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) reissue_pt_warnings(caught_warnings) elif self.is_world_process_zero() and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) with warnings.catch_warnings(record=True) as caught_warnings: torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) reissue_pt_warnings(caught_warnings) # Save the Trainer state if self.is_world_process_zero(): self.state.save_to_json(os.path.join(output_dir, "trainer_state.json")) else: # Save model checkpoint checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" if self.hp_search_backend is not None and trial is not None: if self.hp_search_backend == HPSearchBackend.OPTUNA: run_id = trial.number else: from ray import tune run_id = tune.get_trial_id() run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}" output_dir = os.path.join(self.args.output_dir, run_name, checkpoint_folder) else: output_dir = os.path.join(self.args.output_dir, checkpoint_folder) self.store_flos() self.save_model(output_dir) if self.deepspeed: self.deepspeed.save_checkpoint(output_dir) # Save optimizer and scheduler if self.sharded_dpp: self.optimizer.consolidate_state_dict() if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) with warnings.catch_warnings(record=True) as caught_warnings: xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) reissue_pt_warnings(caught_warnings) elif self.is_world_process_zero() and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) with warnings.catch_warnings(record=True) as caught_warnings: torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) reissue_pt_warnings(caught_warnings) # Save the Trainer state if self.is_world_process_zero(): self.state.save_to_json(os.path.join(output_dir, "trainer_state.json")) # Maybe delete some older checkpoints. if self.is_world_process_zero(): self._rotate_checkpoints(use_mtime=True)
def _prediction_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. Works both with or without labels. NOTE: One issue is on the size of prediction and labels. For current code, it considers all the prediction and labels in different batch have same length of sequence. This is not true for our application. To make this more general, I will reformat the predictions and labels. """ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only model = self.model # multi-gpu eval if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) else: model = self.model # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", self.num_examples(dataloader)) logger.info(" Batch size = %d", batch_size) eval_losses: List[float] = [] preds: torch.Tensor = None preds_size: torch.Tensor = None label_ids: torch.Tensor = None label_size: torch.Tensor = None model.eval() if is_torch_tpu_available(): dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) for inputs in tqdm(dataloader, desc=description): has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"]) for k, v in inputs.items(): if isinstance(v, torch.Tensor): inputs[k] = v.to(self.args.device) with torch.no_grad(): outputs = model(**inputs) if has_labels: step_eval_loss, logits = outputs[:2] eval_losses += [step_eval_loss.mean().item()] else: logits = outputs[0] if not prediction_loss_only: # Change the way of concat # We need to make sure that the size of preds and labels is (batch_size, sequence_length) if preds is None: preds = logits.detach() preds_size = preds.new_full(size=preds.size()[:1], fill_value=preds.size(1)).detach() preds = preds.view(-1) else: preds_size = torch.cat((preds_size, logits.new_full(size=logits.size()[:1], fill_value=logits.size(1)).detach()), dim=0) preds = torch.cat((preds, logits.detach().view(-1)), dim=0) if inputs.get("labels") is not None: if label_ids is None: label_ids = inputs["labels"].detach() label_size = label_ids.new_full(size=label_ids.size()[:1], fill_value=label_ids.size(1)).detach() label_ids = label_ids.view(-1) else: label_size = torch.cat((label_size, inputs["labels"].new_full(size=inputs["labels"].size()[:1], fill_value=inputs["labels"].size(1)).detach()), dim=0) label_ids = torch.cat((label_ids, inputs["labels"].detach().view(-1)), dim=0) if self.args.local_rank != -1: # In distributed mode, concatenate all results from all nodes: if preds is not None: # preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) preds, preds_size = self.distributed_concat_with_size(preds, preds_size, num_total_examples=self.num_examples(dataloader)) if label_ids is not None: # label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) label_ids, label_size = self.distributed_concat_with_size(label_ids, label_size, num_total_examples=self.num_examples(dataloader)) elif is_torch_tpu_available(): # tpu-comment: Get all predictions and labels from all worker shards of eval dataset # NOTE: We do not modify this for now. if preds is not None: preds = xm.mesh_reduce("eval_preds", preds, torch.cat) if label_ids is not None: label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat) # Finally, turn the aggregated tensors into numpy arrays. if preds is not None: preds = preds.cpu().numpy() preds_size = preds_size.cpu().numpy() if label_ids is not None: label_ids = label_ids.cpu().numpy() label_size = label_size.cpu().numpy() if self.compute_metrics is not None and preds is not None and label_ids is not None: # metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) metrics = self.compute_metrics(EvalPredictionWithSize(predictions=preds, predictions_size=preds_size, label_ids=label_ids, label_size=label_size)) else: metrics = {} if len(eval_losses) > 0: metrics["eval_loss"] = np.mean(eval_losses) # Prefix all keys with eval_ for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) # return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) return PredictionOutputWithSize(predictions=preds, predictions_size=preds_size, label_ids=label_ids, label_size=label_size, metrics=metrics)
def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None): """ Main training entry point. Args: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): The trial run or the hyperparameter dictionary for hyperparameter search. The main difference between ours and Huggingface's original implementation is that we also load model_args when reloading best checkpoints for evaluation. """ # This might change the seed so needs to run first. self._hp_search_setup(trial) # Model re-init if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. set_seed(self.args.seed) model = self.call_model_init(trial) if not self.is_model_parallel: model = model.to(self.args.device) self.model = model self.model_wrapped = model # Reinitializes optimizer and scheduler self.optimizer, self.lr_scheduler = None, None # Keeping track whether we can can len() on the dataset or not train_dataset_is_sized = isinstance(self.train_dataset, collections.abc.Sized) # Data loader and number of training steps train_dataloader = self.get_train_dataloader() # Setting up training control variables: # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps if train_dataset_is_sized: num_update_steps_per_epoch = len(train_dataloader) // self.args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) if self.args.max_steps > 0: max_steps = self.args.max_steps num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int( self.args.max_steps % num_update_steps_per_epoch > 0 ) else: max_steps = math.ceil(self.args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(self.args.num_train_epochs) else: # see __init__. max_steps is set when the dataset has no __len__ max_steps = self.args.max_steps num_train_epochs = 1 num_update_steps_per_epoch = max_steps if self.args.deepspeed: model, optimizer, lr_scheduler = init_deepspeed(self, num_training_steps=max_steps) self.model = model.module self.model_wrapped = model # will get further wrapped in DDP self.deepspeed = model # DeepSpeedEngine object self.optimizer = optimizer self.lr_scheduler = lr_scheduler else: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(model_path) model = self.model_wrapped # Mixed precision training with apex (torch < 1.6) if self.use_apex: model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level) # Multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.sharded_dpp: model = ShardedDDP(model, self.optimizer) elif self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=( not getattr(model.config, "gradient_checkpointing", False) if isinstance(model, PreTrainedModel) else True ), ) # find_unused_parameters breaks checkpointing as per # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021 # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model # important: at this point: # self.model is the Transformers Model # self.model_wrapped is DDP(Transformers Model), DDP(Deepspeed(Transformers Model)), etc. # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) num_examples = ( self.num_examples(train_dataloader) if train_dataset_is_sized else total_train_batch_size * self.args.max_steps ) logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples}") logger.info(f" Num Epochs = {num_train_epochs}") logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") logger.info(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_steps}") self.state.epoch = 0 start_time = time.time() epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path and os.path.isfile(os.path.join(model_path, "trainer_state.json")): self.state = TrainerState.load_from_json(os.path.join(model_path, "trainer_state.json")) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not self.args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) steps_trained_in_current_epoch *= self.args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(f" Continuing training from epoch {epochs_trained}") logger.info(f" Continuing training from global step {self.state.global_step}") if not self.args.ignore_data_skip: logger.info( f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " "batches in the first epoch." ) # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None self.state.trial_params = hp_params(trial) if trial is not None else None # This should be the same if the state has been saved but in case the training arguments changed, it's safer # to set this after the load. self.state.max_steps = max_steps self.state.num_train_epochs = num_train_epochs self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(self.args.device) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses self._total_loss_scalar = 0.0 self._globalstep_last_logged = 0 self._total_flos = self.state.total_flos model.zero_grad() self.control = self.callback_handler.on_train_begin(self.args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not self.args.ignore_data_skip: for epoch in range(epochs_trained): # We just need to begin an iteration to create the randomization of the sampler. for _ in train_dataloader: break for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) epoch_iterator = train_dataloader # Reset the past mems state at the beginning of each epoch if necessary. if self.args.past_index >= 0: self._past = None steps_in_epoch = len(train_dataloader) if train_dataset_is_sized else self.args.max_steps self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control) assert train_dataset_is_sized, "currently we only support sized dataloader!" inputs = None last_inputs = None for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if (step + 1) % self.args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control) if ((step + 1) % self.args.gradient_accumulation_steps != 0) and self.args.local_rank != -1: # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. with model.no_sync(): tr_loss += self.training_step(model, inputs) else: tr_loss += self.training_step(model, inputs) self._total_flos += self.floating_point_ops(inputs) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= self.args.gradient_accumulation_steps and (step + 1) == steps_in_epoch ): # Gradient clipping if self.args.max_grad_norm is not None and self.args.max_grad_norm > 0 and not self.deepspeed: # deepspeed does its own clipping if self.use_amp: # AMP: gradients need unscaling self.scaler.unscale_(self.optimizer) if hasattr(self.optimizer, "clip_grad_norm"): # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping self.optimizer.clip_grad_norm(self.args.max_grad_norm) else: # Revert to normal clipping otherwise, handling Apex or full precision torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer) if self.use_apex else model.parameters(), self.args.max_grad_norm, ) # Optimizer step if is_torch_tpu_available(): xm.optimizer_step(self.optimizer) elif self.use_amp: self.scaler.step(self.optimizer) self.scaler.update() else: self.optimizer.step() self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 self.state.epoch = epoch + (step + 1) / steps_in_epoch self.control = self.callback_handler.on_step_end(self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) if self.control.should_epoch_stop or self.control.should_training_stop: break self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) if self.args.tpu_metrics_debug or self.args.debug: if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) else: logger.warning( "You enabled PyTorch/XLA debug metrics but you don't have a TPU " "configured. Check your training configuration if this is unexpected." ) if self.control.should_training_stop: break if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None: logger.info( f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." ) if isinstance(self.model, PreTrainedModel): self.model = self.model.from_pretrained(self.state.best_model_checkpoint, model_args=self.model_args) if not self.is_model_parallel: self.model = self.model.to(self.args.device) else: state_dict = torch.load(os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)) self.model.load_state_dict(state_dict) if self.deepspeed: self.deepspeed.load_checkpoint( self.state.best_model_checkpoint, load_optimizer_states=False, load_lr_scheduler_states=False ) metrics = speed_metrics("train", start_time, self.state.max_steps) if self._total_flos is not None: self.store_flos() metrics["total_flos"] = self.state.total_flos self.log(metrics) self.control = self.callback_handler.on_train_end(self.args, self.state, self.control) # add remaining tr_loss self._total_loss_scalar += tr_loss.item() return TrainOutput(self.state.global_step, self._total_loss_scalar / self.state.global_step, metrics)
def is_local_master(self) -> bool: if is_torch_tpu_available(): return xm.is_master_ordinal(local=True) else: return self.args.local_rank in [-1, 0]
def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path: (Optional) Local path to model if model to train has been instantiated from a local path If present, we will try reloading the optimizer/scheduler states from there. """ train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master() or not self.args.logging_tqdm ) for epoch in train_iterator: if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) if is_torch_tpu_available(): parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master() or not self.args.logging_tqdm) else: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master() or not self.args.logging_tqdm) for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue tr_loss += self._training_step(model, inputs, optimizer) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator) ): if self.args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_torch_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) logging_loss = tr_loss self._log(logs) if (self.args.eval_steps > 0 and self.global_step % self.args.eval_steps == 0): if self.args.evaluate_during_training: self.evaluate() if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. if hasattr(model, "module"): assert model.module is self.model else: assert model is self.model # Save model checkpoint output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}") self.save_model(output_dir) if self.is_world_master(): self._rotate_checkpoints() if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) elif self.is_world_master(): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: epoch_iterator.close() break if self.args.max_steps > 0 and self.global_step > self.args.max_steps: train_iterator.close() break if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return TrainOutput(self.global_step, tr_loss / self.global_step)
def prediction_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", ) -> PredictionOutput: """ Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. Works both with or without labels. """ if not isinstance(dataloader.dataset, collections.abc.Sized): raise ValueError("dataset must implement __len__") prediction_loss_only = ( prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) model = self.model # multi-gpu eval if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. batch_size = dataloader.batch_size num_examples = self.num_examples(dataloader) logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", num_examples) logger.info(" Batch size = %d", batch_size) losses_host: torch.Tensor = None preds_host: Union[torch.Tensor, List[torch.Tensor]] = None labels_host: Union[torch.Tensor, List[torch.Tensor]] = None world_size = 1 if is_torch_tpu_available(): world_size = xm.xrt_world_size() elif self.args.local_rank != -1: world_size = torch.distributed.get_world_size() world_size = max(1, world_size) eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) if not prediction_loss_only: preds_gatherer = DistributedTensorGatherer(world_size, num_examples) labels_gatherer = DistributedTensorGatherer(world_size, num_examples) model.eval() if is_torch_tpu_available(): dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) if self.args.past_index >= 0: self._past = None self.callback_handler.eval_dataloader = dataloader for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) if loss is not None: losses = loss.repeat(batch_size) losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) if logits is not None: # preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) logits_reduced = logits.argmax(-1) preds_host = logits_reduced if preds_host is None else nested_concat(preds_host, logits_reduced, padding_index=-100) if labels is not None: labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) # Set back to None to begin a new accumulation losses_host, preds_host, labels_host = None, None, None if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of the evaluation loop delattr(self, "_past") # Gather all remaining tensors and put them back on the CPU eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} if eval_loss is not None: metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() # Prefix all keys with metric_key_prefix + '_' for key in list(metrics.keys()): if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)