def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    if torch.cuda.is_available():
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    # Get tqdm for the training batches
    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
    num_training_batches = self._iterator.get_num_batches(self._train_data)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    for batch in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self._optimizer.zero_grad()
        loss = self._batch_loss(batch, for_training=True)
        loss.backward()

        # Make sure Variable is on the cpu before converting to numpy.
        # .cpu() is a no-op if you aren't using GPUs.
        train_loss += loss.data.cpu().numpy()

        batch_grad_norm = self._rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        self._optimizer.step()

        # Update the description with the latest metrics
        metrics = self._get_metrics(train_loss, batches_this_epoch)
        description = self._description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        if hasattr(self, "_tf_params") and self._tf_params is not None:
            # We have TF logging
            if self._batch_num_total % self._tf_params["log_every"] == 0:
                self._tf_log(metrics, self._batch_num_total)

    return self._get_metrics(train_loss, batches_this_epoch, reset=True)

def monitor(min_memory: int, check_interval: int) -> List[Tuple[int, int]]:
    """Block until at least one GPU reports less than `min_memory` MB in use."""
    available_gpu = []
    while not available_gpu:
        for gpu, memory in gpu_memory_mb().items():
            if memory < min_memory:
                available_gpu.append((gpu, memory))
        # Only wait if no GPU was available on this pass.
        if not available_gpu:
            sleep(check_interval)
    return available_gpu

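# Hedged usage sketch for the `monitor` helper above; it is not part of the original
# snippet. It assumes this runs in the same module, i.e. that `gpu_memory_mb` (e.g.
# from allennlp.common.util) and `sleep` (from time) are already imported; the
# threshold and polling interval below are illustrative values only.
if __name__ == "__main__":
    free_gpus = monitor(min_memory=1000, check_interval=30)
    # Each entry is (gpu_id, used_memory_MB), e.g. [(0, 512)].
    print(f"GPUs reporting less than 1000 MB in use: {free_gpus}")
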
def measure_cpu_gpu(self, trainer: "CallbackTrainer"):
    # This used to be in train_epoch()
    logger.info("Epoch %d/%d", trainer.epoch_number, trainer.num_epochs - 1)

    self.peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {self.peak_cpu_usage}")
    self.gpu_usage.clear()
    for gpu, memory in gpu_memory_mb().items():
        self.gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    metrics = {
        'accuracy': self.metrics['accuracy'].get_metric(reset=reset)
    }
    metrics.update({
        'average_precision': self.metrics['average_precision'].get_metric(reset=reset),
        'f1': self.metrics['f1_score'].get_metric(reset=reset),
        'auc': self.metrics['auc'].get_metric(reset=reset)
    })
    for (gpu_num, memory) in gpu_memory_mb().items():
        metrics.update({'gpu_batch_' + str(gpu_num) + '_memory_MB': memory})
    return metrics

def __call__(
    self,
    trainer: "GradientDescentTrainer",
    batch_inputs: List[List[TensorDict]],
    batch_outputs: List[Dict[str, Any]],
    epoch: int,
    batch_number: int,
    is_training: bool,
    is_master: bool,
) -> None:
    # In the distributed case we need to call this from every worker, since every
    # worker reports its own memory usage.
    cpu_memory_usage = common_util.peak_memory_mb()
    # But we only want to call `gpu_memory_mb` and `log_memory_usage` from the
    # master process.
    if is_master:
        gpu_memory_usage = common_util.gpu_memory_mb()
        trainer._tensorboard.log_memory_usage(cpu_memory_usage, gpu_memory_usage)

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = common_util.peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in common_util.gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    train_reg_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)

    logger.info("Training")

    num_training_batches = math.ceil(
        len(self.data_loader) / self._num_gradient_accumulation_steps)
    # Having multiple tqdm bars in case of distributed training will be a mess.
    # Hence only the master's progress is shown.
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    done_early = False
    for batch_group in batch_group_generator_tqdm:
        if self._distributed:
            # Check whether the other workers have stopped already (due to differing amounts of
            # data in each). If so, we can't proceed because we would hang when we hit the
            # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor
            # here because NCCL process groups apparently don't support BoolTensor.
            done = torch.tensor(0, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            if done.item() > 0:
                done_early = True
                logger.warning(
                    f"Worker {torch.distributed.get_rank()} finishing training early! "
                    "This implies that there is an imbalance in your training "
                    "data across the workers and that some amount of it will be "
                    "ignored. A small amount of this is fine, but a major imbalance "
                    "should be avoided. Note: This warning will appear unless your "
                    "data is perfectly balanced.")
                break

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        batch_group_outputs = []
        for batch in batch_group:
            batch_outputs = self.batch_outputs(batch, for_training=True)
            batch_group_outputs.append(batch_outputs)
            loss = batch_outputs["loss"]
            reg_loss = batch_outputs["reg_loss"]
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss = loss / len(batch_group)
            reg_loss = reg_loss / len(batch_group)
            if self._opt_level is not None:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss += loss.item()
            train_reg_loss += reg_loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        param_updates = None
        if self._tensorboard.should_log_histograms_this_batch() and self._master:
            # Get the magnitude of parameter updates for logging. We need to do some
            # computation before and after the optimizer step, and it's expensive because of
            # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
            # we don't do this every batch, only when it's requested.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)
            self._tensorboard.log_batch(self.model, self.optimizer, batch_grad_norm,
                                        metrics, batch_group, param_updates)

        if self._master:
            self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

        for callback in self._batch_callbacks:
            callback(
                self,
                batch_group,
                batch_group_outputs,
                epoch,
                batches_this_epoch,
                is_training=True,
            )

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
        )
        # Indicate that we're done so that any workers that have remaining data stop the epoch early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    for gpu, memory in gpu_memory_mb().items():
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    # Get tqdm for the training batches
    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
    num_training_batches = self._iterator.get_num_batches(self._train_data)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    if self._histogram_interval is not None:
        histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    for batch in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self._log_histograms_this_batch = self._histogram_interval is not None and (
            batch_num_total % self._histogram_interval == 0)

        self._optimizer.zero_grad()
        loss = self._batch_loss(batch, for_training=True)
        loss.backward()

        # Make sure Variable is on the cpu before converting to numpy.
        # .cpu() is a no-op if you aren't using GPUs.
        train_loss += loss.data.cpu().numpy()

        batch_grad_norm = self._rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        if self._log_histograms_this_batch:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {name: param.detach().data.cpu().clone()
                             for name, param in self._model.named_parameters()}
            self._optimizer.step()
            for name, param in self._model.named_parameters():
                param_updates[name].sub_(param.detach().data.cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, ))
                self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                   update_norm / (param_norm + 1e-7),
                                                   batch_num_total)
        else:
            self._optimizer.step()

        # Update the description with the latest metrics
        metrics = self._get_metrics(train_loss, batches_this_epoch)
        description = self._description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if batch_num_total % self._summary_interval == 0:
            self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
            self._metrics_to_tensorboard(batch_num_total,
                                         {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._log_histograms_this_batch:
            self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint(
                '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
            )

    return self._get_metrics(train_loss, batches_this_epoch, reset=True)

def log_memory_usage(self):
    cpu_memory_usage = peak_memory_mb()
    self.add_train_scalar("memory_usage/cpu", cpu_memory_usage)
    for gpu, memory in gpu_memory_mb().items():
        self.add_train_scalar(f"memory_usage/gpu_{gpu}", memory)

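# Hedged, self-contained sketch of the same memory-logging pattern without the
# trainer's TensorBoard wrapper above. It assumes `psutil` and TensorBoard are
# installed; the tag names and the MB conversion are illustrative choices, not
# taken from the original snippet.
import psutil
import torch
from torch.utils.tensorboard import SummaryWriter

def log_memory(writer: SummaryWriter, step: int) -> None:
    # Resident set size of the current process, in MB.
    cpu_mb = psutil.Process().memory_info().rss / 1_000_000
    writer.add_scalar("memory_usage/cpu", cpu_mb, step)
    if torch.cuda.is_available():
        for gpu in range(torch.cuda.device_count()):
            gpu_mb = torch.cuda.memory_allocated(gpu) / 1_000_000
            writer.add_scalar(f"memory_usage/gpu_{gpu}", gpu_mb, step)
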
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)  # Returns 1 even when no GPU is available.

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)  # Just a progress bar.
    cumulative_batch_size = 0
    self.optimizer.zero_grad()
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')
        try:
            # The input data excludes examples where every label is 'keep'.
            loss = self.batch_loss(batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
            raise e

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        train_loss += loss.item() * iter_len

        del batch_group, loss
        torch.cuda.empty_cache()  # Frees host and GPU memory.

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()  # Only step after accumulating several batches.
                self.optimizer.zero_grad()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)  # Accuracy etc.
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed, at a fixed time interval.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    # Get tqdm for the training batches
    train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    num_training_batches = self.iterator.get_num_batches(self.train_data)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    cumulative_batch_size = 0
    self.optimizer.zero_grad()
    for batch_id, batch in enumerate(train_generator_tqdm):
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        loss = self.batch_loss(batch, for_training=True)
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()
        train_loss += loss.item()
        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            if (batch_id + 1) % self._accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            if (batch_id + 1) % self._accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = training_util.get_batch_size(batch)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    num_training_batches = [
        math.ceil(
            self.iterator.get_num_batches(train_data) / self._num_gradient_accumulation_steps
        )
        for task, train_data in self.train_datas.items()
    ]
    assert len(set(num_training_batches)) == 1, "num_training_batches doesn't agree"

    tasks = list(self.batch_group_generators.keys())
    num_tasks = len(tasks)

    # if isinstance(self._learning_rate_scheduler, SlantedTriangular):
    #     old_num_steps_per_epoch = self._learning_rate_scheduler.num_steps_per_epoch
    #     self._learning_rate_scheduler.num_steps_per_epoch = num_training_batches[0]
    #     logger.info(f"modify num_steps_per_epoch of lr scheduler from"
    #                 f"{old_num_steps_per_epoch} to {num_training_batches}")

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    logger.info("Training")
    cumulative_batch_group_size = 0
    tqdm_bar = Tqdm.tqdm(range(num_training_batches[0]))
    for _ in tqdm_bar:
        randperms = torch.randperm(len(tasks)).tolist()
        sampled_tasks = [tasks[idx] for idx in randperms[:self._tasks_per_step]]
        sampled_task_generators = [next(self.batch_group_generators[task]) for task in sampled_tasks]

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        task_metrics = self.wrapper(tasks=sampled_task_generators, train=True, meta_train=True)
        losses = [list(map(lambda x: x["loss"], metrics)) for metrics in task_metrics]
        LASes = [list(map(lambda x: x["metric"]["LAS"], metrics)) for metrics in task_metrics]
        names = ["loss", "LAS"]
        list_values = [losses, LASes]
        if self.has_VIB:
            KLDivs = [list(map(lambda x: x["metric"]["kl_div"], metrics)) for metrics in task_metrics]
            names.append("KLDiv")
            list_values.append(KLDivs)
        if self.has_pos:
            pos_accs = [
                list(map(lambda x: x["metric"].get("pos_accuracy", 0.0), metrics))
                for metrics in task_metrics
            ]
            names.append("pos_acc")
            list_values.append(pos_accs)

        for name, values in zip(names, list_values):
            self._writer.log(
                {
                    f"step_{name}_{task}_{i}": value
                    for task, task_values in zip(sampled_tasks, values)
                    for i, value in enumerate(task_values)
                },
                step=self._batch_num_total,
            )
            values_inner_steps = list(map(np.mean, zip(*values)))
            self._writer.log(
                {f"step_{name}_{i}": value for i, value in enumerate(values_inner_steps)},
                step=self._batch_num_total,
            )
            if name == "loss":
                train_loss += values_inner_steps[0]

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        # variational information bottleneck / meta-learning without memorization
        if self.has_VIB:
            kl_loss, kl_div, kl_div2 = ContinuousVIB.get_kl_loss(self.model, sampled_task_generators)
            kl_loss.backward()
            self._writer.log(
                {"kl_loss": kl_loss.detach().item(), "kl_div": kl_div, "kl_div2": kl_div2},
                step=self._batch_num_total,
            )

        # adversarial training
        if self.task_D and self.optim_D:
            # D training
            self.optimizer.step()
            steps_per_update = self.task_D.steps_per_update
            if (batch_num_total - 1) % steps_per_update == 0:
                self.optim_D.zero_grad()
                hidden_states, labels, masks = self.task_D.get_hidden_states(
                    self.model, sampled_task_generators
                )
                D_loss, _, acc = self.task_D(hidden_states, labels, masks, detach=True)
                D_loss.backward()
                disc_grad_norm = training_util.rescale_gradients(self.task_D, self.task_D.disc_grad_norm)
                self.optim_D.step()
                self._writer.log(
                    {"D_loss": D_loss.detach().item(), "D_acc": acc}, step=self._batch_num_total
                )
                if disc_grad_norm:
                    self._writer.log(
                        {"D_grad_norm": disc_grad_norm.detach().item()}, step=self._batch_num_total
                    )

            # G training
            hidden_states, labels, masks = self.task_D.get_hidden_states(
                self.model, sampled_task_generators
            )
            _, g_loss, acc = self.task_D(hidden_states, labels, masks)
            if self.task_D.weight:
                alpha = self.task_D.weight
            else:
                alpha = self.task_D.get_alpha(
                    self._batch_num_total, num_training_batches[0] * self._num_epochs
                )
            G_loss = -alpha * g_loss
            G_loss.backward()
            gen_grad_norm = training_util.rescale_gradients(self.model, self.task_D.gen_grad_norm)
            self._writer.log(
                {"G_loss": g_loss.detach().item(), "alpha": alpha, "G_acc": acc},
                step=self._batch_num_total,
            )
            if gen_grad_norm:
                self._writer.log(
                    {"G_grad_norm": gen_grad_norm.detach().item()}, step=self._batch_num_total
                )

        self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.wrapper.container,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            tqdm_bar.set_description(description, refresh=False)

        # log learning rate.
        self._writer.log({"lr": self.optimizer.param_groups[0]['lr']}, step=self._batch_num_total)

        # Save model if needed.
        if (
            self._model_save_interval is not None
            and (time.time() - last_save_time > self._model_save_interval)
            and self._master
        ):
            last_save_time = time.time()
            self._save_checkpoint(
                "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
            )

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.wrapper.container,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    # num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    # train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    # num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
    num_training_batches = 1
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(raw_train_generator, total=num_training_batches)
    cumulative_batch_size = 0
    for batch, lr_mult in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        loss = self.batch_loss(batch, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        # batch_grad_norm = self.rescale_gradients()
        if self._grad_clipping:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._grad_clipping)

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        # We dynamically adjust the learning rate to account for slight variations in the input
        # sequences
        original_lr = self.optimizer.param_groups[0]['lr']
        batch_lr = original_lr * lr_mult
        self.optimizer.param_groups[0]['lr'] = batch_lr

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        self.optimizer.param_groups[0]['lr'] = original_lr

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            # self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch. Differs from the base trainer in that it performs
    Reptile-style meta-updates over multiple task generators.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    raw_generators = []

    # fix max number of batches
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    cumulative_batch_size = 0
    for i in range(0, self.meta_batches):
        train_generators = []
        for i, train_info in enumerate(self.train_data):
            raw_train_generator = self.iterator(train_info, num_epochs=1, shuffle=self.shuffle)
            train_generators.append(lazy_groups_of(raw_train_generator, num_gpus))

        loss_batch = self.reptile_outer_update(train_generators, i, num_gpus)
        # TODO figure out if is important
        train_loss = loss_batch
        print('[info] train_loss is:{}'.format(train_loss))

        # TODO figure out BATCH NORM MAML https://openreview.net/pdf?id=HygBZnRctX
        if self.batch_norm:
            batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        # TODO investigate learning rate scheduling for meta learning
        # if self._learning_rate_scheduler:
        #     self._learning_rate_scheduler.step_batch(batch_num_total)
        # if self._momentum_scheduler:
        #     self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    # Make the training data iterable.
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    # Group the iterable of single instances into lists, one per GPU.
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    # Round up to get the number of batches (total batches / number of GPUs).
    num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
    # The default accumulated batch count is 4; this is the leftover after accumulation.
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    # Training progress bar.
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    cumulative_batch_size = 0
    # Zero the gradients, as usual.
    self.optimizer.zero_grad()
    # Start training.
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        # One effective batch is accumulated_batch_count iterations (gradient accumulation).
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')
        try:
            # Average the loss over the accumulation steps.
            loss = self.batch_loss(batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
            raise e

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        # Backpropagate.
        loss.backward()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        # Accumulate the (un-averaged) loss.
        train_loss += loss.item() * iter_len

        # Delete the two temporaries. Unused temporaries can pile up during PyTorch
        # training and cause out-of-memory errors; emptying the cache clears them.
        del batch_group, loss
        torch.cuda.empty_cache()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        # Rescale (clip) the gradients.
        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        # The learning rate is usually decreased as training progresses; momentum
        # helps the loss escape local minima and saddle points.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                # Apply the accumulated gradients with optimizer.step().
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                # Norm of the update and of the parameter.
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update moving averages. With Adam or SGD, a moving average of the parameters
        # is often kept to smooth updates and improve robustness on held-out data.
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed, at a fixed time interval.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    for gpu, memory in gpu_memory_mb().items():
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    # Get tqdm for the training batches
    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
    num_training_batches = self._iterator.get_num_batches(self._train_data)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    if self._histogram_interval is not None:
        histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    for batch in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self._log_histograms_this_batch = self._histogram_interval is not None and (
            batch_num_total % self._histogram_interval == 0)

        self._optimizer.zero_grad()
        loss = self._batch_loss(batch, for_training=True)
        loss.backward()
        train_loss += loss.item()
        batch_grad_norm = self._rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        if self._log_histograms_this_batch:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {name: param.detach().cpu().clone()
                             for name, param in self._model.named_parameters()}
            self._optimizer.step()
            for name, param in self._model.named_parameters():

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    batch_group_generator = lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps
    )
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps
    )
    # Having multiple tqdm bars in case of distributed training will be a mess.
    # Hence only the master's progress is shown.
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(
            batch_group_generator, total=num_training_batches
        )
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    logger.info("Training")

    cumulative_batch_group_size = 0
    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        for batch in batch_group:
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss = loss / len(batch_group)
            loss.backward()
            self._writer.log({"step_loss": loss.item()}, step=self._batch_num_total)
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)

        self._writer.log({"lr": self.optimizer.param_groups[0]['lr']}, step=self._batch_num_total)

        # Save model if needed.
        if (
            self._model_save_interval is not None
            and (time.time() - last_save_time > self._model_save_interval)
            and self._master
        ):
            last_save_time = time.time()
            self._save_checkpoint(
                "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
            )

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    for gpu, memory in gpu_memory_mb().items():
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    ############
    # Training #
    ############
    logger.info("Training")
    for batch in train_generator:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self._log_histograms_this_batch = self._histogram_interval is not None and (
            batch_num_total % self._histogram_interval == 0)

        self._optimizer.zero_grad()

        ########
        # loss #
        ########
        loss = self._batch_loss(batch, for_training=True)
        loss.backward()

        # Make sure Variable is on the cpu before converting to numpy.
        # .cpu() is a no-op if you aren't using GPUs.
        train_loss += loss.data.cpu().numpy()

        ########################
        # Update Learning Rate #
        ########################
        self._update_learning_rate(None, batch_num_total=batch_num_total)

        #################
        # Update Params #
        #################
        if self._log_histograms_this_batch:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().data.cpu().clone()
                for name, param in self._model.named_parameters()
            }
            self._optimizer.step()
            for name, param in self._model.named_parameters():
                param_updates[name].sub_(param.detach().data.cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, ))
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7),
                    batch_num_total)
        else:
            self._optimizer.step()

        #################
        # Print Metrics #
        #################
        # Update the description with the latest metrics
        if batches_this_epoch % 10 == 0:
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)
            sys.stdout.write("At %d-th batch: %s\n" % (batches_this_epoch, description))
            sys.stdout.flush()

        ##############
        # Save model #
        ##############
        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, time_to_str(int(last_save_time))), [], is_best=False)

    return self._get_metrics(train_loss, batches_this_epoch, reset=True)

def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info(f"Epoch: {epoch}/{self._num_epochs - 1}")
    cpu_memory_usage = []
    for worker, memory in common_util.peak_memory_mb().items():
        cpu_memory_usage.append((worker, memory))
        logger.info(f"Worker {worker} memory usage MB: {memory}")
    gpu_memory_usage = []
    for gpu, memory in common_util.gpu_memory_mb().items():
        gpu_memory_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    for component_optimizer in self.component_optimizers.values():
        component_optimizer.reset_loss('train')

    self.model.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)

    logger.info("Training")

    num_training_batches: Union[int, float]
    try:
        len_data_loader = len(self.data_loader)
        num_training_batches = math.ceil(
            len_data_loader / self._num_gradient_accumulation_steps)
    except TypeError:
        num_training_batches = float("inf")

    batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches)

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    done_early = False
    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.zero_grad()

        batch_group_metrics = []
        meta_batch = deepcopy(batch_group)

        # Train the Sub Models first
        for name, sub_model in self._pytorch_model.component_models.items():
            component_optimizer = self.component_optimizers[name]
            batch_group_outputs, metrics = component_optimizer.process_batch_group(
                batch_group, True, batch_num_total, batches_this_epoch, True)
            batch_group_metrics.append(metrics)
            for i, batch_outputs in enumerate(batch_group_outputs):
                component_output = batch_outputs["output"]
                component_output = component_output.detach()
                meta_batch[i][name] = component_output

        meta_optimizer = self.component_optimizers["meta"]
        meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group(
            meta_batch, True, batch_num_total, batches_this_epoch, False)

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        batch_group_metrics.append(meta_metrics)
        all_metrics = ChainMap(*batch_group_metrics)
        description = training_util.description_from_metrics(all_metrics)
        batch_group_generator_tqdm.set_description(description, refresh=False)

    for (worker, memory) in cpu_memory_usage:
        metrics["worker_" + str(worker) + "_memory_MB"] = memory
    for (gpu_num, memory) in gpu_memory_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return all_metrics

def semi_train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.trainer.model.train() num_gpus = len(self.trainer._cuda_devices) self.trainer._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self.trainer._batch_num_total is None: self.trainer._batch_num_total = 0 histogram_parameters = set( self.trainer.model. get_parameters_for_histogram_tensorboard_logging()) #Pdb().set_trace() mixed_generator, num_training_batches = get_mixer( self.trainer.iterator, self.trainer.train_data, self.trainer.iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled) #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator, self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer) #generator for lambda update mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer.iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm', 1.0) #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm') logger.info("Training") train_generator_tqdm = Tqdm.tqdm(mixed_generator, total=num_training_batches) #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator), # total=num_training_batches) cumulative_batch_size = 0 unlabelled_loss = 0 unlabelled_batches_this_epoch = 0 batches_since_last_step = 0 agg_loss = 0.0 flag = False batch_grad_norm = None for batch_group, group_id in train_generator_tqdm: #print(batch_group[0]['sentence']['tokens'].shape) if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id: continue output_dict = self.batch_loss( batch_group, for_training=True, eval_metric=(group_id == self.labelled_id)) penalties = defaultdict(float) if self.constraints_model is not None: penalties = self.constraints_model( output_dict['task1_tag_logits'], output_dict['task2_tag_logits'], output_dict['mask']) loss = 0.0 if 'loss' in output_dict: loss = output_dict['loss'] train_loss += loss.item() loss += output_dict.get('regularization_penalty', 0.0) loss += self.constraints_wt * penalties['loss'] unlabelled_loss += penalties['loss'].item() if torch.is_tensor( penalties['loss']) else penalties['loss'] agg_loss += loss batches_since_last_step += 1 if batches_since_last_step == self.backprop_after_xbatches: #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss)) batch_grad_norm = self.step(agg_loss) batches_since_last_step = 0 agg_loss = 0.0 flag = False else: flag = True #print("skipp : {}. 
loss: {} agg_loss: {}".format(group_id, loss, agg_loss)) if (group_id != self.labelled_id): unlabelled_batches_this_epoch += 1 #self.trainer.optimizer.zero_grad() #loss.backward() #batch_grad_norm = self.trainer.rescale_gradients() #self.trainer.optimizer.step() else: self.total_supervised_iters += 1.0 batches_this_epoch += 1 self.trainer._batch_num_total += 1 batch_num_total = self.trainer._batch_num_total #self.trainer.optimizer.zero_grad() #loss.backward() #batch_grad_norm = self.trainer.rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self.trainer._learning_rate_scheduler: self.trainer._learning_rate_scheduler.step_batch( batch_num_total) if self.trainer._tensorboard.should_log_histograms_this_batch( ): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = { name: param.detach().cpu().clone() for name, param in self.trainer.model.named_parameters() } #self.trainer.optimizer.step() for name, param in self.trainer.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view( -1, )) param_norm = torch.norm(param.view(-1, )).cpu() self.trainer._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: pass #self.trainer.optimizer.step() # Update moving averages if self.trainer._moving_average is not None: self.trainer._moving_average.apply(batch_num_total) metrics = training_util.get_metrics(self.trainer.model, train_loss, batches_this_epoch) metrics["uloss"] = float( unlabelled_loss / (batches_this_epoch + unlabelled_batches_this_epoch)) # Update the description with the latest metrics description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self.trainer._tensorboard.should_log_this_batch( ) and batch_grad_norm is not None: self.trainer._tensorboard.log_parameter_and_gradient_statistics( self.trainer.model, batch_grad_norm) self.trainer._tensorboard.log_learning_rates( self.trainer.model, self.trainer.optimizer) self.trainer._tensorboard.add_train_scalar( "loss/loss_train", metrics["loss"]) self.trainer._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self.trainer._tensorboard.should_log_histograms_this_batch( ): self.trainer._tensorboard.log_histograms( self.trainer.model, histogram_parameters) if self.trainer._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self.trainer._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self.trainer._tensorboard.add_train_scalar( "current_batch_size", cur_batch) self.trainer._tensorboard.add_train_scalar( "mean_batch_size", average) # Save model if needed.
if self.trainer._model_save_interval is not None and ( time.time() - last_save_time > self.trainer._model_save_interval): last_save_time = time.time() self.trainer._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) #lambda update #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters) and (batches_this_epoch % self.dd_update_freq == 0): if (self.constraints_model is not None) and (self.dd_optimizer is not None) and ( self.total_supervised_iters >= self.dd_warmup_iters ) and (self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq): for batch_group, group_id in mixed_generator_for_lambda: self.lambda_update(batch_group) self.last_lambda_update = self.total_supervised_iters break self.count_lambda_updates += 1 if (self.dd_increase_freq_after is not None) and (self.count_lambda_updates % self.dd_increase_freq_after == 0): self.dd_update_freq += self.dd_increase_freq_by if flag: batch_grad_norm = self.step(agg_loss) batches_since_last_step = 0 agg_loss = 0.0 flag = False #lambda update #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters): if (self.constraints_model is not None) and (self.dd_optimizer is not None) and ( self.total_supervised_iters >= self.dd_warmup_iters) and ( self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq): for batch_group, group_id in mixed_generator_for_lambda: self.lambda_update(batch_group) self.last_lambda_update = self.total_supervised_iters break self.count_lambda_updates += 1 if (self.dd_increase_freq_after is not None) and (self.count_lambda_updates % self.dd_increase_freq_after == 0): self.dd_update_freq += self.dd_increase_freq_by metrics = training_util.get_metrics(self.trainer.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage metrics['lb'] = batches_this_epoch metrics['ub'] = unlabelled_batches_this_epoch metrics["uloss"] = float( unlabelled_loss / (batches_this_epoch + unlabelled_batches_this_epoch)) if self.constraints_model is not None: lambda_stats_dict = self.constraints_model.lambda_stats() metrics.update(lambda_stats_dict) for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
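# semi_train_epoch above sums losses into `agg_loss` and only backpropagates once every
# `backprop_after_xbatches` batches, then flushes any leftover aggregate (the `flag` check)
# after the batch loop. A compact sketch of that schedule with a dummy model and optimizer;
# the names here are illustrative rather than the trainer's own attributes.
import torch
from torch import nn


def train_with_delayed_backprop(model: nn.Module,
                                optimizer: torch.optim.Optimizer,
                                batches,
                                step_every: int) -> int:
    agg_loss = None
    pending = 0
    steps_taken = 0
    for features, targets in batches:
        loss = nn.functional.mse_loss(model(features), targets)
        agg_loss = loss if agg_loss is None else agg_loss + loss
        pending += 1
        if pending == step_every:
            optimizer.zero_grad()
            agg_loss.backward()
            optimizer.step()
            agg_loss, pending = None, 0
            steps_taken += 1
    # Flush a leftover partial group so the tail of the epoch is not silently dropped.
    if agg_loss is not None:
        optimizer.zero_grad()
        agg_loss.backward()
        optimizer.step()
        steps_taken += 1
    return steps_taken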
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 train_loss_lang1 = 0.0 train_loss_lang2 = 0.0 train_loss_cm = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() self.optimizer_lang1.zero_grad() self.optimizer_lang2.zero_grad() self.optimizer_cm.zero_grad() loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss( batch_group, for_training=True) if torch.isnan(loss): # if either on of loss_%s is nan, loss will be nan raise ValueError("nan loss encountered") ####### # lang1 ####### loss_lang1.backward() train_loss_lang1 += loss_lang1.item() self.rescale_gradients() if self._learning_rate_scheduler_lang1: self._learning_rate_scheduler_lang1.step_batch(batch_num_total) if self._momentum_scheduler_lang1: self._momentum_scheduler_lang1.step_batch(batch_num_total) self.optimizer_lang1.step() self.optimizer_lang1.zero_grad() ####### # cm ####### loss_lang2.backward() train_loss_lang2 += loss_lang2.item() batch_grad_norm = self.rescale_gradients() if self._learning_rate_scheduler_lang2: self._learning_rate_scheduler_lang2.step_batch(batch_num_total) if self._momentum_scheduler_lang2: self._momentum_scheduler_lang2.step_batch(batch_num_total) self.optimizer_lang2.step() self.optimizer_lang2.zero_grad() ####### # lang2 ####### loss_cm.backward() train_loss_cm += loss_cm.item() self.rescale_gradients() if self._learning_rate_scheduler_cm: self._learning_rate_scheduler_cm.step_batch(batch_num_total) if self._momentum_scheduler_cm: self._momentum_scheduler_cm.step_batch(batch_num_total) self.optimizer_cm.step() self.optimizer_cm.zero_grad() train_loss += loss.item() # Update the description with the latest metrics # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) metrics = self.model.get_metrics(False) metrics["loss"] = float( train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["cm_loss"] = float( train_loss_cm / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["lang1_loss"] = float( train_loss_lang1 / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["lang2_loss"] = float( train_loss_lang2 / batches_this_epoch) if batches_this_epoch > 0 else 0.0 description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if 
self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer_lang1) self._tensorboard.log_learning_rates(self.model, self.optimizer_lang2) self._tensorboard.log_learning_rates(self.model, self.optimizer_cm) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.add_train_scalar("loss/cm_loss_train", metrics["cm_loss"]) self._tensorboard.add_train_scalar("loss/lang1_loss_train", metrics["lang1_loss"]) self._tensorboard.add_train_scalar("loss/lang2_loss_train", metrics["lang2_loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics = self.model.get_metrics(reset=True) metrics["loss"] = float( train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["cm_loss"] = float( train_loss_cm / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["lang1_loss"] = float( train_loss_lang1 / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["lang2_loss"] = float( train_loss_lang2 / batches_this_epoch) if batches_this_epoch > 0 else 0.0 metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
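# The trainer above maintains one optimizer (and scheduler pair) per loss term -- lang1, lang2
# and the code-mixed "cm" loss -- and steps each on its own backward pass. A small sketch of
# that per-loss stepping, assuming each optimizer owns the parameters its loss touches; the
# dictionary-based interface is illustrative only.
from typing import Dict
import torch


def step_each_loss(optimizers: Dict[str, torch.optim.Optimizer],
                   losses: Dict[str, torch.Tensor]) -> Dict[str, float]:
    reported = {}
    for name, loss in losses.items():
        optimizer = optimizers[name]
        optimizer.zero_grad()
        # retain_graph is needed when the losses share parts of the same forward graph,
        # as they do when they all come from a single batch_loss() call.
        loss.backward(retain_graph=True)
        optimizer.step()
        optimizer.zero_grad()
        reported[f"{name}_loss"] = loss.item()
    return reported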
def train(self, recover: bool = False) -> Dict[str, Any]: # 1 train sentiment classifier & private classifier & domain embeddings => init G 50 epoch # 2 fix share encoder(+domain embeddings?), train share classifier(cls&real/fake) & others => train D # 3 fix share classifier, train share encoder, reverse share classifier input gradient min loss => train G training_start_time = time.time() if recover: try: n_epoch, should_stop = self._restore_checkpoint() logger.info("Loaded model from checkpoint. Starting at epoch {}", n_epoch) except RuntimeError: raise ConfigurationError( "Could not recover training from the checkpoint. Did you mean to output to " "a different serialization directory or delete the existing serialization " "directory?" ) else: n_epoch, should_stop = 0, False ### Store all the necessary informations and attributes about the tasks ### task_infos = {task._name: {} for task in self._task_list} for task_idx, task in enumerate(self._task_list): task_info = task_infos[task._name] # Store statistiscs on training and validation batches data_iterator = task._data_iterator n_tr_batches = data_iterator.get_num_batches(task._train_data) n_val_batches = data_iterator.get_num_batches(task._validation_data) task_info["n_tr_batches"] = n_tr_batches task_info["n_val_batches"] = n_val_batches # Create counter for number of batches trained during the whole # training for this specific tasks task_info["total_n_batches_trained"] = 0 task_info["last_log"] = time.time() # Time of last logging self._task_infos = task_infos ### Bookkeeping the validation metrics ### metric_infos = { task._name: { "val_metric": task._val_metric, "hist": [], "is_out_of_patience": False, "min_lr_hit": False, "best": (-1, {}), } for task in self._task_list } self._metric_infos = metric_infos ### Write log ### total_n_tr_batches = 0 # The total number of training batches across all the datasets. for task_name, info in self._task_infos.items(): total_n_tr_batches += info["n_tr_batches"] logger.info("Task {}:", task_name) logger.info("\t{} training batches", info["n_tr_batches"]) logger.info("\t{} validation batches", info["n_val_batches"]) ### Create the training generators/iterators tqdm ### self._tr_generators = {} for task in self._task_list: data_iterator = task._data_iterator tr_generator = data_iterator(task._train_data, num_epochs=None) self._tr_generators[task._name] = tr_generator ### Create sampling probability distribution ### if self._sampling_method == "uniform": sampling_prob = [float(1 / self._n_tasks)] * self._n_tasks elif self._sampling_method == "proportional": sampling_prob = [float(info["n_tr_batches"] / total_n_tr_batches) for info in self._task_infos.values()] ### Enable gradient clipping ### # Only if self._grad_clipping is specified self._enable_gradient_clipping() ### Setup is ready. Training of the model can begin ### logger.info("Set up ready. 
Beginning training/validation.") avg_accuracies = [] best_accuracy = 0.0 ### Begin Training of the model ### while not should_stop: ### Log Infos: current epoch count and CPU/GPU usage ### logger.info("") logger.info("Epoch {}/{} - Begin", n_epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") # if n_epoch <= 10: # # init generator # all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob) # # train discriminator 3 epochs # # elif 10 < n_epoch < 20 or n_epoch % 2 == 0: # # all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob, train_D=True) # else: # train adversarial generator every 3 epoch all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob, reverse=True) all_val_metrics, avg_accuracy = self._validation(n_epoch) is_best = False if best_accuracy < avg_accuracy: best_accuracy = avg_accuracy logger.info("Best accuracy found --- {}", best_accuracy / self._n_tasks) is_best = True ### Print all training and validation metrics for this epoch ### logger.info("***** Epoch {}/{} Statistics *****", n_epoch, self._num_epochs - 1) for task in self._task_list: logger.info("Statistic: {}", task._name) logger.info( "\tTraining - {}: {:3d}", "Nb batches trained", self._task_infos[task._name]["n_batches_trained_this_epoch"], ) for metric_name, value in all_tr_metrics[task._name].items(): logger.info("\tTraining - {}: {:.3f}", metric_name, value) for metric_name, value in all_val_metrics[task._name].items(): logger.info("\tValidation - {}: {:.3f}", metric_name, value) logger.info("***** Average accuracy is {:.6f} *****", avg_accuracy / self._n_tasks) avg_accuracies.append(avg_accuracy / self._n_tasks) logger.info("**********") ### Check to see if should stop ### stop_tr, stop_val = True, True for task in self._task_list: # task_info = self._task_infos[tasks._name] if self._optimizers[task._name]['exclude_share_encoder'].param_groups[0]["lr"] < self._min_lr and \ self._optimizers[task._name]['exclude_share_discriminator'].param_groups[0][ "lr"] < self._min_lr: logger.info("Minimum lr hit on {}.", task._name) logger.info("Task {} vote to stop training.", task._name) metric_infos[task._name]["min_lr_hit"] = True stop_tr = stop_tr and self._metric_infos[task._name]["min_lr_hit"] stop_val = stop_val and self._metric_infos[task._name]["is_out_of_patience"] if stop_tr: should_stop = True logger.info("All tasks hit minimum lr. Stopping training.") if stop_val: should_stop = True logger.info("All metrics ran out of patience. Stopping training.") if n_epoch >= self._num_epochs - 1: should_stop = True logger.info("Maximum number of epoch hit. Stopping training.") self._save_checkpoint(n_epoch, should_stop, is_best) ### Update n_epoch ### # One epoch = doing N (forward + backward) pass where N is the total number of training batches. 
n_epoch += 1 self._epoch_trained = n_epoch logger.info("Max accuracy is {:.6f}", max(avg_accuracies)) ### Summarize training at the end ### logger.info("***** Training is finished *****") logger.info("Stopped training after {} epochs", n_epoch) return_metrics = {} for task_name, task_info in self._task_infos.items(): nb_epoch_trained = int(task_info["total_n_batches_trained"] / task_info["n_tr_batches"]) logger.info( "Trained {} for {} batches ~= {} epochs", task_name, task_info["total_n_batches_trained"], nb_epoch_trained, ) return_metrics[task_name] = { "best_epoch": self._metric_infos[task_name]["best"][0], "nb_epoch_trained": nb_epoch_trained, "best_epoch_val_metrics": self._metric_infos[task_name]["best"][1], } training_elapsed_time = time.time() - training_start_time return_metrics["training_duration"] = time.strftime("%d:%H:%M:%S", time.gmtime(training_elapsed_time)) return_metrics["nb_epoch_trained"] = n_epoch return return_metrics
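# The multi-task trainer above chooses which task to train on at every step, either uniformly
# or in proportion to each task's number of training batches, and samples with a single
# multinomial draw. A short sketch of both pieces as plain helper functions (not the trainer's
# own API):
import numpy as np


def make_sampling_probs(n_tr_batches: dict, method: str = "proportional"):
    names = list(n_tr_batches)
    if method == "uniform":
        probs = [1.0 / len(names)] * len(names)
    elif method == "proportional":
        total = sum(n_tr_batches.values())
        probs = [n_tr_batches[name] / total for name in names]
    else:
        raise ValueError(f"Unknown sampling method: {method}")
    return names, probs


def sample_task(names, probs):
    # argmax over a one-draw multinomial picks exactly one task per training step.
    return names[int(np.argmax(np.random.multinomial(1, probs)))]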
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 out_of_memory_count = 0 # Set the model to "train" mode. self.model.train() # Get tqdm for the training batches self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 if self._histogram_interval is not None: histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") num_training_batches = len(self.train_loader) #TODO: other options for process bar #TODO: subset train_generator_tqdm = Tqdm.tqdm(self.train_loader, total=num_training_batches) for batch in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self._log_histograms_this_batch = self._histogram_interval is not None and ( batch_num_total % self._histogram_interval == 0) self.optimizer.zero_grad() try: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") if self.fp16: self.optimizer.backward(loss) else: loss.backward() except RuntimeError as e: if 'out of memory' in str(e): torch.cuda.empty_cache() out_of_memory_count += 1 if out_of_memory_count > int(num_training_batches * 0.01): raise e else: raise e train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. #if self._learning_rate_scheduler: # self._learning_rate_scheduler.step_batch(batch_num_total) if self._log_histograms_this_batch: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7), batch_num_total) else: self.optimizer.step() if self.ema is not None: for name, param in self.model.named_parameters(): if param.requires_grad: param.data = self.ema(name, param.data) # Update the description with the latest metrics metrics = self._get_metrics(train_loss, batches_this_epoch) description = self._description_from_metrics(metrics) #TODO: other options for process bar train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if batch_num_total % self._summary_interval == 0: if self._should_log_parameter_statistics: self._parameter_and_gradient_statistics_to_tensorboard( batch_num_total, batch_grad_norm) if self._should_log_learning_rate: self._learning_rates_to_tensorboard(batch_num_total) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total) self._metrics_to_tensorboard( batch_num_total, {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self.predictor is not None: with torch.no_grad(): val_metrics = self.predictor.evaluate(self.model) self._metrics_to_tensorboard( batch_num_total, { "interval_metrics/" + k: v for k, v in val_metrics.items() }) this_interval_val_metric = val_metrics[ self._validation_metric] is_best_so_far = self._is_best_so_far( this_interval_val_metric, self._validation_metric_per_interval) self._validation_metric_per_interval.append( this_interval_val_metric) if is_best_so_far: self._save_checkpoint( '{0}.{1}'.format(epoch, batch_num_total), self._validation_metric_per_interval, is_best=True) if self._log_histograms_this_batch: self._histograms_to_tensorboard(batch_num_total, histogram_parameters) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, time_to_str(int(last_save_time))), [], is_best=False) return self._get_metrics(train_loss, batches_this_epoch, reset=True)
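# The epoch loop above tolerates occasional CUDA out-of-memory failures: it empties the cache
# and keeps going, and only re-raises once more than ~1% of the epoch's batches have failed.
# A sketch of that guard as a standalone helper; note the caller should skip the batch
# (rather than reuse a stale loss) whenever None is returned. `compute_loss` and the `state`
# dict are hypothetical.
import torch


def loss_or_skip_on_oom(compute_loss, batch, state: dict, max_failures: int):
    """Return the loss tensor, or None if this batch was skipped after a tolerated OOM."""
    try:
        return compute_loss(batch)
    except RuntimeError as err:
        if "out of memory" not in str(err):
            raise
        state["oom_count"] = state.get("oom_count", 0) + 1
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        if state["oom_count"] > max_failures:
            raise
        return None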
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
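# The distributed loop above lets every worker agree on when to stop: before each batch group,
# all workers all-reduce a "done" counter so that a worker which runs out of data can break the
# others out before they hang on the collectives inside Model.forward. A standalone sketch of
# both sides of that handshake, assuming torch.distributed is already initialised by the caller:
import torch
import torch.distributed as dist


def others_finished(device: torch.device) -> bool:
    # Contribute 0 ("I still have data"); a positive sum means some worker has finished.
    done = torch.tensor(0, device=device)
    dist.all_reduce(done, op=dist.ReduceOp.SUM)
    return done.item() > 0


def signal_finished(device: torch.device) -> None:
    # A worker that exhausted its data contributes 1 so every other worker breaks out too.
    done = torch.tensor(1, device=device)
    dist.all_reduce(done, op=dist.ReduceOp.SUM)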
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() images = [] text = [] segment_ids = [] labels = [] for i in range(len(batch_group[0]['images'])): positive_index = random.randint(0, self.num_negative_samples) labels.append(positive_index) if self.retrieve_text: instance_text = [] instance_segment_ids = [] for j in range(self.num_negative_samples + 1): if j == positive_index: instance_text.append(batch_group[0]['token_ids'] ['tokens'][i, :].tolist()) instance_segment_ids.append( batch_group[0]['segment_ids'][i].tolist()) else: negative_sample_index = random.choice( self.train_indices) text_field = TextField( self.train_text_db[negative_sample_index], self.train_token_indexers) text_field.index(self.model.vocab) padding_lengths = text_field.get_padding_lengths() instance_text.append( text_field.as_tensor( padding_lengths=padding_lengths) ['tokens'].tolist()) instance_segment_ids.append( self.train_segment_ids_db[ negative_sample_index].tolist()) text += instance_text segment_ids += instance_segment_ids else: instance_images = [ None for _ in range(self.num_negative_samples + 1) ] for j in range(self.num_negative_samples + 1): if j == positive_index: instance_images[j] = np.expand_dims( batch_group[0]['images'][i].numpy(), 0) else: instance_images[j] = np.expand_dims( random.choice(self.train_image_db), 0) images += instance_images matching_label_field_name = "labels" if self.retrieve_text: max_text_len = max([len(sequence) for sequence in text]) text = [ sequence + [0 for _ in range(max_text_len - len(sequence))] for sequence in text ] batch_group[0]['token_ids'] = { 'tokens': torch.LongTensor(text) } segment_ids = [ sequence + [0 for _ in range(max_text_len - len(sequence))] for sequence in segment_ids ] batch_group[0]['segment_ids'] = torch.from_numpy( np.array(segment_ids, dtype=np.int64)) else: batch_group[0]['images'] = torch.from_numpy(np.vstack(images)) batch_group[0][matching_label_field_name] = torch.from_numpy( np.array(labels, dtype=np.int64)) loss = self.batch_loss(batch_group, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. 
if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
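# The loop above turns each instance into an in-batch retrieval problem: the true caption is
# placed at a random "positive" slot, the other slots are filled with randomly drawn negatives,
# and every token sequence is right-padded to a common length before being stacked into one
# tensor. A simplified sketch of that batch construction (token ids are plain lists of ints,
# and `candidate_pool` is a hypothetical stand-in for the training text database):
import random
from typing import List, Tuple
import torch


def build_retrieval_candidates(positive: List[int],
                               candidate_pool: List[List[int]],
                               num_negatives: int) -> Tuple[torch.Tensor, int]:
    positive_index = random.randint(0, num_negatives)
    sequences = []
    for slot in range(num_negatives + 1):
        if slot == positive_index:
            sequences.append(list(positive))
        else:
            sequences.append(list(random.choice(candidate_pool)))
    # Right-pad with zeros so the candidates stack into a single LongTensor.
    max_len = max(len(seq) for seq in sequences)
    padded = [seq + [0] * (max_len - len(seq)) for seq in sequences]
    return torch.LongTensor(padded), positive_index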
def train(self, recover: bool = False): """ Train the different task_list, save the different checkpoints and metrics, and save the model at the end of training while logging the training details. The metrics through the training are stored in dictionaries with the following structure: all_metrics - Dict[str, str] task_name: val_metric metric_infos (Dict[]) task_name (Dict[str, diverse] val_metric (str): name (str) hist (str): history_of_the_val_metric (List[float]) stopped (str): training_is_stopped (bool) best (str): best_epoch_for_val_metric (Tuple(int, Dict)) all_tr_metrics (Dict[str, Dict[str, float]]) task_name (Dict[str, float]) metric_name (str): value (float) loss: value (float) all_val_metrics (Dict[str, Dict[str, float]]) task_name (Dict[str, float]) metric_name (str): value (float) loss (str): value (float) Parameters ---------- task_list: List[Task], required A list containing the tasks to train. params: Params, required Training parameters recover: bool, required Whether or not training should be recovered from a previous training. Returns ------- return_dict: Dict A dictionary summarizing the training and the metrics for the best epochs for each task. """ training_start_time = time.time() if recover: try: n_epoch, should_stop = self._restore_checkpoint() logger.info( "Loaded model from checkpoint. Starting at epoch %d", n_epoch) except RuntimeError: raise ConfigurationError( "Could not recover training from the checkpoint. Did you mean to output to " "a different serialization directory or delete the existing serialization " "directory?") else: n_epoch, should_stop = 0, False ### Store all the necessary informations and attributes about the tasks ### task_infos = {task._name: {} for task in self._task_list} for task_idx, task in enumerate(self._task_list): task_info = task_infos[task._name] # Store statistiscs on training and validation batches data_iterator = task._data_iterator n_tr_batches = data_iterator.get_num_batches(task._train_data) n_val_batches = data_iterator.get_num_batches( task._validation_data) task_info["n_tr_batches"] = n_tr_batches task_info["n_val_batches"] = n_val_batches # Create counter for number of batches trained during the whole # training for this specific task task_info["total_n_batches_trained"] = 0 task_info["last_log"] = time.time() # Time of last logging self._task_infos = task_infos ### Bookkeeping the validation metrics ### metric_infos = { task._name: { "val_metric": task._val_metric, "hist": [], "is_out_of_patience": False, "min_lr_hit": False, "best": (-1, {}), } for task in self._task_list } self._metric_infos = metric_infos ### Write log ### total_n_tr_batches = 0 # The total number of training batches across all the datasets. 
for task_name, info in self._task_infos.items(): total_n_tr_batches += info["n_tr_batches"] logger.info("Task %s:", task_name) logger.info("\t%d training batches", info["n_tr_batches"]) logger.info("\t%d validation batches", info["n_val_batches"]) ### Create the training generators/iterators tqdm ### self._tr_generators = {} for task in self._task_list: data_iterator = task._data_iterator tr_generator = data_iterator(task._train_data, num_epochs=None) self._tr_generators[task._name] = tr_generator ### Create sampling probability distribution ### if self._sampling_method == "uniform": sampling_prob = [float(1 / self._n_tasks)] * self._n_tasks elif self._sampling_method == "proportional": sampling_prob = [ float(info["n_tr_batches"] / total_n_tr_batches) for info in self._task_infos.values() ] ### Enable gradient clipping ### # Only if self._grad_clipping is specified self._enable_gradient_clipping() ### Setup is ready. Training of the model can begin ### logger.info("Set up ready. Beginning training/validation.") ### Begin Training of the model ### while not should_stop: # Train one epoch (training pass + validation pass) self._model.train() # Set the model to "train" mode. ### Log Infos: current epoch count and CPU/GPU usage ### logger.info("") logger.info("Epoch %d/%d - Begin", n_epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") logger.info("Training - Begin") ### Reset training and trained batches counter before new training epoch ### for _, task_info in self._task_infos.items(): task_info["tr_loss_cum"] = 0.0 task_info["n_batches_trained_this_epoch"] = 0 all_tr_metrics = { } # BUG TO COMPLETE COMMENT TO MAKE IT MORE CLEAR ### Start training epoch ### epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches), total=total_n_tr_batches) for _ in epoch_tqdm: task_idx = np.argmax(np.random.multinomial(1, sampling_prob)) task = self._task_list[task_idx] task_info = self._task_infos[task._name] ### One forward + backward pass ### # Call next batch to train batch = next(self._tr_generators[task._name]) task_info["n_batches_trained_this_epoch"] += 1 # Load optimizer optimizer = self._optimizers[task._name] optimizer.zero_grad() # Get the loss for this batch output_dict = self._forward(tensor_batch=batch, task=task, for_training=True) assert "loss" in output_dict, "Model must return a dict containing a 'loss' key" loss = output_dict["loss"] loss.backward() task_info["tr_loss_cum"] += loss.item() # Gradient rescaling if self._grad_norm is specified self._rescale_gradients() # Take an optimization step optimizer.step() ### Get metrics for all progress so far, update tqdm, display description ### task_metrics = self._get_metrics(task=task) task_metrics["loss"] = float( task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)) description = self._description_from_metrics(task_metrics) epoch_tqdm.set_description(task._name + ", " + description) ### Tensorboard logging: Training detailled metrics, parameters and gradients ### if self._global_step % self._summary_interval == 0: # Metrics for metric_name, value in task_metrics.items(): self._tensorboard.add_train_scalar( name="training_details/" + task._name + "/" + metric_name, value=value, global_step=self._global_step, ) # Parameters and Gradients for param_name, param in self._model.named_parameters(): if self._log_parameter_statistics: self._tensorboard.add_train_scalar( name="parameter_mean/" + 
param_name, value=param.data.mean(), global_step=self._global_step, ) self._tensorboard.add_train_scalar( name="parameter_std/" + param_name, value=param.data.std(), global_step=self._global_step, ) if param.grad is None: continue if self._log_gradient_statistics: self._tensorboard.add_train_scalar( name="grad_mean/" + param_name, value=param.grad.data.mean(), global_step=self._global_step, ) self._tensorboard.add_train_scalar( name="grad_std/" + param_name, value=param.grad.data.std(), global_step=self._global_step, ) self._global_step += 1 ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ### for task in self._task_list: task_info = self._task_infos[task._name] task_info["total_n_batches_trained"] += task_info[ "n_batches_trained_this_epoch"] task_info["last_log"] = time.time() task_metrics = self._get_metrics(task=task, reset=True) if task._name not in all_tr_metrics: all_tr_metrics[task._name] = {} for name, value in task_metrics.items(): all_tr_metrics[task._name][name] = value all_tr_metrics[task._name]["loss"] = float( task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)) # Tensorboard - Training metrics for this epoch self._tensorboard.add_train_scalar( name="training_proportions/" + task._name, value=task_info["n_batches_trained_this_epoch"], global_step=n_epoch, ) for metric_name, value in all_tr_metrics[task._name].items(): self._tensorboard.add_train_scalar( name="task_" + task._name + "/" + metric_name, value=value, global_step=n_epoch) logger.info("Train - End") ### Begin validation of the model ### logger.info("Validation - Begin") all_val_metrics = {} self._model.eval() # Set the model into evaluation mode for task_idx, task in enumerate(self._task_list): logger.info("Validation - Task %d/%d: %s", task_idx + 1, self._n_tasks, task._name) val_loss = 0.0 n_batches_val_this_epoch_this_task = 0 n_val_batches = self._task_infos[task._name]["n_val_batches"] scheduler = self._schedulers[task._name] # Create tqdm generator for current task's validation data_iterator = task._data_iterator val_generator = data_iterator(task._validation_data, num_epochs=1, shuffle=False) val_generator_tqdm = tqdm.tqdm(val_generator, total=n_val_batches) # Iterate over each validation batch for this task for batch in val_generator_tqdm: n_batches_val_this_epoch_this_task += 1 # Get the loss val_output_dict = self._forward(batch, task=task, for_training=False) loss = val_output_dict["loss"] val_loss += loss.item() # Get metrics for all progress so far, update tqdm, display description task_metrics = self._get_metrics(task=task) task_metrics["loss"] = float( val_loss / n_batches_val_this_epoch_this_task) description = self._description_from_metrics(task_metrics) val_generator_tqdm.set_description(description) # Get task validation metrics and store them in all_val_metrics task_metrics = self._get_metrics(task=task, reset=True) if task._name not in all_val_metrics: all_val_metrics[task._name] = {} for name, value in task_metrics.items(): all_val_metrics[task._name][name] = value all_val_metrics[task._name]["loss"] = float( val_loss / n_batches_val_this_epoch_this_task) # Tensorboard - Validation metrics for this epoch for metric_name, value in all_val_metrics[task._name].items(): self._tensorboard.add_validation_scalar( name="task_" + task._name + "/" + metric_name, value=value, global_step=n_epoch) ### Perform a patience check and update the history of validation metric for this task ### this_epoch_val_metric = 
all_val_metrics[task._name][ task._val_metric] metric_history = self._metric_infos[task._name]["hist"] metric_history.append(this_epoch_val_metric) is_best_so_far, out_of_patience = self._check_history( metric_history=metric_history, cur_score=this_epoch_val_metric, should_decrease=task._val_metric_decreases, ) if is_best_so_far: logger.info("Best model found for %s.", task._name) self._metric_infos[task._name]["best"] = (n_epoch, all_val_metrics) if out_of_patience and not self._metric_infos[ task._name]["is_out_of_patience"]: self._metric_infos[task._name]["is_out_of_patience"] = True logger.info( "Task %s is out of patience and vote to stop the training.", task._name) # The LRScheduler API is agnostic to whether your schedule requires a validation metric - # if it doesn't, the validation metric passed here is ignored. scheduler.step(this_epoch_val_metric, n_epoch) logger.info("Validation - End") ### Print all training and validation metrics for this epoch ### logger.info("***** Epoch %d/%d Statistics *****", n_epoch, self._num_epochs - 1) for task in self._task_list: logger.info("Statistic: %s", task._name) logger.info( "\tTraining - %s: %3d", "Nb batches trained", self._task_infos[task._name] ["n_batches_trained_this_epoch"], ) for metric_name, value in all_tr_metrics[task._name].items(): logger.info("\tTraining - %s: %3f", metric_name, value) for metric_name, value in all_val_metrics[task._name].items(): logger.info("\tValidation - %s: %3f", metric_name, value) logger.info("**********") ### Check to see if should stop ### stop_tr, stop_val = True, True for task in self._task_list: # task_info = self._task_infos[task._name] if self._optimizers[ task._name].param_groups[0]["lr"] < self._min_lr: logger.info("Minimum lr hit on %s.", task._name) logger.info("Task %s vote to stop training.", task._name) metric_infos[task._name]["min_lr_hit"] = True stop_tr = stop_tr and self._metric_infos[ task._name]["min_lr_hit"] stop_val = stop_val and self._metric_infos[ task._name]["is_out_of_patience"] if stop_tr: should_stop = True logging.info("All tasks hit minimum lr. Stopping training.") if stop_val: should_stop = True logging.info( "All metrics ran out of patience. Stopping training.") if n_epoch >= self._num_epochs - 1: should_stop = True logging.info("Maximum number of epoch hit. Stopping training.") self._save_checkpoint(n_epoch, should_stop) ### Update n_epoch ### # One epoch = doing N (forward + backward) pass where N is the total number of training batches. n_epoch += 1 ### Summarize training at the end ### logging.info("***** Training is finished *****") logging.info("Stopped training after %d epochs", n_epoch) return_metrics = {} for task_name, task_info in self._task_infos.items(): nb_epoch_trained = int(task_info["total_n_batches_trained"] / task_info["n_tr_batches"]) logging.info( "Trained %s for %d batches ~= %d epochs", task_name, task_info["total_n_batches_trained"], nb_epoch_trained, ) return_metrics[task_name] = { "best_epoch": self._metric_infos[task_name]["best"][0], "nb_epoch_trained": nb_epoch_trained, "best_epoch_val_metrics": self._metric_infos[task_name]["best"][1], } training_elapsed_time = time.time() - training_start_time return_metrics["training_duration"] = time.strftime( "%d:%H:%M:%S", time.gmtime(training_elapsed_time)) return_metrics["nb_epoch_trained"] = n_epoch return return_metrics
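# The validation pass above appends each epoch's metric to a per-task history and votes to
# stop once a task stops improving within its patience window. A minimal version of that
# check, assuming a fixed integer `patience`; the real _check_history may differ in detail.
from typing import List, Tuple


def check_history(history: List[float],
                  patience: int,
                  should_decrease: bool = False) -> Tuple[bool, bool]:
    best = min(history) if should_decrease else max(history)
    is_best_so_far = history[-1] == best
    # Out of patience when the best value is older than `patience` epochs.
    epochs_since_best = len(history) - 1 - history.index(best)
    out_of_patience = epochs_since_best >= patience
    return is_best_so_far, out_of_patience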
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._model.train() # Get tqdm for the training batches train_generator = self._iterator(self._train_data, num_epochs=1, cuda_device=self._iterator_device) num_training_batches = self._iterator.get_num_batches(self._train_data) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 if self._histogram_interval is not None: histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) for batch in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self._log_histograms_this_batch = self._histogram_interval is not None and ( batch_num_total % self._histogram_interval == 0) self._optimizer.zero_grad() loss = self._batch_loss(batch, for_training=True) loss.backward() train_loss += loss.item() batch_grad_norm = self._rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._log_histograms_this_batch: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = {name: param.detach().cpu().clone() for name, param in self._model.named_parameters()} self._optimizer.step() for name, param in self._model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar("gradient_update/" + name, update_norm / (param_norm + 1e-7), batch_num_total) else: self._optimizer.step() # Update the description with the latest metrics metrics = self._get_metrics(train_loss, batches_this_epoch) description = self._description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if batch_num_total % self._summary_interval == 0: self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total) self._metrics_to_tensorboard(batch_num_total, {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._log_histograms_this_batch: self._histograms_to_tensorboard(batch_num_total, histogram_parameters) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval ): last_save_time = time.time() self._save_checkpoint( '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False ) return self._get_metrics(train_loss, batches_this_epoch, reset=True)
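# Several trainers in this file log the relative size of each parameter update by snapshotting
# the parameters on the CPU before optimizer.step() and differencing afterwards. That
# bookkeeping as a standalone helper returning name -> ||update|| / ||parameter||:
import torch
from torch import nn


def step_and_measure_updates(model: nn.Module,
                             optimizer: torch.optim.Optimizer) -> dict:
    snapshot = {name: param.detach().cpu().clone()
                for name, param in model.named_parameters()}
    optimizer.step()
    ratios = {}
    for name, param in model.named_parameters():
        update = snapshot[name] - param.detach().cpu()
        update_norm = torch.norm(update.reshape(-1))
        param_norm = torch.norm(param.detach().reshape(-1)).cpu()
        # The small epsilon keeps the ratio finite for all-zero parameters.
        ratios[name] = (update_norm / (param_norm + 1e-7)).item()
    return ratios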
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
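# The trainers above optionally keep a moving average of the parameters and apply it after
# every optimizer step. A small exponential-moving-average sketch of that idea; the real
# MovingAverage abstraction has more options, and the decay value here is only an example.
import torch
from torch import nn


class SimpleEMA:
    def __init__(self, model: nn.Module, decay: float = 0.999):
        self.decay = decay
        self.shadow = {name: param.detach().clone()
                       for name, param in model.named_parameters()}

    def apply(self, model: nn.Module) -> None:
        # shadow <- decay * shadow + (1 - decay) * current, in place and outside autograd.
        with torch.no_grad():
            for name, param in model.named_parameters():
                self.shadow[name].mul_(self.decay).add_(param.detach(),
                                                        alpha=1 - self.decay)

    def copy_to(self, model: nn.Module) -> None:
        # Swap the averaged weights in, e.g. before validation or checkpointing.
        with torch.no_grad():
            for name, param in model.named_parameters():
                param.copy_(self.shadow[name])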
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    cpu_memory_usage = []
    for worker, memory in common_util.peak_memory_mb().items():
        cpu_memory_usage.append((worker, memory))
        logger.info(f"Worker {worker} memory usage MB: {memory}")
    gpu_memory_usage = []
    for gpu, memory in common_util.gpu_memory_mb().items():
        gpu_memory_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    regularization_penalty = self.model.get_regularization_penalty()

    train_loss = 0.0
    batch_loss = 0.0

    if regularization_penalty is not None:
        train_reg_loss = 0.0
        batch_reg_loss = 0.0
    else:
        train_reg_loss = None
        batch_reg_loss = None

    # Set the model to "train" mode.
    self.model_engine.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps
    )

    logger.info("Training")

    num_training_batches: Union[int, float]
    try:
        len_data_loader = len(self.data_loader)
        num_training_batches = math.ceil(
            len_data_loader / self._num_gradient_accumulation_steps
        )
    except TypeError:
        num_training_batches = float("inf")

    # Having multiple tqdm bars in case of distributed training will be a mess.
    # Hence only the master's progress is shown.
    batch_group_generator_tqdm = batch_group_generator
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches)

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    done_early = False
    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        batch_group_outputs = []
        for batch in batch_group:
            with amp.autocast(self._use_amp):
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs.get("loss")
                reg_loss = batch_outputs.get("reg_loss")
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)

                batch_loss = loss.item()
                train_loss += batch_loss
                if reg_loss is not None:
                    reg_loss = reg_loss / len(batch_group)
                    batch_reg_loss = reg_loss.item()
                    train_reg_loss += batch_reg_loss

            self.model_engine.backward(loss)
            self.model_engine.step()

        param_updates = None
        if self._tensorboard.should_log_histograms_this_batch() and self._master:
            # Get the magnitude of parameter updates for logging. We need to do some
            # computation before and after the optimizer step, and it's expensive because of
            # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
            # we don't do this every batch, only when it's requested.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }

            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
        else:
            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss,
            batch_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        if self._master:
            # Updating tqdm only for the master as the other workers wouldn't have one
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)
            self._tensorboard.log_batch(
                self.model,
                self.optimizer,
                0.0,  # batch_grad_norm
                metrics,
                batch_group,
                param_updates,
            )
            self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

        for callback in self._batch_callbacks:
            callback(
                self,
                batch_group,
                batch_group_outputs,
                epoch,
                batches_this_epoch,
                is_training=True,
                is_master=self._master,
            )

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batch_loss=None,
        batch_reg_loss=None,
        num_batches=batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=self.cuda_device,
    )
    for (worker, memory) in cpu_memory_usage:
        metrics["worker_" + str(worker) + "_memory_MB"] = memory
    for (gpu_num, memory) in gpu_memory_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
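# A minimal sketch, assuming plain torch.cuda.amp rather than the trainer's model_engine/_scaler
# wiring, of how autocast and GradScaler are normally combined: run the forward pass and loss
# under autocast, call backward on the scaled loss, then let the scaler step and update in place
# of a bare optimizer.step(). Model, optimizer, and input names are placeholders.
import torch
from torch.cuda import amp

def amp_train_step(model, optimizer, inputs, targets,
                   scaler: amp.GradScaler, use_amp: bool = True) -> float:
    optimizer.zero_grad()
    with amp.autocast(enabled=use_amp):
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    if use_amp:
        scaler.scale(loss).backward()   # backward on the scaled loss to avoid fp16 underflow
        scaler.step(optimizer)          # unscales gradients and skips the step on inf/nan
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
    return loss.item()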
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        # Re-enter train mode each batch, since mid-epoch validation below switches to eval mode.
        self.model.train()
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        loss = self.batch_loss(batch_group, for_training=True)
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # Get the magnitude of parameter updates for logging.
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7)
                )
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum(training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
            time.time() - last_save_time > self._model_save_interval
        ):
            last_save_time = time.time()
            self._save_checkpoint(
                '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
            )

        if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
            if self._validation_data is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(
                        self.model, val_loss, num_batches, reset=True
                    )

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                if self._metric_tracker.is_best_so_far():
                    metrics['best_batch'] = self._batch_num_total
                    for key, value in val_metrics.items():
                        metrics["best_validation_" + key] = value
                    self._metric_tracker.best_epoch_metrics = val_metrics
                self._save_checkpoint(self._batch_num_total)

            if self.callbacks is not None:
                for callback in self.callbacks:
                    callback.on_batch_end(self._batch_num_total)

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
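# A minimal sketch of the update-magnitude logging performed above: snapshot parameters on the
# CPU, take the optimizer step, then log ||delta|| / (||param|| + eps) per parameter. Here the
# writer is torch.utils.tensorboard's SummaryWriter, standing in for the trainer's _tensorboard
# wrapper; model/optimizer/global_step are placeholders.
import torch
from torch.utils.tensorboard import SummaryWriter

def step_with_update_logging(model: torch.nn.Module,
                             optimizer: torch.optim.Optimizer,
                             writer: SummaryWriter,
                             global_step: int) -> None:
    # Copy current parameters to CPU so large models don't go OOM on the GPU.
    before = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
    optimizer.step()
    for name, p in model.named_parameters():
        update = p.detach().cpu() - before[name]
        update_norm = torch.norm(update.view(-1))
        param_norm = torch.norm(p.detach().cpu().view(-1))
        writer.add_scalar("gradient_update/" + name,
                          update_norm / (param_norm + 1e-7), global_step)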
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    train_generator = lazy_groups_of(
        raw_train_generator, num_gpus * self._num_gradient_accumulation_steps
    )
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data)
        / (num_gpus * self._num_gradient_accumulation_steps)
    )
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        # Log the model graph to Tensorboard once, using the first batch as example inputs.
        if not self._graph_added and self._require_graph:
            model_copy = deepcopy(self.model)
            model_copy.log_graph()
            wrapped_model = ModelWrapper(model_copy)
            graph_inputs = wrapped_model.process_inputs(batch_group[0])
            self._tensorboard.add_graph(wrapped_model, [graph_inputs])
            self._graph_added = True

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Split the batch group into per-GPU sub-batches and accumulate gradients over them.
        num_batch = len(batch_group) // num_gpus
        for i in range(num_batch):
            if (i + 1) * num_gpus > len(batch_group):
                batch_i = batch_group[i * num_gpus:]
            else:
                batch_i = batch_group[i * num_gpus:(i + 1) * num_gpus]
            loss = self.batch_loss(batch_i, for_training=True)
            if loss is None or torch.isnan(loss):
                # Skip this sub-batch instead of raising ValueError("nan loss encountered").
                logger.warning("nan loss encountered; skipping sub-batch")
                continue
            loss = loss / num_batch
            # Wrapping this in torch.autograd.set_detect_anomaly(True) helps debugging
            # nan losses but can slow training considerably.
            loss.backward()
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # Get the magnitude of parameter updates for logging.
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7)
                )
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum(training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
            time.time() - last_save_time > self._model_save_interval
        ):
            last_save_time = time.time()
            self._save_checkpoint(
                '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
            )

    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
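# A minimal stand-in for the lazy_groups_of helper used throughout these trainers (the real one
# lives in allennlp.common.util): lazily yield fixed-size groups from an iterator, with a possibly
# smaller final group. This is only a sketch of the grouping behavior assumed above, where each
# group feeds one gradient-accumulation / multi-GPU step.
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group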