def main():
    LEVEL = logging.INFO
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=LEVEL)

    start_time = time_to_str(time.time())
    experiment_dir = f'experiments/{start_time}'

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--param-path',
        type=str,
        default='settings.jsonnet',
        help='path to parameter file describing the model to be trained')
    parser.add_argument(
        '-s', '--serialization-dir',
        type=str,
        default=experiment_dir,
        help='directory in which to save the model and its logs')
    parser.add_argument(
        '-r', '--recover',
        action='store_true',
        default=False,
        help='recover training from the state in serialization_dir')
    parser.add_argument(
        '-f', '--force',
        action='store_true',
        required=False,
        help='overwrite the output directory if it exists')
    parser.add_argument(
        '-o', '--overrides',
        type=str,
        default="",
        help='a JSON structure used to override the experiment configuration')
    parser.add_argument(
        '--file-friendly-logging',
        action='store_true',
        default=False,
        help='outputs tqdm status on separate lines and slows tqdm refresh rate')
    args = parser.parse_args()

    import_submodules('modules')
    model, params = train(args)
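# The helper time_to_str used above to name the experiment directory is not defined
# in this file. A minimal sketch of what it is assumed to do (format a Unix timestamp
# into a filesystem-friendly string) follows; the exact format string is an assumption,
# not the original implementation.
import datetime


def time_to_str(timestamp: float) -> str:
    """Format a Unix timestamp as e.g. '2020-01-31-23-59-59' (assumed format)."""
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d-%H-%M-%S')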
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: self.model.train() batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() loss = self.batch_loss(batch_group, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = {name: param.detach().cpu().clone() for name, param in self.model.named_parameters()} self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar("gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size/batches_this_epoch logger.info(f"current batch size: {cur_batch} mean batch size: {average}") self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval ): last_save_time = time.time() self._save_checkpoint( '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time))) ) if self._early_stopping_by_batch and self._batch_num_total % 10 == 0: if self._validation_data is not None: with torch.no_grad(): # We have a validation set, so compute all the metrics on it. val_loss, num_batches = self._validation_loss() val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True) # Check validation metric for early stopping this_epoch_val_metric = val_metrics[self._validation_metric] self._metric_tracker.add_metric(this_epoch_val_metric) if self._metric_tracker.is_best_so_far(): metrics['best_batch'] = self._batch_num_total for key, value in val_metrics.items(): metrics["best_validation_" + key] = value self._metric_tracker.best_epoch_metrics = val_metrics self._save_checkpoint(self._batch_num_total) if self.callbacks is not None: for callback in self.callbacks: callback.on_batch_end(self._batch_num_total) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory return metrics
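# A standalone sketch of the "gradient_update" logging trick used in the trainer above:
# snapshot the parameters on CPU, apply the optimizer step, then log the ratio of the
# update norm to the parameter norm. The model and optimizer here are toy stand-ins,
# not the trainer's actual objects.
import torch


def log_update_magnitudes(model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    # Copy current parameters to CPU so large models don't go OOM on the GPU.
    param_snapshots = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
    optimizer.step()
    for name, p in model.named_parameters():
        update = param_snapshots[name] - p.detach().cpu()
        update_norm = torch.norm(update.view(-1))
        param_norm = torch.norm(p.detach().cpu().view(-1))
        # In the trainer this value is sent to TensorBoard; here we just print it.
        print(f"gradient_update/{name}: {(update_norm / (param_norm + 1e-7)).item():.3e}")


if __name__ == "__main__":
    toy = torch.nn.Linear(4, 2)
    opt = torch.optim.SGD(toy.parameters(), lr=0.1)
    toy(torch.randn(8, 4)).sum().backward()
    log_update_magnitudes(toy, opt)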
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
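# A minimal, self-contained sketch of the gradient-accumulation pattern in the variant
# above: each micro-batch loss is scaled by the group size before backward, and the
# optimizer steps once per group. The model, data, and loss here are toy stand-ins.
import torch


def accumulate_and_step(model, optimizer, batch_group, loss_fn):
    optimizer.zero_grad()
    total = 0.0
    for inputs, targets in batch_group:
        loss = loss_fn(model(inputs), targets) / len(batch_group)
        loss.backward()          # gradients accumulate across micro-batches
        total += loss.item()
    optimizer.step()             # one update for the whole group
    return total


if __name__ == "__main__":
    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    group = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(4)]
    print(accumulate_and_step(model, opt, group, torch.nn.functional.mse_loss))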
def semi_train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.trainer.model.train() num_gpus = len(self.trainer._cuda_devices) self.trainer._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self.trainer._batch_num_total is None: self.trainer._batch_num_total = 0 histogram_parameters = set( self.trainer.model. get_parameters_for_histogram_tensorboard_logging()) #Pdb().set_trace() mixed_generator, num_training_batches = get_mixer( self.trainer.iterator, self.trainer.train_data, self.trainer.iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled) #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator, self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer) #generator for lambda update mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer.iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm', 1.0) #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator, self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm') logger.info("Training") train_generator_tqdm = Tqdm.tqdm(mixed_generator, total=num_training_batches) #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator), # total=num_training_batches) cumulative_batch_size = 0 unlabelled_loss = 0 unlabelled_batches_this_epoch = 0 batches_since_last_step = 0 agg_loss = 0.0 flag = False batch_grad_norm = None for batch_group, group_id in train_generator_tqdm: #print(batch_group[0]['sentence']['tokens'].shape) if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id: continue output_dict = self.batch_loss( batch_group, for_training=True, eval_metric=(group_id == self.labelled_id)) penalties = defaultdict(float) if self.constraints_model is not None: penalties = self.constraints_model( output_dict['task1_tag_logits'], output_dict['task2_tag_logits'], output_dict['mask']) loss = 0.0 if 'loss' in output_dict: loss = output_dict['loss'] train_loss += loss.item() loss += output_dict.get('regularization_penalty', 0.0) loss += self.constraints_wt * penalties['loss'] unlabelled_loss += penalties['loss'].item() if torch.is_tensor( penalties['loss']) else penalties['loss'] agg_loss += loss batches_since_last_step += 1 if batches_since_last_step == self.backprop_after_xbatches: #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss)) batch_grad_norm = self.step(agg_loss) batches_since_last_step = 0 agg_loss = 0.0 flag = False else: flag = True #print("skipp : {}. 
loss: {} agg_loss: {}".format(group_id, loss, agg_loss)) if (group_id != self.labelled_id): unlabelled_batches_this_epoch += 1 #self.trainer.optimizer.zero_grad() #loss.backward() #batch_grad_norm = self.trainer.rescale_gradients() #self.trainer.optimizer.step() else: self.total_supervised_iters += 1.0 batches_this_epoch += 1 self.trainer._batch_num_total += 1 batch_num_total = self.trainer._batch_num_total #self.trainer.optimizer.zero_grad() #loss.backward() #batch_grad_norm = self.trainer.rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self.trainer._learning_rate_scheduler: self.trainer._learning_rate_scheduler.step_batch( batch_num_total) if self.trainer._tensorboard.should_log_histograms_this_batch( ): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = { name: param.detach().cpu().clone() for name, param in self.trainer.model.named_parameters() } #self.trainer.optimizer.step() for name, param in self.trainer.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view( -1, )) param_norm = torch.norm(param.view(-1, )).cpu() self.trainer._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: pass #self.trainer.optimizer.step() # Update moving averages if self.trainer._moving_average is not None: self.trainer._moving_average.apply(batch_num_total) # metrics = training_util.get_metrics(self.trainer.model, train_loss, batches_this_epoch) metrics["uloss"] = float( unlabelled_loss / (batches_this_epoch + unlabelled_batches_this_epoch)) # Update the description with the latest metrics description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self.trainer._tensorboard.should_log_this_batch( ) and batch_grad_norm is not None: self.trainer._tensorboard.log_parameter_and_gradient_statistics( self.trainer.model, batch_grad_norm) self.trainer._tensorboard.log_learning_rates( self.trainer.model, self.trainer.optimizer) self.trainer._tensorboard.add_train_scalar( "loss/loss_train", metrics["loss"]) self.trainer._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self.trainer._tensorboard.should_log_histograms_this_batch( ): self.trainer._tensorboard.log_histograms( self.trainer.model, histogram_parameters) if self.trainer._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self.trainer._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self.trainer._tensorboard.add_train_scalar( "current_batch_size", cur_batch) self.trainer._tensorboard.add_train_scalar( "mean_batch_size", average) # Save model if needed. 
if self.trainer._model_save_interval is not None and ( time.time() - last_save_time > self.trainer._model_save_interval): last_save_time = time.time() self.trainer._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) #lambda update #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters) and (batches_this_epoch % self.dd_update_freq == 0): if (self.constraints_model is not None) and (self.dd_optimizer is not None) and ( self.total_supervised_iters >= self.dd_warmup_iters ) and (self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq): for batch_group, group_id in mixed_generator_for_lambda: self.lambda_update(batch_group) self.last_lambda_update = self.total_supervised_iters break self.count_lambda_updates += 1 if (self.dd_increase_freq_after is not None) and (self.count_lambda_updates % self.dd_increase_freq_after == 0): self.dd_update_freq += self.dd_increase_freq_by if flag: batch_grad_norm = self.step(agg_loss) batches_since_last_step = 0 agg_loss = 0.0 flag = False #lambda update #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters): if (self.constraints_model is not None) and (self.dd_optimizer is not None) and ( self.total_supervised_iters >= self.dd_warmup_iters) and ( self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq): for batch_group, group_id in mixed_generator_for_lambda: self.lambda_update(batch_group) self.last_lambda_update = self.total_supervised_iters break self.count_lambda_updates += 1 if (self.dd_increase_freq_after is not None) and (self.count_lambda_updates % self.dd_increase_freq_after == 0): self.dd_update_freq += self.dd_increase_freq_by metrics = training_util.get_metrics(self.trainer.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage metrics['lb'] = batches_this_epoch metrics['ub'] = unlabelled_batches_this_epoch metrics["uloss"] = float( unlabelled_loss / (batches_this_epoch + unlabelled_batches_this_epoch)) if self.constraints_model is not None: lambda_stats_dict = self.constraints_model.lambda_stats() metrics.update(lambda_stats_dict) for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
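# A sketch of the cross-worker "done early" handshake used in the distributed variant
# above: every worker all-reduces an integer flag each step, so as soon as any worker
# runs out of batches the others stop before they would hang at the next collective.
# This helper assumes torch.distributed has already been initialized by the caller.
import torch
import torch.distributed as dist


def any_worker_done(i_am_done: bool, device: torch.device) -> bool:
    flag = torch.tensor(1 if i_am_done else 0, device=device)
    # NCCL process groups don't support BoolTensor, hence the integer flag.
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)
    return flag.item() > 0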
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() images = [] text = [] segment_ids = [] labels = [] for i in range(len(batch_group[0]['images'])): positive_index = random.randint(0, self.num_negative_samples) labels.append(positive_index) if self.retrieve_text: instance_text = [] instance_segment_ids = [] for j in range(self.num_negative_samples + 1): if j == positive_index: instance_text.append(batch_group[0]['token_ids'] ['tokens'][i, :].tolist()) instance_segment_ids.append( batch_group[0]['segment_ids'][i].tolist()) else: negative_sample_index = random.choice( self.train_indices) text_field = TextField( self.train_text_db[negative_sample_index], self.train_token_indexers) text_field.index(self.model.vocab) padding_lengths = text_field.get_padding_lengths() instance_text.append( text_field.as_tensor( padding_lengths=padding_lengths) ['tokens'].tolist()) instance_segment_ids.append( self.train_segment_ids_db[ negative_sample_index].tolist()) text += instance_text segment_ids += instance_segment_ids else: instance_images = [ None for _ in range(self.num_negative_samples + 1) ] for j in range(self.num_negative_samples + 1): if j == positive_index: instance_images[j] = np.expand_dims( batch_group[0]['images'][i].numpy(), 0) else: instance_images[j] = np.expand_dims( random.choice(self.train_image_db), 0) images += instance_images matching_label_field_name = "labels" if self.retrieve_text: max_text_len = max([len(sequence) for sequence in text]) text = [ sequence + [0 for _ in range(max_text_len - len(sequence))] for sequence in text ] batch_group[0]['token_ids'] = { 'tokens': torch.LongTensor(text) } segment_ids = [ sequence + [0 for _ in range(max_text_len - len(sequence))] for sequence in segment_ids ] batch_group[0]['segment_ids'] = torch.from_numpy( np.array(segment_ids, dtype=np.int64)) else: batch_group[0]['images'] = torch.from_numpy(np.vstack(images)) batch_group[0][matching_label_field_name] = torch.from_numpy( np.array(labels, dtype=np.int64)) loss = self.batch_loss(batch_group, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. 
if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
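# A toy sketch of the 1-positive / N-negative candidate construction used in the
# retrieval variant above: for each example the positive is placed at a random slot
# among sampled negatives, and that slot index becomes the classification label.
import random
import torch


def build_candidates(positive: torch.Tensor, negative_pool, num_negatives: int):
    positive_index = random.randint(0, num_negatives)
    candidates = []
    for slot in range(num_negatives + 1):
        if slot == positive_index:
            candidates.append(positive)
        else:
            candidates.append(random.choice(negative_pool))
    return torch.stack(candidates), positive_index


if __name__ == "__main__":
    pool = [torch.randn(3) for _ in range(10)]
    cands, label = build_candidates(torch.zeros(3), pool, num_negatives=4)
    print(cands.shape, "positive at slot", label)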
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps ) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps ) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm( batch_group_generator, total=num_training_batches ) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 logger.info("Training") cumulative_batch_group_size = 0 for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() self._writer.log({"step_loss": loss.item()}, step=self._batch_num_total) train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) self._writer.log({"lr": self.optimizer.param_groups[0]['lr']}, step=self._batch_num_total) # Save model if needed. if ( self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master ): last_save_time = time.time() self._save_checkpoint( "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time))) ) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)  # Returns 1 even when no GPU is available.

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(
        train_generator, total=num_training_batches)  # Just renders a progress bar.
    cumulative_batch_size = 0
    self.optimizer.zero_grad()
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')
        try:
            # The input data has already had the all-KEEP examples filtered out.
            loss = self.batch_loss(batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
            raise e
        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()
        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        train_loss += loss.item() * iter_len

        del batch_group, loss
        torch.cuda.empty_cache()  # Free up RAM and GPU memory.

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()  # Only update the weights once every several accumulated batches.
                self.optimizer.zero_grad()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)  # Compute accuracy metrics.
        description = training_util.description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed, at a fixed time interval.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model, train_loss,
                                        batches_this_epoch, reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
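# A small helper version of the CUDA memory tracing done above: report allocated and
# cached/reserved memory in GB at a named point in the step. memory_reserved is the
# newer name for memory_cached in recent PyTorch; the fallback branch is an assumption
# for the older versions this trainer targets.
import torch


def report_cuda_memory(tag: str) -> None:
    if not torch.cuda.is_available():
        return
    if hasattr(torch.cuda, "memory_reserved"):
        cached = torch.cuda.memory_reserved()
    else:  # older PyTorch, as used by the trainer above
        cached = torch.cuda.memory_cached()
    print(f"{tag} - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}")
    print(f"{tag} - Cuda memory cached: {cached / 1e9}")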
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    train_loss_lang1 = 0.0
    train_loss_lang2 = 0.0
    train_loss_cm = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()
        self.optimizer_lang1.zero_grad()
        self.optimizer_lang2.zero_grad()
        self.optimizer_cm.zero_grad()

        loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
            batch_group, for_training=True)
        if torch.isnan(loss):
            # if any one of the loss_* terms is nan, loss will be nan
            raise ValueError("nan loss encountered")

        #######
        # lang1
        #######
        loss_lang1.backward()
        train_loss_lang1 += loss_lang1.item()
        self.rescale_gradients()
        if self._learning_rate_scheduler_lang1:
            self._learning_rate_scheduler_lang1.step_batch(batch_num_total)
        if self._momentum_scheduler_lang1:
            self._momentum_scheduler_lang1.step_batch(batch_num_total)
        self.optimizer_lang1.step()
        self.optimizer_lang1.zero_grad()

        #######
        # lang2
        #######
        loss_lang2.backward()
        train_loss_lang2 += loss_lang2.item()
        batch_grad_norm = self.rescale_gradients()
        if self._learning_rate_scheduler_lang2:
            self._learning_rate_scheduler_lang2.step_batch(batch_num_total)
        if self._momentum_scheduler_lang2:
            self._momentum_scheduler_lang2.step_batch(batch_num_total)
        self.optimizer_lang2.step()
        self.optimizer_lang2.zero_grad()

        #######
        # cm
        #######
        loss_cm.backward()
        train_loss_cm += loss_cm.item()
        self.rescale_gradients()
        if self._learning_rate_scheduler_cm:
            self._learning_rate_scheduler_cm.step_batch(batch_num_total)
        if self._momentum_scheduler_cm:
            self._momentum_scheduler_cm.step_batch(batch_num_total)
        self.optimizer_cm.step()
        self.optimizer_cm.zero_grad()

        train_loss += loss.item()

        # Update the description with the latest metrics
        # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        metrics = self.model.get_metrics(False)
        metrics["loss"] = float(
            train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cm_loss"] = float(
            train_loss_cm / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang1_loss"] = float(
            train_loss_lang1 / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang2_loss"] = float(
            train_loss_lang2 / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        description = training_util.description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer_lang1)
            self._tensorboard.log_learning_rates(self.model, self.optimizer_lang2)
            self._tensorboard.log_learning_rates(self.model, self.optimizer_cm)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.add_train_scalar("loss/cm_loss_train", metrics["cm_loss"])
            self._tensorboard.add_train_scalar("loss/lang1_loss_train", metrics["lang1_loss"])
            self._tensorboard.add_train_scalar("loss/lang2_loss_train", metrics["lang2_loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics = self.model.get_metrics(reset=True)
    metrics["loss"] = float(
        train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["cm_loss"] = float(
        train_loss_cm / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["lang1_loss"] = float(
        train_loss_lang1 / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["lang2_loss"] = float(
        train_loss_lang2 / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
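# A toy sketch of the separate-optimizer pattern above: each loss gets its own
# optimizer (and could get its own scheduler), and each is stepped right after its
# own backward pass. The two heads here are unrelated toy modules, so the losses do
# not share a graph; with a shared encoder, retain_graph or a combined loss would be
# needed instead.
import torch

head_a = torch.nn.Linear(4, 1)
head_b = torch.nn.Linear(4, 1)
opt_a = torch.optim.Adam(head_a.parameters(), lr=1e-3)
opt_b = torch.optim.SGD(head_b.parameters(), lr=1e-2)

x = torch.randn(8, 4)
loss_a = head_a(x).pow(2).mean()
loss_b = head_b(x).abs().mean()

for loss, opt in ((loss_a, opt_a), (loss_b, opt_b)):
    opt.zero_grad()
    loss.backward()
    opt.step()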
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() num_training_batches = [math.ceil( self.iterator.get_num_batches(train_data) / self._num_gradient_accumulation_steps ) for task, train_data in self.train_datas.items()] assert len(set(num_training_batches)) == 1, "num_training_batches doesn't agree" tasks = list(self.batch_group_generators.keys()) num_tasks = len(tasks) #if isinstance(self._learning_rate_scheduler, SlantedTriangular): # old_num_steps_per_epoch = self._learning_rate_scheduler.num_steps_per_epoch # self._learning_rate_scheduler.num_steps_per_epoch = num_training_batches[0] # logger.info(f"modify num_steps_per_epoch of lr scheduler from" # f"{old_num_steps_per_epoch} to {num_training_batches}") self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 logger.info("Training") cumulative_batch_group_size = 0 tqdm_bar = Tqdm.tqdm(range(num_training_batches[0])) for _ in tqdm_bar: randperms = torch.randperm(len(tasks)).tolist() sampled_tasks = [tasks[idx] for idx in randperms[:self._tasks_per_step]] sampled_task_generators = [next(self.batch_group_generators[task]) for task in sampled_tasks] batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() task_metrics = self.wrapper(tasks=sampled_task_generators, train=True, meta_train=True) losses = [list(map(lambda x: x["loss"], metrics)) for metrics in task_metrics] LASes = [list(map(lambda x: x["metric"]["LAS"], metrics)) for metrics in task_metrics] names = ["loss", "LAS"] list_values = [losses, LASes] if self.has_VIB: KLDivs = [list(map(lambda x: x["metric"]["kl_div"], metrics)) for metrics in task_metrics] names.append("KLDiv") list_values.append(KLDivs) if self.has_pos: pos_accs = [list(map(lambda x: x["metric"].get("pos_accuracy", 0.0), metrics)) for metrics in task_metrics] names.append("pos_acc") list_values.append(pos_accs) for name, values in zip(names, list_values): self._writer.log({f"step_{name}_{task}_{i}": value for task, task_values in zip(sampled_tasks, values) for i, value in enumerate(task_values)}, step=self._batch_num_total) values_inner_steps = list(map(np.mean, zip(*values))) self._writer.log({f"step_{name}_{i}": value for i, value in enumerate(values_inner_steps)}, step=self._batch_num_total) if name == "loss": train_loss += values_inner_steps[0] batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. 
if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) # variational information bottleneck / meta-learning without memorization if self.has_VIB: kl_loss, kl_div, kl_div2 = ContinuousVIB.get_kl_loss(self.model, sampled_task_generators) kl_loss.backward() self._writer.log({"kl_loss": kl_loss.detach().item(), "kl_div": kl_div, "kl_div2": kl_div2}, step=self._batch_num_total) # adversarial training if self.task_D and self.optim_D: # D training self.optimizer.step() steps_per_update = self.task_D.steps_per_update if (batch_num_total - 1) % steps_per_update == 0: self.optim_D.zero_grad() hidden_states, labels, masks = self.task_D.get_hidden_states( self.model, sampled_task_generators ) D_loss, _, acc = self.task_D(hidden_states, labels, masks, detach=True) D_loss.backward() disc_grad_norm = training_util.rescale_gradients(self.task_D, self.task_D.disc_grad_norm) self.optim_D.step() self._writer.log({"D_loss": D_loss.detach().item(), "D_acc": acc}, step=self._batch_num_total) if disc_grad_norm: self._writer.log({"D_grad_norm": disc_grad_norm.detach().item()}, step=self._batch_num_total) # G training hidden_states, labels, masks = self.task_D.get_hidden_states( self.model, sampled_task_generators ) _, g_loss, acc = self.task_D(hidden_states, labels, masks) if self.task_D.weight: alpha = self.task_D.weight else: alpha = self.task_D.get_alpha(self._batch_num_total, num_training_batches[0] * self._num_epochs) G_loss = -alpha * g_loss G_loss.backward() gen_grad_norm = training_util.rescale_gradients(self.model, self.task_D.gen_grad_norm) self._writer.log({"G_loss": g_loss.detach().item(), "alpha": alpha, "G_acc": acc}, step=self._batch_num_total) if gen_grad_norm: self._writer.log({"G_grad_norm": gen_grad_norm.detach().item()}, step=self._batch_num_total) self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.wrapper.container, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) tqdm_bar.set_description(description, refresh=False) # log learning rate. self._writer.log({"lr": self.optimizer.param_groups[0]['lr']}, step=self._batch_num_total) # Save model if needed. if ( self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master ): last_save_time = time.time() self._save_checkpoint( "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time))) ) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.wrapper.container, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
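# A compact sketch of the adversarial alternation in the meta-training variant above:
# the discriminator is updated on detached features, then the feature extractor is
# pushed in the opposite direction by negating the discriminator loss scaled by alpha.
# All modules and data here are toy stand-ins for the trainer's task_D machinery.
import torch

encoder = torch.nn.Linear(4, 8)
discriminator = torch.nn.Linear(8, 2)
opt_g = torch.optim.Adam(encoder.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(discriminator.parameters(), lr=1e-3)
ce = torch.nn.CrossEntropyLoss()

x = torch.randn(16, 4)
domain_labels = torch.randint(0, 2, (16,))
alpha = 0.1

# D step: features are detached so only the discriminator is updated.
opt_d.zero_grad()
d_loss = ce(discriminator(encoder(x).detach()), domain_labels)
d_loss.backward()
opt_d.step()

# G step: the encoder is trained to *increase* the discriminator loss.
opt_g.zero_grad()
g_loss = -alpha * ce(discriminator(encoder(x)), domain_labels)
g_loss.backward()
opt_g.step()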
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() # Get tqdm for the training batches train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) num_training_batches = self.iterator.get_num_batches(self.train_data) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 self.optimizer.zero_grad() for batch_id, batch in enumerate(train_generator_tqdm): batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } if (batch_id + 1) % self._accumulation_steps == 0: self.optimizer.step() self.optimizer.zero_grad() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: if (batch_id + 1) % self._accumulation_steps == 0: self.optimizer.step() self.optimizer.zero_grad() # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = training_util.get_batch_size(batch) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
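# The time-based checkpointing guard that recurs throughout these trainers, pulled out
# as a tiny sketch: save at most once every save_interval seconds. The save callable is
# a stand-in for the trainer's _save_checkpoint.
import time


def maybe_checkpoint(last_save_time: float, save_interval: float, save_fn) -> float:
    now = time.time()
    if now - last_save_time > save_interval:
        save_fn(int(now))
        return now
    return last_save_time


if __name__ == "__main__":
    last = time.time() - 100.0            # pretend the last save was 100 s ago
    last = maybe_checkpoint(last, 60.0, lambda ts: print("checkpoint at", ts))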
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() #num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) #train_generator = lazy_groups_of(raw_train_generator, num_gpus) #num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus) num_training_batches = 1 self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(raw_train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch, lr_mult in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() # batch_grad_norm = self.rescale_gradients() if self._grad_clipping: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._grad_clipping) # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) # We dynamically adjust the learning rate to account for slight variations in the input # sequences original_lr = self.optimizer.param_groups[0]['lr'] batch_lr = original_lr * lr_mult self.optimizer.param_groups[0]['lr'] = batch_lr if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() self.optimizer.param_groups[0]['lr'] = original_lr # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): # self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
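# The variant above temporarily scales the learning rate by a per-batch multiplier
# (`lr_mult`) yielded by the data generator and restores it after the step. A minimal
# sketch of that pattern for a generic optimizer; the helper name is illustrative.
def step_with_lr_multiplier(optimizer, lr_mult):
    original_lrs = [group['lr'] for group in optimizer.param_groups]
    for group, base_lr in zip(optimizer.param_groups, original_lrs):
        group['lr'] = base_lr * lr_mult   # scale the learning rate for this batch only
    optimizer.step()
    for group, base_lr in zip(optimizer.param_groups, original_lrs):
        group['lr'] = base_lr             # restore the base learning rate afterwards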
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains on one epoch. Differs from the base trainer in that it performs a
    Reptile-style outer update over task-specific data generators.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    raw_generators = []

    # fix max number of batches
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    cumulative_batch_size = 0
    batch_grad_norm = None  # only set when self.batch_norm is enabled
    for meta_batch in range(self.meta_batches):
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Build one generator per task. The inner loop variable must not shadow the
        # outer meta-batch index that is passed to reptile_outer_update().
        train_generators = []
        for task_index, train_info in enumerate(self.train_data):
            raw_train_generator = self.iterator(train_info,
                                                num_epochs=1,
                                                shuffle=self.shuffle)
            train_generators.append(
                lazy_groups_of(raw_train_generator, num_gpus))

        loss_batch = self.reptile_outer_update(train_generators, meta_batch, num_gpus)
        # TODO: figure out whether this is important
        train_loss = loss_batch
        logger.info('train_loss is: %s', train_loss)

        # TODO: figure out BATCH NORM MAML https://openreview.net/pdf?id=HygBZnRctX
        if self.batch_norm:
            batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        # TODO: investigate learning rate scheduling for meta learning
        # if self._learning_rate_scheduler:
        #     self._learning_rate_scheduler.step_batch(batch_num_total)
        # if self._momentum_scheduler:
        #     self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            # NOTE: no batch group is available at this level in the meta-learning
            # setting, so the original per-batch size computation cannot run as written:
            #     cur_batch = sum(training_util.get_batch_size(batch) for batch in batch_group)
            # reptile_outer_update() would need to report the batch sizes it consumed
            # for this logging to be meaningful.
            cur_batch = 0
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
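# The trainer above delegates the meta-update to self.reptile_outer_update(), whose
# definition is not shown here. As a rough sketch of what a Reptile-style outer update
# does, assuming a task-adapted copy of the model has already been trained for a few
# inner steps (meta_lr and the helper name are illustrative assumptions, not the
# trainer's actual API):
import torch


def reptile_outer_step(model, task_model, meta_lr=0.1):
    with torch.no_grad():
        for meta_param, task_param in zip(model.parameters(), task_model.parameters()):
            # Move the meta-parameters a fraction of the way toward the task-adapted weights.
            meta_param.add_(meta_lr * (task_param - meta_param))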
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches.
    # Turn the training data into an iterable of single batches.
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    # Group the single batches into lists of one batch per GPU.
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    # Round up to get the number of batch groups (total batches / number of GPUs).
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    # The default accumulated_batch_count is 4; `residue` is the leftover tail of
    # batches that does not fill a complete accumulation window.
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    # Training progress bar.
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    # Zero the gradients once before accumulation starts.
    self.optimizer.zero_grad()
    # Start training.
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        # One effective batch spans accumulated_batch_count iterations (gradient
        # accumulation); the final window only spans the residue.
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')
        try:
            # Scale the loss so gradients are averaged over the accumulation window.
            loss = self.batch_loss(batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
            raise e

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")
        # Backpropagate.
        loss.backward()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        # Undo the scaling so train_loss tracks the raw per-batch loss.
        train_loss += loss.item() * iter_len

        # Compute the batch size now, before batch_group is deleted below, so the
        # optional batch-size logging further down does not reference a deleted name.
        if self._log_batch_size_period:
            cur_batch = sum(training_util.get_batch_size(batch) for batch in batch_group)

        # Delete the large temporaries. During training, unused temporaries can pile up
        # and cause out-of-memory errors; the call below releases the cached GPU memory
        # they occupied.
        del batch_group, loss
        torch.cuda.empty_cache()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
            print(f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

        # Rescale (clip) the gradients.
        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        # The learning rate is adjusted as training progresses, usually decaying;
        # momentum helps the optimizer escape local minima and saddle points.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                # Apply the accumulated gradients, then reset them.
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                # Compute the (L2) norm of the update and of the parameter.
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update moving averages. With Adam or SGD, a moving average of the weights is
        # often maintained to smooth updates and improve robustness on held-out data.
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
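# The repeated CUDA memory printouts above could be factored into a small helper. The
# sketch below is an illustration, not part of the trainer; note that
# torch.cuda.memory_cached() is deprecated in newer PyTorch releases in favour of
# torch.cuda.memory_reserved().
import torch


def log_cuda_memory(stage: str) -> None:
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / 1e9
        reserved_gb = torch.cuda.memory_reserved() / 1e9
        print(f'{stage} - Cuda memory allocated: {allocated_gb:.2f} GB, '
              f'cached/reserved: {reserved_gb:.2f} GB')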
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of( raw_train_generator, num_gpus * self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / (num_gpus * self._num_gradient_accumulation_steps)) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: if not self._graph_added and self._require_graph: model_copy = deepcopy(self.model) model_copy.log_graph() wrapped_model = ModelWrapper(model_copy) graph_inputs = wrapped_model.process_inputs(batch_group[0]) # print(deepcopy(wrapped_model)(graph_inputs)) self._tensorboard.add_graph(wrapped_model, [graph_inputs]) self._graph_added = True batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() num_batch = len(batch_group) // num_gpus for i in range(num_batch): if (i + 1) * num_gpus > len(batch_group): batch_i = batch_group[i * num_gpus:] else: batch_i = batch_group[i * num_gpus:(i + 1) * num_gpus] loss = self.batch_loss(batch_i, for_training=True) if loss is None or torch.isnan(loss): print("nan loss") continue # raise ValueError("nan loss encountered") loss = loss / num_batch # try: # loss.backward() # except Exception: # print("loss: ", loss) # print(batch_group) # with torch.autograd.set_detect_anomaly(True): This can potentially lead to slower training # loss.backward() # loss = loss.half() loss.backward() # with amp.scale_loss(loss, self.optimizer) as scaled_loss: # try: # scaled_loss.backward() # except RuntimeError: # print("CUDA out of memory during backward()") # continue train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"current batch size: {cur_batch} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory return metrics
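# The variant above asks the iterator for num_gpus * accumulation_steps batches at a
# time, then splits that oversized group into per-step chunks of num_gpus batches and
# averages the loss across chunks. A minimal sketch of that chunking logic with a
# stand-in batch_loss function; the names here are illustrative assumptions.
def accumulate_over_chunks(batch_group, num_gpus, batch_loss, optimizer):
    optimizer.zero_grad()
    chunks = [batch_group[i:i + num_gpus] for i in range(0, len(batch_group), num_gpus)]
    total_loss = 0.0
    for chunk in chunks:
        loss = batch_loss(chunk) / len(chunks)  # average so the update matches one large batch
        loss.backward()                         # gradients accumulate across chunks
        total_loss += loss.item()
    optimizer.step()
    return total_loss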
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. only report system utils when we are local rank 0 at each machine. """ logger.info("Rank %d: Epoch %d/%d", self._rank, epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() if self._is_chief: logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() # should be 1 anyway, because we are only dealing with nprocess_with_ngpus num_gpus = len(self._cuda_device) # TODO: Implementation of whether the generator should take into account of worldsize. # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 # NOTE: only work in nprocess_ngpus device = torch.device("cuda:%d" % self._cuda_device[0]) for batch_group in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() loss = self.batch_loss(batch_group, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._is_chief: # only chief do tensorboard if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view( -1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() else: self.optimizer.step() # Update moving averages # NOTE: not sure whether this need to be average if self._moving_average is not None: self._moving_average.apply(batch_num_total) metrics = get_metrics(self.model, device, self._worldsize, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description( ("Rank %d: " % self._rank) + description, refresh=False) if self._is_chief: # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates( self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([ training_util.get_batch_size(batch) for batch in batch_group ]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch logger.info( f"rank {self._rank}, current batch size: {cur_batch} mean batch size: {average}" ) if self._is_chief: self._tensorboard.add_train_scalar( "current_batch_size", cur_batch) self._tensorboard.add_train_scalar( "mean_batch_size", average) if self._is_chief: # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval): last_save_time = time.time() self._save_checkpoint('{0}.{1}'.format( epoch, training_util.time_to_str(int(last_save_time)))) metrics = get_metrics(self.model, device, self._worldsize, train_loss, batches_this_epoch) metrics['cpu_memory_MB'] = peak_cpu_usage return metrics
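# The distributed variant above relies on a worldsize-aware get_metrics() to average the
# loss across workers. One plausible way to implement that averaging with
# torch.distributed is sketched below; this is an assumption about the helper, not its
# actual definition.
import torch
import torch.distributed as dist


def average_across_workers(value: float, device: torch.device, world_size: int) -> float:
    tensor = torch.tensor(value, device=device, dtype=torch.float32)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)  # sum the scalar over all ranks
    return tensor.item() / world_size              # mean over the whole world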