Example #1
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        if torch.cuda.is_available():
            for gpu, memory in gpu_memory_mb().items():
                logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._optimizer.zero_grad()
            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            # Make sure Variable is on the cpu before converting to numpy.
            # .cpu() is a no-op if you aren't using GPUs.
            train_loss += loss.data.cpu().numpy()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)
            if hasattr(self, "_tf_params") and self._tf_params is not None:
                # We have TF logging
                if self._batch_num_total % self._tf_params["log_every"] == 0:
                    self._tf_log(metrics, self._batch_num_total)

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
Example #2
def monitor(min_memory: int, check_interval: int) -> List[Tuple[int, int]]:
    # Poll GPU usage until at least one GPU reports less than `min_memory` MB
    # in use, then return the qualifying (gpu_id, used_memory_mb) pairs.
    available_gpu = []
    while not available_gpu:
        for gpu, memory in gpu_memory_mb().items():
            if memory < min_memory:
                available_gpu.append((gpu, memory))
        sleep(check_interval)
    return available_gpu
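A usage sketch for the helper above (the threshold, polling interval, and CUDA_VISIBLE_DEVICES handling are assumptions added for illustration, not part of the original snippet):

import os

# Hypothetical usage: block until some GPU reports less than 1000 MB in use,
# polling every 30 seconds, then pin this process to the first such GPU.
free_gpus = monitor(min_memory=1000, check_interval=30)
chosen_gpu, used_mb = free_gpus[0]
os.environ["CUDA_VISIBLE_DEVICES"] = str(chosen_gpu)
print(f"Using GPU {chosen_gpu} (currently {used_mb} MB in use)")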
Example #3
 def measure_cpu_gpu(self, trainer: "CallbackTrainer"):
     # This used to be in train_epoch()
     logger.info("Epoch %d/%d", trainer.epoch_number,
                 trainer.num_epochs - 1)
     self.peak_cpu_usage = peak_memory_mb()
     logger.info(f"Peak CPU memory usage MB: {self.peak_cpu_usage}")
     self.gpu_usage.clear()
     for gpu, memory in gpu_memory_mb().items():
         self.gpu_usage.append((gpu, memory))
         logger.info(f"GPU {gpu} memory usage MB: {memory}")
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metrics = {
            'accuracy': self.metrics['accuracy'].get_metric(reset=reset),
            'average_precision': self.metrics['average_precision'].get_metric(reset=reset),
            'f1': self.metrics['f1_score'].get_metric(reset=reset),
            'auc': self.metrics['auc'].get_metric(reset=reset),
        }
        # Attach per-GPU memory usage (in MB) to the reported metrics.
        for gpu_num, memory in gpu_memory_mb().items():
            metrics['gpu_batch_' + str(gpu_num) + '_memory_MB'] = memory

        return metrics
Example #5
File: trainer.py  Project: wj-Mcat/allennlp
 def __call__(
     self,
     trainer: "GradientDescentTrainer",
     batch_inputs: List[List[TensorDict]],
     batch_outputs: List[Dict[str, Any]],
     epoch: int,
     batch_number: int,
     is_training: bool,
     is_master: bool,
 ) -> None:
     # In the distributed case we need to call this from every worker, since every
     # worker reports its own memory usage.
     cpu_memory_usage = common_util.peak_memory_mb()
     # But we only want to call `gpu_memory_mb` and `log_memory_usage` from the
     # master process.
     if is_master:
         gpu_memory_usage = common_util.gpu_memory_mb()
         trainer._tensorboard.log_memory_usage(cpu_memory_usage, gpu_memory_usage)
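For reference, `peak_memory_mb` in these examples reports the process's peak resident memory in megabytes. A minimal sketch of such a helper, assuming the standard-library `resource` module rather than AllenNLP's actual implementation:

import resource
import sys

def peak_memory_mb_sketch() -> float:
    # Peak resident set size of the current process (Unix-only `resource` module).
    # ru_maxrss is reported in kilobytes on Linux and in bytes on macOS,
    # so normalize both to megabytes.
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return peak / 1_000_000 if sys.platform == "darwin" else peak / 1_000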
Example #6
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_reg_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches = math.ceil(
            len(self.data_loader) / self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs["reg_loss"]
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                reg_loss = reg_loss / len(batch_group)
                if self._opt_level is not None:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss += loss.item()
                train_reg_loss += reg_loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(self.model, self.optimizer,
                                            batch_grad_norm, metrics,
                                            batch_group, param_updates)

            if self._master:
                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        batch_group,
                        batch_group_outputs,
                        epoch,
                        batches_this_epoch,
                        is_training=True,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
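The grouping above relies on `common_util.lazy_groups_of` to chunk the batch iterator into groups of `_num_gradient_accumulation_steps` batches. A minimal sketch of such a grouping helper (written here as a generic utility for illustration, not AllenNLP's exact code):

from itertools import islice
from typing import Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of_sketch(iterator: Iterator[T], group_size: int) -> Iterator[List[T]]:
    # Yield successive lists of up to `group_size` items without
    # materializing the whole iterator in memory.
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group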
Example #7
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            # Make sure Variable is on the cpu before converting to numpy.
            # .cpu() is a no-op if you aren't using GPUs.
            train_loss += loss.data.cpu().numpy()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().data.cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().data.cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, ))
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
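The histogram branch above measures how far each parameter moves in a single optimizer step by snapshotting parameters on the CPU before the step and subtracting afterwards. Isolated from the trainer (the function and names below are placeholders for illustration, not AllenNLP API), the pattern is roughly:

import torch

def step_and_measure_updates(model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    # Snapshot parameters on the CPU, take the optimizer step, then return the
    # relative magnitude of each parameter's update for logging.
    before = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
    optimizer.step()
    ratios = {}
    for name, p in model.named_parameters():
        update = before[name] - p.detach().cpu()
        ratios[name] = (torch.norm(update) / (torch.norm(p.detach().cpu()) + 1e-7)).item()
    return ratios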
Example #8
 def log_memory_usage(self):
     cpu_memory_usage = peak_memory_mb()
     self.add_train_scalar("memory_usage/cpu", cpu_memory_usage)
     for gpu, memory in gpu_memory_mb().items():
         self.add_train_scalar(f"memory_usage/gpu_{gpu}", memory)
Example #9
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)  # Returns 1 even when no GPU is available.

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        residue = num_training_batches % self.accumulated_batch_count
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(
            train_generator, total=num_training_batches)  # Just renders a progress bar.
        cumulative_batch_size = 0
        self.optimizer.zero_grad()
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            iter_len = self.accumulated_batch_count \
                if batches_this_epoch <= (num_training_batches - residue) else residue

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            try:
                loss = self.batch_loss(
                    batch_group,
                    for_training=True) / iter_len  # Inputs made up entirely of "keep" labels have been filtered out.
            except RuntimeError as e:
                print(e)
                for x in batch_group:
                    all_words = [len(y['words']) for y in x['metadata']]
                    print(f"Total sents: {len(all_words)}. "
                          f"Min {min(all_words)}. Max {max(all_words)}")
                    for elem in ['labels', 'd_tags']:
                        tt = x[elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                    for elem in ["bert", "mask", "bert-offsets"]:
                        tt = x['tokens'][elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                raise e

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            train_loss += loss.item() * iter_len

            del batch_group, loss
            torch.cuda.empty_cache()  # Release cached GPU memory (the deletes above free the host references).

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()  # Only step the optimizer once every several accumulated batches.
                    self.optimizer.zero_grad()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)  # Computes accuracy and other metrics.
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed, at a fixed time interval.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Example #10
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        # Get tqdm for the training batches
        train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        num_training_batches = self.iterator.get_num_batches(self.train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0

        self.optimizer.zero_grad()
        for batch_id, batch in enumerate(train_generator_tqdm):
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if (batch_id + 1) % self._accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if (batch_id + 1) % self._accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = training_util.get_batch_size(batch)
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
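Both variants above implement gradient accumulation: each loss is divided by the group size and the optimizer steps only every `accumulated_batch_count` (respectively `_accumulation_steps`) batches. Stripped of the logging, the core pattern looks roughly like this sketch (the model, data loader, and loss function are placeholders):

import torch
import torch.nn.functional as F

def train_with_accumulation(model, data_loader, optimizer, accumulation_steps: int = 4):
    # Accumulate gradients over several small batches before each optimizer step,
    # emulating a larger effective batch size.
    model.train()
    optimizer.zero_grad()
    for batch_id, (inputs, targets) in enumerate(data_loader):
        loss = F.cross_entropy(model(inputs), targets)
        (loss / accumulation_steps).backward()  # scale so the accumulated gradient is an average
        if (batch_id + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()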
Example #11
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        num_training_batches = [math.ceil(
            self.iterator.get_num_batches(train_data) / self._num_gradient_accumulation_steps
        ) for task, train_data in self.train_datas.items()]
        assert len(set(num_training_batches)) == 1, "num_training_batches doesn't agree"
        tasks = list(self.batch_group_generators.keys())
        num_tasks = len(tasks)

        #if isinstance(self._learning_rate_scheduler, SlantedTriangular):
        #    old_num_steps_per_epoch = self._learning_rate_scheduler.num_steps_per_epoch
        #    self._learning_rate_scheduler.num_steps_per_epoch = num_training_batches[0]
        #    logger.info(f"modify num_steps_per_epoch of lr scheduler from"
        #                f"{old_num_steps_per_epoch} to {num_training_batches}")

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        logger.info("Training")

        cumulative_batch_group_size = 0
        tqdm_bar = Tqdm.tqdm(range(num_training_batches[0]))
        for _ in tqdm_bar:
            randperms = torch.randperm(len(tasks)).tolist()
            sampled_tasks = [tasks[idx] for idx in randperms[:self._tasks_per_step]]
            sampled_task_generators = [next(self.batch_group_generators[task]) for task in sampled_tasks]

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            task_metrics = self.wrapper(tasks=sampled_task_generators, train=True, meta_train=True)

            losses = [list(map(lambda x: x["loss"], metrics)) for metrics in task_metrics]
            LASes = [list(map(lambda x: x["metric"]["LAS"], metrics)) for metrics in task_metrics]

            names = ["loss", "LAS"]
            list_values = [losses, LASes]

            if self.has_VIB:
                KLDivs = [list(map(lambda x: x["metric"]["kl_div"], metrics)) for metrics in task_metrics]
                names.append("KLDiv")
                list_values.append(KLDivs)

            if self.has_pos:
                pos_accs = [list(map(lambda x: x["metric"].get("pos_accuracy", 0.0), metrics)) for metrics in task_metrics]
                names.append("pos_acc")
                list_values.append(pos_accs)


            for name, values in zip(names, list_values):
                self._writer.log({f"step_{name}_{task}_{i}": value
                                  for task, task_values in zip(sampled_tasks, values)
                                  for i, value in enumerate(task_values)},
                                 step=self._batch_num_total)
                values_inner_steps = list(map(np.mean, zip(*values)))
                self._writer.log({f"step_{name}_{i}": value for i, value in
                                  enumerate(values_inner_steps)},
                                 step=self._batch_num_total)
                if name == "loss":
                    train_loss += values_inner_steps[0]

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            # variational information bottleneck / meta-learning without memorization
            if self.has_VIB:
                kl_loss, kl_div, kl_div2 = ContinuousVIB.get_kl_loss(self.model, sampled_task_generators)
                kl_loss.backward()
                self._writer.log({"kl_loss": kl_loss.detach().item(),
                                  "kl_div": kl_div,
                                  "kl_div2": kl_div2},
                                  step=self._batch_num_total)

            # adversarial training
            if self.task_D and self.optim_D:
                # D training
                self.optimizer.step()
                steps_per_update = self.task_D.steps_per_update
                if (batch_num_total - 1) % steps_per_update == 0:
                    self.optim_D.zero_grad()
                    hidden_states, labels, masks = self.task_D.get_hidden_states(
                        self.model,
                        sampled_task_generators
                    )
                    D_loss, _, acc = self.task_D(hidden_states, labels, masks, detach=True)
                    D_loss.backward()
                    disc_grad_norm = training_util.rescale_gradients(self.task_D, self.task_D.disc_grad_norm)
                    self.optim_D.step()
                    self._writer.log({"D_loss": D_loss.detach().item(),
                                      "D_acc": acc},
                                     step=self._batch_num_total)
                    if disc_grad_norm:
                        self._writer.log({"D_grad_norm": disc_grad_norm.detach().item()},
                                         step=self._batch_num_total)

                # G training
                hidden_states, labels, masks = self.task_D.get_hidden_states(
                    self.model,
                    sampled_task_generators
                )
                _, g_loss, acc = self.task_D(hidden_states, labels, masks)
                if self.task_D.weight:
                    alpha = self.task_D.weight
                else:
                    alpha = self.task_D.get_alpha(self._batch_num_total,
                                                  num_training_batches[0] * self._num_epochs)
                G_loss = -alpha * g_loss
                G_loss.backward()
                gen_grad_norm = training_util.rescale_gradients(self.model, self.task_D.gen_grad_norm)
                self._writer.log({"G_loss": g_loss.detach().item(), "alpha": alpha, "G_acc": acc},
                                 step=self._batch_num_total)
                if gen_grad_norm:
                    self._writer.log({"G_grad_norm": gen_grad_norm.detach().item()},
                                     step=self._batch_num_total)

            self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)


            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.wrapper.container,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                tqdm_bar.set_description(description, refresh=False)

            # log learning rate.
            self._writer.log({"lr": self.optimizer.param_groups[0]['lr']},
                             step=self._batch_num_total)

            # Save model if needed.
            if (
                self._model_save_interval is not None
                and (time.time() - last_save_time > self._model_save_interval)
                and self._master
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                    "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
                )

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.wrapper.container,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Example #12
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        #num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        #train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        #num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
        num_training_batches = 1
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(raw_train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch, lr_mult in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            # batch_grad_norm = self.rescale_gradients()
            if self._grad_clipping:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self._grad_clipping)

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            # We dynamically adjust the learning rate to account for slight variations in the input
            # sequences
            original_lr = self.optimizer.param_groups[0]['lr']
            batch_lr = original_lr * lr_mult
            self.optimizer.param_groups[0]['lr'] = batch_lr

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            self.optimizer.param_groups[0]['lr'] = original_lr

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                # self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
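Example #12 temporarily rescales the learning rate for a single batch through `optimizer.param_groups` and restores it right after the step. A standalone sketch of that pattern (the function and argument names are placeholders):

def step_with_lr_multiplier(optimizer, loss, lr_mult: float):
    # Temporarily scale the learning rate for this one step, then restore it
    # so subsequent batches use the original value.
    original_lr = optimizer.param_groups[0]["lr"]
    optimizer.param_groups[0]["lr"] = original_lr * lr_mult
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    optimizer.param_groups[0]["lr"] = original_lr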
Example #13
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics. Differs from the base trainer in that
        it performs Reptile-style outer updates across multiple training datasets.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)
        raw_generators = []

        # fix max number of batches
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_size = 0
        for i in range(0, self.meta_batches):
            train_generators = []
            for i, train_info in enumerate(self.train_data):
                raw_train_generator = self.iterator(train_info,
                                                    num_epochs=1,
                                                    shuffle=self.shuffle)
                train_generators.append(
                    lazy_groups_of(raw_train_generator, num_gpus))

            loss_batch = self.reptile_outer_update(train_generators, i,
                                                   num_gpus)

            # TODO figure out if is important
            train_loss = loss_batch
            print('[info] train_loss is:{}'.format(train_loss))

            # TODO figure out BATCH NORM MAML https://openreview.net/pdf?id=HygBZnRctX
            if self.batch_norm:
                batch_grad_norm = self.rescale_gradients()
            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            # TODO investigate learning rate scheduling for meta learning
            #if self._learning_rate_scheduler:
            #self._learning_rate_scheduler.step_batch(batch_num_total)
            #if self._momentum_scheduler:
            #self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Example #14
File: trainer.py  Project: valueable/GEC
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        # Make the training data iterable.
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        # Group batches from the iterable into lists (one batch per GPU).
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        # Round up to get the number of batches (total batches / number of GPUs).
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        # The default accumulated batch count is 4; this computes the leftover (residue) batches after accumulation.
        residue = num_training_batches % self.accumulated_batch_count
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        # 训练进度条
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        # Zero the gradients (standard practice).
        self.optimizer.zero_grad()
        # Start training.
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total
            # One effective batch spans accumulated_batch_count iterations (gradient accumulation).
            iter_len = self.accumulated_batch_count \
                if batches_this_epoch <= (num_training_batches - residue) else residue

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            try:  # average the loss over the accumulation window
                loss = self.batch_loss(batch_group,
                                       for_training=True) / iter_len
            except RuntimeError as e:
                print(e)
                for x in batch_group:
                    all_words = [len(y['words']) for y in x['metadata']]
                    print(f"Total sents: {len(all_words)}. "
                          f"Min {min(all_words)}. Max {max(all_words)}")
                    for elem in ['labels', 'd_tags']:
                        tt = x[elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                    for elem in ["bert", "mask", "bert-offsets"]:
                        tt = x['tokens'][elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                raise e

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            # Backpropagation
            loss.backward()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            # Accumulate the training loss (multiply back by iter_len to undo the earlier division)
            train_loss += loss.item() * iter_len
            # Free these two variables
            del batch_group, loss
            # During training, PyTorch can pile up unused temporaries and run out of memory; the call below frees the cached blocks.
            torch.cuda.empty_cache()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            # Rescale (clip) the gradients
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            # The learning rate is adjusted as training proceeds, usually decaying over time.
            # Momentum helps the optimizer escape local minima and saddle points.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # Copy the parameters so logging does not blow up GPU memory
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    # Apply the accumulated gradients with optimizer.step()
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    # Compute the L2 norm of the parameter update
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Update moving averages: an exponential moving average of the weights is often kept with Adam or SGD to smooth updates and improve robustness on held-out data
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
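
The accumulation logic in this example divides each batch loss by the accumulation count and only steps the optimizer every accumulated_batch_count iterations. Below is a self-contained sketch of that pattern; the toy linear model, the random data, and the accum_steps value are assumptions made purely for illustration.

import torch
from torch import nn

# Sketch of gradient accumulation with a toy model and random data.
model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.CrossEntropyLoss()
accum_steps = 4  # stands in for accumulated_batch_count

optimizer.zero_grad()
for step in range(1, 17):
    x = torch.randn(8, 10)
    y = torch.randint(0, 2, (8,))
    loss = loss_fn(model(x), y) / accum_steps  # average over the accumulation window
    loss.backward()                            # gradients add up across iterations
    if step % accum_steps == 0:
        optimizer.step()                       # apply the accumulated gradient
        optimizer.zero_grad()

The residue handling in the example (using the smaller iter_len for the last few batches) only matters when the number of batches is not a multiple of the accumulation count.
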
Example #15
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)
            train_generator_tqdm.set_description(description, refresh=False)

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
Example #16
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps
        )

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                self._writer.log({"step_loss": loss.item()}, step=self._batch_num_total)
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

            self._writer.log({"lr": self.optimizer.param_groups[0]['lr']},
                             step=self._batch_num_total)

            # Save model if needed.
            if (
                self._model_save_interval is not None
                and (time.time() - last_save_time > self._model_save_interval)
                and self._master
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                    "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
                )

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
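
lazy_groups_of, used here to bundle batches for gradient accumulation, simply chunks a lazy iterator into fixed-size lists. The sketch below approximates it with itertools; it is an illustration, not AllenNLP's implementation.

import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    # Lazily yield lists of up to `group_size` items from the underlying iterator.
    iterator = iter(iterable)
    while True:
        group = list(itertools.islice(iterator, group_size))
        if not group:
            return
        yield group

# Grouping 7 batches for 3 gradient-accumulation steps -> [[0, 1, 2], [3, 4, 5], [6]]
print(list(groups_of(range(7), 3)))
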
Example #17
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        ############
        # Training #
        ############
        logger.info("Training")
        for batch in train_generator:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            ########
            # loss #
            ########
            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            # Make sure Variable is on the cpu before converting to numpy.
            # .cpu() is a no-op if you aren't using GPUs.
            train_loss += loss.data.cpu().numpy()

            ########################
            # Update Learning Rate #
            ########################
            self._update_learning_rate(None, batch_num_total=batch_num_total)

            #################
            # Update Params #
            #################
            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().data.cpu().clone()
                    for name, param in self._model.named_parameters()
                }
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().data.cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, ))
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7), batch_num_total)
            else:
                self._optimizer.step()

            #################
            # Print Metrics #
            #################
            # Update the description with the latest metrics
            if batches_this_epoch % 10 == 0:
                metrics = self._get_metrics(train_loss, batches_this_epoch)
                description = self._description_from_metrics(metrics)
                sys.stdout.write("At %d-th batch: %s\n" %
                                 (batches_this_epoch, description))
                sys.stdout.flush()

            ##############
            # Save model #
            ##############
            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, time_to_str(int(last_save_time))), [],
                                      is_best=False)

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
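
Several of these trainers log gradient_update/ scalars: the norm of one optimizer step relative to the norm of the parameter itself. A small standalone sketch of that measurement follows; the toy model, the random input, and the learning rate are made up.

import torch
from torch import nn

model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Snapshot the parameters, take one step, then compare.
before = {name: p.detach().clone() for name, p in model.named_parameters()}
loss = model(torch.randn(2, 4)).sum()
loss.backward()
optimizer.step()

for name, p in model.named_parameters():
    update_norm = torch.norm(p.detach() - before[name])
    param_norm = torch.norm(p.detach())
    print(name, (update_norm / (param_norm + 1e-7)).item())

The 1e-7 term, as in the examples, simply guards against division by zero for parameters that are exactly zero.
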
Example #18
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info(f"Epoch: {epoch}/{self._num_epochs - 1}")
        cpu_memory_usage = []
        for worker, memory in common_util.peak_memory_mb().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage MB: {memory}")
        gpu_memory_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.reset_loss('train')

        self.model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps)
        except TypeError:
            num_training_batches = float("inf")

        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False

        for batch_group in batch_group_generator_tqdm:

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            for component_optimizer in self.component_optimizers.values():
                component_optimizer.zero_grad()

            batch_group_metrics = []

            meta_batch = deepcopy(batch_group)

            # Train the Sub Models first
            for name, sub_model in self._pytorch_model.component_models.items(
            ):
                component_optimizer = self.component_optimizers[name]
                batch_group_outputs, metrics = component_optimizer.process_batch_group(
                    batch_group, True, batch_num_total, batches_this_epoch,
                    True)
                batch_group_metrics.append(metrics)

                for i, batch_outputs in enumerate(batch_group_outputs):
                    component_output = batch_outputs["output"]
                    component_output = component_output.detach()
                    meta_batch[i][name] = component_output

            meta_optimizer = self.component_optimizers["meta"]
            meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group(
                meta_batch, True, batch_num_total, batches_this_epoch, False)

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            batch_group_metrics.append(meta_metrics)

            all_metrics = ChainMap(*batch_group_metrics)

            description = training_util.description_from_metrics(all_metrics)
            batch_group_generator_tqdm.set_description(description,
                                                       refresh=False)

        for (worker, memory) in cpu_memory_usage:
            all_metrics["worker_" + str(worker) + "_memory_MB"] = memory
        for (gpu_num, memory) in gpu_memory_usage:
            all_metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory

        return all_metrics
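
The _moving_average.apply(batch_num_total) calls keep a shadow copy of the weights. The sketch below shows the general idea with a hand-rolled exponential moving average; the class, its apply signature, and the decay value are assumptions for illustration and do not reproduce AllenNLP's MovingAverage API.

import torch
from torch import nn

class ExponentialMovingAverage:
    def __init__(self, model: nn.Module, decay: float = 0.999):
        self.decay = decay
        # Shadow copy of every parameter, kept off the autograd graph.
        self.shadow = {n: p.detach().clone() for n, p in model.named_parameters()}

    def apply(self, model: nn.Module) -> None:
        # shadow <- decay * shadow + (1 - decay) * current parameters
        with torch.no_grad():
            for name, param in model.named_parameters():
                self.shadow[name].mul_(self.decay).add_(param.detach(), alpha=1 - self.decay)

model = nn.Linear(4, 1)
ema = ExponentialMovingAverage(model)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(2, 4)).sum()
loss.backward()
optimizer.step()
ema.apply(model)  # update the shadow weights after each optimizer step
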
Example #19
0
    def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.trainer.model.train()

        num_gpus = len(self.trainer._cuda_devices)

        self.trainer._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self.trainer._batch_num_total is None:
            self.trainer._batch_num_total = 0

        histogram_parameters = set(
            self.trainer.model.
            get_parameters_for_histogram_tensorboard_logging())
        #Pdb().set_trace()
        mixed_generator, num_training_batches = get_mixer(
            self.trainer.iterator, self.trainer.train_data,
            self.trainer.iterator, self.unlabelled_dataset, num_gpus,
            self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled)
        #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer)

        #generator for lambda update
        mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator,
                                                  self.trainer.train_data,
                                                  self.trainer.iterator,
                                                  self.unlabelled_dataset,
                                                  num_gpus, self.labelled_id,
                                                  'cm', 1.0)
        #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm')

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                         total=num_training_batches)
        #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator),
        #                                 total=num_training_batches)
        cumulative_batch_size = 0
        unlabelled_loss = 0
        unlabelled_batches_this_epoch = 0

        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False
        batch_grad_norm = None
        for batch_group, group_id in train_generator_tqdm:
            #print(batch_group[0]['sentence']['tokens'].shape)
            if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
                continue
            output_dict = self.batch_loss(
                batch_group,
                for_training=True,
                eval_metric=(group_id == self.labelled_id))
            penalties = defaultdict(float)

            if self.constraints_model is not None:
                penalties = self.constraints_model(
                    output_dict['task1_tag_logits'],
                    output_dict['task2_tag_logits'], output_dict['mask'])

            loss = 0.0
            if 'loss' in output_dict:
                loss = output_dict['loss']
                train_loss += loss.item()
            loss += output_dict.get('regularization_penalty', 0.0)

            loss += self.constraints_wt * penalties['loss']

            unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
                penalties['loss']) else penalties['loss']

            agg_loss += loss
            batches_since_last_step += 1

            if batches_since_last_step == self.backprop_after_xbatches:
                #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))
                batch_grad_norm = self.step(agg_loss)
                batches_since_last_step = 0
                agg_loss = 0.0
                flag = False
            else:
                flag = True
                #print("skipp : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))

            if (group_id != self.labelled_id):
                unlabelled_batches_this_epoch += 1
                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()
                #self.trainer.optimizer.step()
            else:
                self.total_supervised_iters += 1.0
                batches_this_epoch += 1
                self.trainer._batch_num_total += 1
                batch_num_total = self.trainer._batch_num_total

                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()

                # This does nothing if batch_num_total is None or you are using an
                # LRScheduler which doesn't update per batch.
                if self.trainer._learning_rate_scheduler:
                    self.trainer._learning_rate_scheduler.step_batch(
                        batch_num_total)

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    # get the magnitude of parameter updates for logging
                    # We need a copy of current parameters to compute magnitude of updates,
                    # and copy them to CPU so large models won't go OOM on the GPU.
                    param_updates = {
                        name: param.detach().cpu().clone()
                        for name, param in
                        self.trainer.model.named_parameters()
                    }
                    #self.trainer.optimizer.step()
                    for name, param in self.trainer.model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(
                            -1, ))
                        param_norm = torch.norm(param.view(-1, )).cpu()
                        self.trainer._tensorboard.add_train_scalar(
                            "gradient_update/" + name,
                            update_norm / (param_norm + 1e-7))
                else:
                    pass
                    #self.trainer.optimizer.step()

                # Update moving averages
                if self.trainer._moving_average is not None:
                    self.trainer._moving_average.apply(batch_num_total)
            #
                metrics = training_util.get_metrics(self.trainer.model,
                                                    train_loss,
                                                    batches_this_epoch)
                metrics["uloss"] = float(
                    unlabelled_loss /
                    (batches_this_epoch + unlabelled_batches_this_epoch))
                # Update the description with the latest metrics
                description = training_util.description_from_metrics(metrics)
                train_generator_tqdm.set_description(description,
                                                     refresh=False)

                # Log parameter values to Tensorboard
                if self.trainer._tensorboard.should_log_this_batch(
                ) and batch_grad_norm is not None:
                    self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                        self.trainer.model, batch_grad_norm)
                    self.trainer._tensorboard.log_learning_rates(
                        self.trainer.model, self.trainer.optimizer)

                    self.trainer._tensorboard.add_train_scalar(
                        "loss/loss_train", metrics["loss"])
                    self.trainer._tensorboard.log_metrics(
                        {"epoch_metrics/" + k: v
                         for k, v in metrics.items()})

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    self.trainer._tensorboard.log_histograms(
                        self.trainer.model, histogram_parameters)

                if self.trainer._log_batch_size_period:
                    cur_batch = sum([
                        training_util.get_batch_size(batch)
                        for batch in batch_group
                    ])
                    cumulative_batch_size += cur_batch
                    if (batches_this_epoch -
                            1) % self.trainer._log_batch_size_period == 0:
                        average = cumulative_batch_size / batches_this_epoch
                        logger.info(
                            f"current batch size: {cur_batch} mean batch size: {average}"
                        )
                        self.trainer._tensorboard.add_train_scalar(
                            "current_batch_size", cur_batch)
                        self.trainer._tensorboard.add_train_scalar(
                            "mean_batch_size", average)

                # Save model if needed.
                if self.trainer._model_save_interval is not None and (
                        time.time() - last_save_time >
                        self.trainer._model_save_interval):
                    last_save_time = time.time()
                    self.trainer._save_checkpoint('{0}.{1}'.format(
                        epoch, training_util.time_to_str(int(last_save_time))))

            #lambda update
            #if  (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters) and (batches_this_epoch % self.dd_update_freq == 0):
            if (self.constraints_model
                    is not None) and (self.dd_optimizer is not None) and (
                        self.total_supervised_iters >= self.dd_warmup_iters
                    ) and (self.total_supervised_iters -
                           self.last_lambda_update >= self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break

                self.count_lambda_updates += 1
                if (self.dd_increase_freq_after
                        is not None) and (self.count_lambda_updates %
                                          self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by
        if flag:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False

        #lambda update
        #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters):
        if (self.constraints_model
                is not None) and (self.dd_optimizer is not None) and (
                    self.total_supervised_iters >= self.dd_warmup_iters) and (
                        self.total_supervised_iters - self.last_lambda_update
                        >= self.dd_update_freq):
            for batch_group, group_id in mixed_generator_for_lambda:
                self.lambda_update(batch_group)
                self.last_lambda_update = self.total_supervised_iters
                break

            self.count_lambda_updates += 1
            if (self.dd_increase_freq_after
                    is not None) and (self.count_lambda_updates %
                                      self.dd_increase_freq_after == 0):
                self.dd_update_freq += self.dd_increase_freq_by

        metrics = training_util.get_metrics(self.trainer.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        metrics['lb'] = batches_this_epoch
        metrics['ub'] = unlabelled_batches_this_epoch
        metrics["uloss"] = float(
            unlabelled_loss /
            (batches_this_epoch + unlabelled_batches_this_epoch))
        if self.constraints_model is not None:
            lambda_stats_dict = self.constraints_model.lambda_stats()
            metrics.update(lambda_stats_dict)
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
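
step_batch(batch_num_total), seen throughout these examples, adjusts the learning rate once per batch rather than once per epoch. The sketch below does the same with a stock PyTorch scheduler; the linear-warmup schedule, the toy model, and warmup_steps are illustrative assumptions.

import torch
from torch import nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
warmup_steps = 100  # made-up warmup length
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps))

for batch_num_total in range(5):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 10)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()  # per-batch update, analogous to step_batch(batch_num_total)
    print(batch_num_total, optimizer.param_groups[0]["lr"])
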
Example #20
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_loss_lang1 = 0.0
        train_loss_lang2 = 0.0
        train_loss_cm = 0.0

        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0

        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()
            self.optimizer_lang1.zero_grad()
            self.optimizer_lang2.zero_grad()
            self.optimizer_cm.zero_grad()

            loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
                batch_group, for_training=True)

            if torch.isnan(loss):
                # if any one of the loss_* terms is nan, loss will be nan
                raise ValueError("nan loss encountered")

            #######
            # lang1
            #######
            loss_lang1.backward()
            train_loss_lang1 += loss_lang1.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_lang1:
                self._learning_rate_scheduler_lang1.step_batch(batch_num_total)
            if self._momentum_scheduler_lang1:
                self._momentum_scheduler_lang1.step_batch(batch_num_total)

            self.optimizer_lang1.step()
            self.optimizer_lang1.zero_grad()

            #######
            # lang2
            #######
            loss_lang2.backward()
            train_loss_lang2 += loss_lang2.item()
            batch_grad_norm = self.rescale_gradients()

            if self._learning_rate_scheduler_lang2:
                self._learning_rate_scheduler_lang2.step_batch(batch_num_total)
            if self._momentum_scheduler_lang2:
                self._momentum_scheduler_lang2.step_batch(batch_num_total)

            self.optimizer_lang2.step()
            self.optimizer_lang2.zero_grad()

            #######
            # cm
            #######
            loss_cm.backward()
            train_loss_cm += loss_cm.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_cm:
                self._learning_rate_scheduler_cm.step_batch(batch_num_total)
            if self._momentum_scheduler_cm:
                self._momentum_scheduler_cm.step_batch(batch_num_total)

            self.optimizer_cm.step()
            self.optimizer_cm.zero_grad()

            train_loss += loss.item()

            # Update the description with the latest metrics
            # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            metrics = self.model.get_metrics(False)
            metrics["loss"] = float(
                train_loss /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["cm_loss"] = float(
                train_loss_cm /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang1_loss"] = float(
                train_loss_lang1 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang2_loss"] = float(
                train_loss_lang2 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang1)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang2)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_cm)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.add_train_scalar("loss/cm_loss_train",
                                                   metrics["cm_loss"])
                self._tensorboard.add_train_scalar("loss/lang1_loss_train",
                                                   metrics["lang1_loss"])
                self._tensorboard.add_train_scalar("loss/lang2_loss_train",
                                                   metrics["lang2_loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics = self.model.get_metrics(reset=True)
        metrics["loss"] = float(
            train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cm_loss"] = float(
            train_loss_cm /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang1_loss"] = float(
            train_loss_lang1 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang2_loss"] = float(
            train_loss_lang2 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
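
This trainer keeps one optimizer per loss term (lang1, lang2, cm) and backpropagates each loss before stepping its own optimizer. The reduced sketch below shows the pattern with two fully independent heads; in the trainer above the losses may share an encoder, so this is only a simplified analogy with made-up modules and data.

import torch
from torch import nn

head_a = nn.Linear(10, 2)
head_b = nn.Linear(10, 2)
opt_a = torch.optim.Adam(head_a.parameters(), lr=1e-3)
opt_b = torch.optim.Adam(head_b.parameters(), lr=1e-3)

x = torch.randn(8, 10)
loss_a = head_a(x).pow(2).mean()
loss_b = head_b(x).pow(2).mean()

# Each optimizer sees only the gradients of its own loss and its own parameters.
opt_a.zero_grad()
loss_a.backward()
opt_a.step()

opt_b.zero_grad()
loss_b.backward()
opt_b.step()
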
Example #21
0
    def train(self, recover: bool = False) -> Dict[str, Any]:

        # 1 train sentiment classifier & private classifier & domain embeddings => init G 50 epoch
        # 2 fix share encoder(+domain embeddings?), train share classifier(cls&real/fake) & others => train D
        # 3 fix share classifier, train share encoder, reverse share classifier input gradient  min loss => train G
        training_start_time = time.time()

        if recover:
            try:
                n_epoch, should_stop = self._restore_checkpoint()
                logger.info("Loaded model from checkpoint. Starting at epoch {}", n_epoch)
            except RuntimeError:
                raise ConfigurationError(
                    "Could not recover training from the checkpoint.  Did you mean to output to "
                    "a different serialization directory or delete the existing serialization "
                    "directory?"
                )
        else:
            n_epoch, should_stop = 0, False

            ### Store all the necessary information and attributes about the tasks ###
            task_infos = {task._name: {} for task in self._task_list}
            for task_idx, task in enumerate(self._task_list):
                task_info = task_infos[task._name]

                # Store statistics on training and validation batches
                data_iterator = task._data_iterator
                n_tr_batches = data_iterator.get_num_batches(task._train_data)
                n_val_batches = data_iterator.get_num_batches(task._validation_data)
                task_info["n_tr_batches"] = n_tr_batches
                task_info["n_val_batches"] = n_val_batches

                # Create a counter for the number of batches trained during the whole
                # training for this specific task
                task_info["total_n_batches_trained"] = 0

                task_info["last_log"] = time.time()  # Time of last logging
            self._task_infos = task_infos

            ### Bookkeeping the validation metrics ###
            metric_infos = {
                task._name: {
                    "val_metric": task._val_metric,
                    "hist": [],
                    "is_out_of_patience": False,
                    "min_lr_hit": False,
                    "best": (-1, {}),
                }
                for task in self._task_list
            }
            self._metric_infos = metric_infos

        ### Write log ###
        total_n_tr_batches = 0  # The total number of training batches across all the datasets.
        for task_name, info in self._task_infos.items():
            total_n_tr_batches += info["n_tr_batches"]
            logger.info("Task {}:", task_name)
            logger.info("\t{} training batches", info["n_tr_batches"])
            logger.info("\t{} validation batches", info["n_val_batches"])

        ### Create the training generators/iterators tqdm ###
        self._tr_generators = {}
        for task in self._task_list:
            data_iterator = task._data_iterator
            tr_generator = data_iterator(task._train_data, num_epochs=None)
            self._tr_generators[task._name] = tr_generator

        ### Create sampling probability distribution ###
        if self._sampling_method == "uniform":
            sampling_prob = [float(1 / self._n_tasks)] * self._n_tasks
        elif self._sampling_method == "proportional":
            sampling_prob = [float(info["n_tr_batches"] / total_n_tr_batches) for info in self._task_infos.values()]

        ### Enable gradient clipping ###
        # Only if self._grad_clipping is specified
        self._enable_gradient_clipping()

        ### Setup is ready. Training of the model can begin ###
        logger.info("Set up ready. Beginning training/validation.")

        avg_accuracies = []
        best_accuracy = 0.0

        ### Begin Training of the model ###
        while not should_stop:
            ### Log Infos: current epoch count and CPU/GPU usage ###
            logger.info("")
            logger.info("Epoch {}/{} - Begin", n_epoch, self._num_epochs - 1)
            logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
            for gpu, memory in gpu_memory_mb().items():
                logger.info(f"GPU {gpu} memory usage MB: {memory}")

            # if n_epoch <= 10:
            #     # init generator
            #     all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob)
            # # train discriminator 3 epochs
            # # elif 10 < n_epoch < 20 or n_epoch % 2 == 0:
            # #     all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob, train_D=True)
            # else:
            # train adversarial generator every 3 epoch
            all_tr_metrics = self._train_epoch(total_n_tr_batches, sampling_prob, reverse=True)

            all_val_metrics, avg_accuracy = self._validation(n_epoch)
            is_best = False
            if best_accuracy < avg_accuracy:
                best_accuracy = avg_accuracy
                logger.info("Best accuracy found --- {}", best_accuracy / self._n_tasks)
                is_best = True

            ### Print all training and validation metrics for this epoch ###
            logger.info("***** Epoch {}/{} Statistics *****", n_epoch, self._num_epochs - 1)
            for task in self._task_list:
                logger.info("Statistic: {}", task._name)
                logger.info(
                    "\tTraining - {}: {:3d}",
                    "Nb batches trained",
                    self._task_infos[task._name]["n_batches_trained_this_epoch"],
                )
                for metric_name, value in all_tr_metrics[task._name].items():
                    logger.info("\tTraining - {}: {:.3f}", metric_name, value)
                for metric_name, value in all_val_metrics[task._name].items():
                    logger.info("\tValidation - {}: {:.3f}", metric_name, value)
            logger.info("***** Average accuracy is {:.6f} *****", avg_accuracy / self._n_tasks)
            avg_accuracies.append(avg_accuracy / self._n_tasks)
            logger.info("**********")

            ### Check to see if should stop ###
            stop_tr, stop_val = True, True

            for task in self._task_list:
                # task_info = self._task_infos[tasks._name]
                if self._optimizers[task._name]['exclude_share_encoder'].param_groups[0]["lr"] < self._min_lr and \
                        self._optimizers[task._name]['exclude_share_discriminator'].param_groups[0][
                            "lr"] < self._min_lr:
                    logger.info("Minimum lr hit on {}.", task._name)
                    logger.info("Task {} vote to stop training.", task._name)
                    metric_infos[task._name]["min_lr_hit"] = True
                stop_tr = stop_tr and self._metric_infos[task._name]["min_lr_hit"]
                stop_val = stop_val and self._metric_infos[task._name]["is_out_of_patience"]

            if stop_tr:
                should_stop = True
                logger.info("All tasks hit minimum lr. Stopping training.")
            if stop_val:
                should_stop = True
                logger.info("All metrics ran out of patience. Stopping training.")
            if n_epoch >= self._num_epochs - 1:
                should_stop = True
                logger.info("Maximum number of epoch hit. Stopping training.")

            self._save_checkpoint(n_epoch, should_stop, is_best)

            ### Update n_epoch ###
            # One epoch = doing N (forward + backward) pass where N is the total number of training batches.
            n_epoch += 1
            self._epoch_trained = n_epoch

        logger.info("Max accuracy is {:.6f}", max(avg_accuracies))

        ### Summarize training at the end ###
        logger.info("***** Training is finished *****")
        logger.info("Stopped training after {} epochs", n_epoch)
        return_metrics = {}
        for task_name, task_info in self._task_infos.items():
            nb_epoch_trained = int(task_info["total_n_batches_trained"] / task_info["n_tr_batches"])
            logger.info(
                "Trained {} for {} batches ~= {} epochs",
                task_name,
                task_info["total_n_batches_trained"],
                nb_epoch_trained,
            )
            return_metrics[task_name] = {
                "best_epoch": self._metric_infos[task_name]["best"][0],
                "nb_epoch_trained": nb_epoch_trained,
                "best_epoch_val_metrics": self._metric_infos[task_name]["best"][1],
            }

        training_elapsed_time = time.time() - training_start_time
        return_metrics["training_duration"] = time.strftime("%d:%H:%M:%S", time.gmtime(training_elapsed_time))
        return_metrics["nb_epoch_trained"] = n_epoch

        return return_metrics
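
The "proportional" sampling above weights each task by its share of training batches, so tasks with more data are visited more often. A short sketch of drawing tasks with those weights follows; the task names and batch counts are invented.

import random

n_tr_batches = {"sst": 800, "imdb": 2400, "yelp": 1600}  # hypothetical tasks
total = sum(n_tr_batches.values())
sampling_prob = [count / total for count in n_tr_batches.values()]
task_names = list(n_tr_batches.keys())

random.seed(0)
draws = random.choices(task_names, weights=sampling_prob, k=10)
print(draws)  # the epoch visits tasks roughly in proportion to their data size
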
Example #22
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        out_of_memory_count = 0
        # Set the model to "train" mode.
        self.model.train()

        # Get tqdm for the training batches
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(
                self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        num_training_batches = len(self.train_loader)

        #TODO: other options for progress bar
        #TODO: subset
        train_generator_tqdm = Tqdm.tqdm(self.train_loader,
                                         total=num_training_batches)
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                batch_num_total % self._histogram_interval == 0)

            self.optimizer.zero_grad()

            try:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                if self.fp16:
                    self.optimizer.backward(loss)
                else:
                    loss.backward()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    torch.cuda.empty_cache()
                    out_of_memory_count += 1
                    if out_of_memory_count > int(num_training_batches * 0.01):
                        raise e
                    # `loss` was never computed for this batch, so skip it
                    continue
                else:
                    raise e

            train_loss += loss.item()
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            #if self._learning_rate_scheduler:
            #    self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7), batch_num_total)
            else:
                self.optimizer.step()

            if self.ema is not None:
                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        param.data = self.ema(name, param.data)

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            #TODO: other options for progress bar
            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                if self._should_log_parameter_statistics:
                    self._parameter_and_gradient_statistics_to_tensorboard(
                        batch_num_total, batch_grad_norm)
                if self._should_log_learning_rate:
                    self._learning_rates_to_tensorboard(batch_num_total)
                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"],
                                                   batch_num_total)
                self._metrics_to_tensorboard(
                    batch_num_total,
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

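                # If an interval evaluator is configured, run a mid-epoch validation
                # pass and checkpoint whenever the tracked validation metric improves.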
                if self.predictor is not None:
                    with torch.no_grad():
                        val_metrics = self.predictor.evaluate(self.model)

                        self._metrics_to_tensorboard(
                            batch_num_total, {
                                "interval_metrics/" + k: v
                                for k, v in val_metrics.items()
                            })

                        this_interval_val_metric = val_metrics[
                            self._validation_metric]

                        is_best_so_far = self._is_best_so_far(
                            this_interval_val_metric,
                            self._validation_metric_per_interval)
                        self._validation_metric_per_interval.append(
                            this_interval_val_metric)
                        if is_best_so_far:
                            self._save_checkpoint(
                                '{0}.{1}'.format(epoch, batch_num_total),
                                self._validation_metric_per_interval,
                                is_best=True)

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total,
                                                histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, time_to_str(int(last_save_time))), [],
                                      is_best=False)

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
Example #23
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

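            # Gradient accumulation: each batch in the group contributes
            # loss / len(batch_group), so the summed gradients match those of one
            # large batch before the single optimizer step below.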
            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Example #24
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

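            # Build a multiple-choice matching batch: for every instance, place the
            # positive example at a random index among num_negative_samples + 1
            # candidates and fill the remaining slots with randomly drawn negatives
            # (text passages or images, depending on self.retrieve_text).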
            images = []
            text = []
            segment_ids = []
            labels = []
            for i in range(len(batch_group[0]['images'])):
                positive_index = random.randint(0, self.num_negative_samples)
                labels.append(positive_index)
                if self.retrieve_text:
                    instance_text = []
                    instance_segment_ids = []
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_text.append(batch_group[0]['token_ids']
                                                 ['tokens'][i, :].tolist())
                            instance_segment_ids.append(
                                batch_group[0]['segment_ids'][i].tolist())
                        else:
                            negative_sample_index = random.choice(
                                self.train_indices)
                            text_field = TextField(
                                self.train_text_db[negative_sample_index],
                                self.train_token_indexers)
                            text_field.index(self.model.vocab)
                            padding_lengths = text_field.get_padding_lengths()
                            instance_text.append(
                                text_field.as_tensor(
                                    padding_lengths=padding_lengths)
                                ['tokens'].tolist())
                            instance_segment_ids.append(
                                self.train_segment_ids_db[
                                    negative_sample_index].tolist())
                    text += instance_text
                    segment_ids += instance_segment_ids
                else:
                    instance_images = [
                        None for _ in range(self.num_negative_samples + 1)
                    ]
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_images[j] = np.expand_dims(
                                batch_group[0]['images'][i].numpy(), 0)
                        else:
                            instance_images[j] = np.expand_dims(
                                random.choice(self.train_image_db), 0)
                    images += instance_images
            matching_label_field_name = "labels"
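            # When retrieving text, right-pad every candidate token sequence (and its
            # segment ids) to the longest one in the batch so they can be stacked into
            # LongTensors; otherwise stack the candidate images into a single tensor.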
            if self.retrieve_text:
                max_text_len = max([len(sequence) for sequence in text])
                text = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in text
                ]
                batch_group[0]['token_ids'] = {
                    'tokens': torch.LongTensor(text)
                }
                segment_ids = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in segment_ids
                ]
                batch_group[0]['segment_ids'] = torch.from_numpy(
                    np.array(segment_ids, dtype=np.int64))
            else:
                batch_group[0]['images'] = torch.from_numpy(np.vstack(images))
            batch_group[0][matching_label_field_name] = torch.from_numpy(
                np.array(labels, dtype=np.int64))
            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Example #25
    def train(self, recover: bool = False):
        """
        Train the different task_list, save the different checkpoints and metrics,
        and save the model at the end of training while logging the training details.
        
        The metrics through the training are stored in dictionaries with the following structure:
        
        all_metrics - Dict[str, str]
            task_name: val_metric

        metric_infos (Dict[])
            task_name (Dict[str, diverse]
                val_metric (str): name (str)
                hist (str): history_of_the_val_metric (List[float])
                stopped (str): training_is_stopped (bool)
                best (str): best_epoch_for_val_metric (Tuple(int, Dict))  

        all_tr_metrics (Dict[str, Dict[str, float]])
            task_name (Dict[str, float])
                metric_name (str): value (float)
                loss: value (float)		

        all_val_metrics (Dict[str, Dict[str, float]])
            task_name (Dict[str, float])
                metric_name (str): value (float)
                loss (str): value (float)
        
        Parameters
        ----------
        task_list: List[Task], required
            A list containing the tasks to train.
        params: Params, required
            Training parameters
        recover: bool, required
            Whether or not training should be recovered from a previous training.

        Returns
        -------
        return_dict: Dict
            A dictionary summarizing the training and the metrics for the best epochs for each task.
        """
        training_start_time = time.time()

        if recover:
            try:
                n_epoch, should_stop = self._restore_checkpoint()
                logger.info(
                    "Loaded model from checkpoint. Starting at epoch %d",
                    n_epoch)
            except RuntimeError:
                raise ConfigurationError(
                    "Could not recover training from the checkpoint.  Did you mean to output to "
                    "a different serialization directory or delete the existing serialization "
                    "directory?")
        else:
            n_epoch, should_stop = 0, False

            ### Store all the necessary information and attributes about the tasks ###
            task_infos = {task._name: {} for task in self._task_list}
            for task_idx, task in enumerate(self._task_list):
                task_info = task_infos[task._name]

                # Store statistics on training and validation batches
                data_iterator = task._data_iterator
                n_tr_batches = data_iterator.get_num_batches(task._train_data)
                n_val_batches = data_iterator.get_num_batches(
                    task._validation_data)
                task_info["n_tr_batches"] = n_tr_batches
                task_info["n_val_batches"] = n_val_batches

                # Create counter for number of batches trained during the whole
                # training for this specific task
                task_info["total_n_batches_trained"] = 0

                task_info["last_log"] = time.time()  # Time of last logging
            self._task_infos = task_infos

            ### Bookkeeping the validation metrics ###
            metric_infos = {
                task._name: {
                    "val_metric": task._val_metric,
                    "hist": [],
                    "is_out_of_patience": False,
                    "min_lr_hit": False,
                    "best": (-1, {}),
                }
                for task in self._task_list
            }
            self._metric_infos = metric_infos

        ### Write log ###
        total_n_tr_batches = 0  # The total number of training batches across all the datasets.
        for task_name, info in self._task_infos.items():
            total_n_tr_batches += info["n_tr_batches"]
            logger.info("Task %s:", task_name)
            logger.info("\t%d training batches", info["n_tr_batches"])
            logger.info("\t%d validation batches", info["n_val_batches"])

        ### Create the training generators/iterators tqdm ###
        self._tr_generators = {}
        for task in self._task_list:
            data_iterator = task._data_iterator
            tr_generator = data_iterator(task._train_data, num_epochs=None)
            self._tr_generators[task._name] = tr_generator

        ### Create sampling probability distribution ###
        if self._sampling_method == "uniform":
            sampling_prob = [float(1 / self._n_tasks)] * self._n_tasks
        elif self._sampling_method == "proportional":
            sampling_prob = [
                float(info["n_tr_batches"] / total_n_tr_batches)
                for info in self._task_infos.values()
            ]

        ### Enable gradient clipping ###
        # Only if self._grad_clipping is specified
        self._enable_gradient_clipping()

        ### Setup is ready. Training of the model can begin ###
        logger.info("Set up ready. Beginning training/validation.")

        ### Begin Training of the model ###
        while not should_stop:
            # Train one epoch (training pass + validation pass)

            self._model.train()  # Set the model to "train" mode.

            ### Log Infos: current epoch count and CPU/GPU usage ###
            logger.info("")
            logger.info("Epoch %d/%d - Begin", n_epoch, self._num_epochs - 1)
            logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
            for gpu, memory in gpu_memory_mb().items():
                logger.info(f"GPU {gpu} memory usage MB: {memory}")

            logger.info("Training - Begin")

            ### Reset training and trained batches counter before new training epoch ###
            for _, task_info in self._task_infos.items():
                task_info["tr_loss_cum"] = 0.0
                task_info["n_batches_trained_this_epoch"] = 0
            all_tr_metrics = {}  # Per-task training metrics accumulated over this epoch

            ### Start training epoch ###
            epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches),
                                   total=total_n_tr_batches)
            for _ in epoch_tqdm:
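                # Draw the task to train on for this step from the sampling
                # distribution (uniform or proportional to each task's dataset size).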
                task_idx = np.argmax(np.random.multinomial(1, sampling_prob))
                task = self._task_list[task_idx]
                task_info = self._task_infos[task._name]

                ### One forward + backward pass ###

                # Call next batch to train
                batch = next(self._tr_generators[task._name])
                task_info["n_batches_trained_this_epoch"] += 1

                # Load optimizer
                optimizer = self._optimizers[task._name]
                optimizer.zero_grad()

                # Get the loss for this batch
                output_dict = self._forward(tensor_batch=batch,
                                            task=task,
                                            for_training=True)
                assert "loss" in output_dict, "Model must return a dict containing a 'loss' key"
                loss = output_dict["loss"]
                loss.backward()
                task_info["tr_loss_cum"] += loss.item()

                # Gradient rescaling if self._grad_norm is specified
                self._rescale_gradients()

                # Take an optimization step
                optimizer.step()

                ### Get metrics for all progress so far, update tqdm, display description ###
                task_metrics = self._get_metrics(task=task)
                task_metrics["loss"] = float(
                    task_info["tr_loss_cum"] /
                    (task_info["n_batches_trained_this_epoch"] + 0.000_001))
                description = self._description_from_metrics(task_metrics)
                epoch_tqdm.set_description(task._name + ", " + description)

                ### Tensorboard logging: Training detailed metrics, parameters and gradients ###
                if self._global_step % self._summary_interval == 0:
                    # Metrics
                    for metric_name, value in task_metrics.items():
                        self._tensorboard.add_train_scalar(
                            name="training_details/" + task._name + "/" +
                            metric_name,
                            value=value,
                            global_step=self._global_step,
                        )
                    # Parameters and Gradients
                    for param_name, param in self._model.named_parameters():
                        if self._log_parameter_statistics:
                            self._tensorboard.add_train_scalar(
                                name="parameter_mean/" + param_name,
                                value=param.data.mean(),
                                global_step=self._global_step,
                            )
                            self._tensorboard.add_train_scalar(
                                name="parameter_std/" + param_name,
                                value=param.data.std(),
                                global_step=self._global_step,
                            )
                        if param.grad is None:
                            continue
                        if self._log_gradient_statistics:
                            self._tensorboard.add_train_scalar(
                                name="grad_mean/" + param_name,
                                value=param.grad.data.mean(),
                                global_step=self._global_step,
                            )
                            self._tensorboard.add_train_scalar(
                                name="grad_std/" + param_name,
                                value=param.grad.data.std(),
                                global_step=self._global_step,
                            )
                self._global_step += 1

            ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ###
            for task in self._task_list:
                task_info = self._task_infos[task._name]

                task_info["total_n_batches_trained"] += task_info[
                    "n_batches_trained_this_epoch"]
                task_info["last_log"] = time.time()

                task_metrics = self._get_metrics(task=task, reset=True)
                if task._name not in all_tr_metrics:
                    all_tr_metrics[task._name] = {}
                for name, value in task_metrics.items():
                    all_tr_metrics[task._name][name] = value
                all_tr_metrics[task._name]["loss"] = float(
                    task_info["tr_loss_cum"] /
                    (task_info["n_batches_trained_this_epoch"] + 0.000_000_01))

                # Tensorboard - Training metrics for this epoch
                self._tensorboard.add_train_scalar(
                    name="training_proportions/" + task._name,
                    value=task_info["n_batches_trained_this_epoch"],
                    global_step=n_epoch,
                )
                for metric_name, value in all_tr_metrics[task._name].items():
                    self._tensorboard.add_train_scalar(
                        name="task_" + task._name + "/" + metric_name,
                        value=value,
                        global_step=n_epoch)

            logger.info("Train - End")

            ### Begin validation of the model ###
            logger.info("Validation - Begin")
            all_val_metrics = {}

            self._model.eval()  # Set the model into evaluation mode

            for task_idx, task in enumerate(self._task_list):
                logger.info("Validation - Task %d/%d: %s", task_idx + 1,
                            self._n_tasks, task._name)

                val_loss = 0.0
                n_batches_val_this_epoch_this_task = 0
                n_val_batches = self._task_infos[task._name]["n_val_batches"]
                scheduler = self._schedulers[task._name]

                # Create tqdm generator for current task's validation
                data_iterator = task._data_iterator
                val_generator = data_iterator(task._validation_data,
                                              num_epochs=1,
                                              shuffle=False)
                val_generator_tqdm = tqdm.tqdm(val_generator,
                                               total=n_val_batches)

                # Iterate over each validation batch for this task
                for batch in val_generator_tqdm:
                    n_batches_val_this_epoch_this_task += 1

                    # Get the loss
                    val_output_dict = self._forward(batch,
                                                    task=task,
                                                    for_training=False)
                    loss = val_output_dict["loss"]
                    val_loss += loss.item()

                    # Get metrics for all progress so far, update tqdm, display description
                    task_metrics = self._get_metrics(task=task)
                    task_metrics["loss"] = float(
                        val_loss / n_batches_val_this_epoch_this_task)
                    description = self._description_from_metrics(task_metrics)
                    val_generator_tqdm.set_description(description)

                # Get task validation metrics and store them in all_val_metrics
                task_metrics = self._get_metrics(task=task, reset=True)
                if task._name not in all_val_metrics:
                    all_val_metrics[task._name] = {}
                for name, value in task_metrics.items():
                    all_val_metrics[task._name][name] = value
                all_val_metrics[task._name]["loss"] = float(
                    val_loss / n_batches_val_this_epoch_this_task)

                # Tensorboard - Validation metrics for this epoch
                for metric_name, value in all_val_metrics[task._name].items():
                    self._tensorboard.add_validation_scalar(
                        name="task_" + task._name + "/" + metric_name,
                        value=value,
                        global_step=n_epoch)

                ### Perform a patience check and update the history of validation metric for this task ###
                this_epoch_val_metric = all_val_metrics[task._name][
                    task._val_metric]
                metric_history = self._metric_infos[task._name]["hist"]

                metric_history.append(this_epoch_val_metric)
                is_best_so_far, out_of_patience = self._check_history(
                    metric_history=metric_history,
                    cur_score=this_epoch_val_metric,
                    should_decrease=task._val_metric_decreases,
                )

                if is_best_so_far:
                    logger.info("Best model found for %s.", task._name)
                    self._metric_infos[task._name]["best"] = (n_epoch,
                                                              all_val_metrics)
                if out_of_patience and not self._metric_infos[
                        task._name]["is_out_of_patience"]:
                    self._metric_infos[task._name]["is_out_of_patience"] = True
                    logger.info(
                        "Task %s is out of patience and vote to stop the training.",
                        task._name)

                # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
                # if it doesn't, the validation metric passed here is ignored.
                scheduler.step(this_epoch_val_metric, n_epoch)

            logger.info("Validation - End")

            ### Print all training and validation metrics for this epoch ###
            logger.info("***** Epoch %d/%d Statistics *****", n_epoch,
                        self._num_epochs - 1)
            for task in self._task_list:
                logger.info("Statistic: %s", task._name)
                logger.info(
                    "\tTraining - %s: %3d",
                    "Nb batches trained",
                    self._task_infos[task._name]
                    ["n_batches_trained_this_epoch"],
                )
                for metric_name, value in all_tr_metrics[task._name].items():
                    logger.info("\tTraining - %s: %3f", metric_name, value)
                for metric_name, value in all_val_metrics[task._name].items():
                    logger.info("\tValidation - %s: %3f", metric_name, value)
            logger.info("**********")

            ### Check to see if should stop ###
            stop_tr, stop_val = True, True

            for task in self._task_list:
                # task_info = self._task_infos[task._name]
                if self._optimizers[
                        task._name].param_groups[0]["lr"] < self._min_lr:
                    logger.info("Minimum lr hit on %s.", task._name)
                    logger.info("Task %s vote to stop training.", task._name)
                    self._metric_infos[task._name]["min_lr_hit"] = True
                stop_tr = stop_tr and self._metric_infos[
                    task._name]["min_lr_hit"]
                stop_val = stop_val and self._metric_infos[
                    task._name]["is_out_of_patience"]

            if stop_tr:
                should_stop = True
                logging.info("All tasks hit minimum lr. Stopping training.")
            if stop_val:
                should_stop = True
                logging.info(
                    "All metrics ran out of patience. Stopping training.")
            if n_epoch >= self._num_epochs - 1:
                should_stop = True
                logging.info("Maximum number of epoch hit. Stopping training.")

            self._save_checkpoint(n_epoch, should_stop)

            ### Update n_epoch ###
            # One epoch = doing N (forward + backward) pass where N is the total number of training batches.
            n_epoch += 1

        ### Summarize training at the end ###
        logging.info("***** Training is finished *****")
        logging.info("Stopped training after %d epochs", n_epoch)
        return_metrics = {}
        for task_name, task_info in self._task_infos.items():
            nb_epoch_trained = int(task_info["total_n_batches_trained"] /
                                   task_info["n_tr_batches"])
            logging.info(
                "Trained %s for %d batches ~= %d epochs",
                task_name,
                task_info["total_n_batches_trained"],
                nb_epoch_trained,
            )
            return_metrics[task_name] = {
                "best_epoch": self._metric_infos[task_name]["best"][0],
                "nb_epoch_trained": nb_epoch_trained,
                "best_epoch_val_metrics":
                self._metric_infos[task_name]["best"][1],
            }

        training_elapsed_time = time.time() - training_start_time
        return_metrics["training_duration"] = time.strftime(
            "%d:%H:%M:%S", time.gmtime(training_elapsed_time))
        return_metrics["nb_epoch_trained"] = n_epoch

        return return_metrics
Example #26
File: trainer.py  Project: pyknife/allennlp
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
Example #27
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Example #28
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_memory_mb().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage MB: {memory}")
        gpu_memory_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0

        if regularization_penalty is not None:
            train_reg_loss = 0.0
            batch_reg_loss = 0.0
        else:
            train_reg_loss = None
            batch_reg_loss = None
        # Set the model to "train" mode.
        self.model_engine.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps)
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        batch_group_generator_tqdm = batch_group_generator
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
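                # Run the forward pass under autocast so eligible ops execute in
                # reduced precision when self._use_amp is enabled.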
                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch,
                                                       for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs.get("loss")
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss = loss.item()
                    train_loss += batch_loss
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss

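                # The wrapped engine (a DeepSpeed-style engine, presumably) owns
                # backward() and step() so that loss scaling and gradient
                # accumulation are handled internally.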
                self.model_engine.backward(loss)
                self.model_engine.step()

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }

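                # With torch.cuda.amp, step through the GradScaler so gradients are
                # unscaled and checked for infs/NaNs before the optimizer update.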
                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if self._master:
                # Update tqdm only on the master; the other workers don't have a progress bar.
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(
                    self.model,
                    self.optimizer,
                    0.0,  # batch_grad_norm is not computed by this trainer
                    metrics,
                    batch_group,
                    param_updates,
                )

                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)

            for callback in self._batch_callbacks:
                callback(
                    self,
                    batch_group,
                    batch_group_outputs,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_master=self._master,
                )

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
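
The loop above divides each micro-batch loss by len(batch_group) and takes a single optimizer step per group, so the grouping produced by common_util.lazy_groups_of is what implements gradient accumulation. As a rough, standalone sketch (an assumption about the helper's behavior, not the trainer's actual code), lazy_groups_of can be pictured as:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    """Lazily yield lists of at most `group_size` consecutive items."""
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

# With 7 batches and 3 gradient-accumulation steps the groups are [0, 1, 2],
# [3, 4, 5] and [6]; each group corresponds to exactly one optimizer step.
for group in lazy_groups_of(range(7), 3):
    print(group)
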
Example #29
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())


        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            self.model.train()
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
                )
            if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
                if self._validation_data is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, num_batches = self._validation_loss()
                        val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True)

                        # Check validation metric for early stopping
                        this_epoch_val_metric = val_metrics[self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.is_best_so_far():
                            metrics['best_batch'] = self._batch_num_total
                            for key, value in val_metrics.items():
                                metrics["best_validation_" + key] = value
                            self._metric_tracker.best_epoch_metrics = val_metrics

                        self._save_checkpoint(self._batch_num_total)

                        if self.callbacks is not None:
                            for callback in self.callbacks:
                                callback.on_batch_end(self._batch_num_total)

        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics
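
The TensorBoard histogram branch above snapshots every parameter on the CPU before optimizer.step() and subtracts the post-step values, logging the update-to-parameter norm ratio under "gradient_update/" + name. A minimal standalone sketch of that pattern (the function name is a placeholder, not part of this trainer) is:

import torch

def step_and_measure_update_ratios(model: torch.nn.Module,
                                   optimizer: torch.optim.Optimizer) -> dict:
    """Take one optimizer step and return ||update|| / (||param|| + 1e-7) per parameter."""
    # Snapshot the parameters on the CPU first so large GPU models don't run out of memory.
    before = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
    optimizer.step()
    ratios = {}
    for name, p in model.named_parameters():
        param_cpu = p.detach().cpu()
        update_norm = torch.norm(before[name] - param_cpu)
        param_norm = torch.norm(param_cpu)
        ratios[name] = (update_norm / (param_norm + 1e-7)).item()
    return ratios
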
Example #30
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(
            raw_train_generator,
            num_gpus * self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            (num_gpus * self._num_gradient_accumulation_steps))
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            # Add the model graph to TensorBoard once, tracing a deep copy of the model.
            if not self._graph_added and self._require_graph:
                model_copy = deepcopy(self.model)
                model_copy.log_graph()
                wrapped_model = ModelWrapper(model_copy)
                graph_inputs = wrapped_model.process_inputs(batch_group[0])
                self._tensorboard.add_graph(wrapped_model, [graph_inputs])
                self._graph_added = True

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            # Process the accumulated group in chunks of `num_gpus` batches; the
            # floor division means a trailing partial chunk is skipped.
            num_batch = len(batch_group) // num_gpus
            for i in range(num_batch):
                batch_i = batch_group[i * num_gpus:(i + 1) * num_gpus]

                loss = self.batch_loss(batch_i, for_training=True)
                if loss is None or torch.isnan(loss):
                    logger.warning("nan loss encountered; skipping this chunk")
                    continue
                # Scale by the number of chunks so the accumulated gradients
                # approximate a single step over the whole batch group.
                loss = loss / num_batch
                loss.backward()

                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
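
Example #30 implements gradient accumulation by hand: the iterator yields num_gpus * self._num_gradient_accumulation_steps batches per group, the loop slices each group into chunks of num_gpus batches, and every chunk's loss is divided by the number of chunks before backward(). A standalone sketch of that slicing (placeholder names and sizes; unlike the floor-division loop above, this version keeps a trailing partial chunk):

from typing import List, Sequence, TypeVar

T = TypeVar("T")

def split_into_gpu_chunks(batch_group: Sequence[T], num_gpus: int) -> List[List[T]]:
    """Split a flat batch group into consecutive chunks of `num_gpus` batches."""
    return [list(batch_group[i:i + num_gpus])
            for i in range(0, len(batch_group), num_gpus)]

# With 2 GPUs and 3 accumulation steps the iterator yields 6 batches per group:
chunks = split_into_gpu_chunks(list(range(6)), num_gpus=2)
print(chunks)  # [[0, 1], [2, 3], [4, 5]] -- one backward per chunk, one optimizer step per group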