Example #1
    def __init__(self, model, optimizer, scheduler, criterion, evalate, device,
                 config):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.scaler = amp.GradScaler()
        self.device = device
        self.config = config
        self.evalate = evalate
        self.t_losses = AverageMeter()
        self.v_losses = AverageMeter()
        self.writer = SummaryWriter(log_dir="./logs/" + config.f_name)
        self.iter = 0
Example #2
    def __init__(self, logger_args, start_epoch, num_epochs, batch_size, dataset_len, device, normalization='imagenet'):
        super(TrainLogger, self).__init__(logger_args, start_epoch, num_epochs, batch_size, dataset_len,
            device, is_training=True, normalization=normalization)

        assert logger_args.iters_per_print % batch_size == 0, "iters_per_print must be divisible by batch_size"
        assert logger_args.iters_per_visual % batch_size == 0, "iters_per_visual must be divisible by batch_size"

        self.iters_per_print = logger_args.iters_per_print
        self.iters_per_visual = logger_args.iters_per_visual
        self.experiment_name = logger_args.name
        self.max_eval = logger_args.max_eval
        self.num_epochs = num_epochs
        self.loss_meter = AverageMeter()
        self.w_loss_meter = AverageMeter()
Example #3
    def __init__(self,
                 parameters,
                 optim_args,
                 batch_size,
                 iters_per_print,
                 iters_per_visual,
                 iters_per_eval,
                 dataset_len,
                 logger=None):

        self.optimizer = optim_args.optimizer
        self.lr = optim_args.lr
        self.lr_scheduler_name = optim_args.lr_scheduler
        self.sgd_momentum = optim_args.sgd_momentum
        self.weight_decay = optim_args.weight_decay
        self.sgd_dampening = optim_args.sgd_dampening
        self.lr_step = 0
        self.lr_decay_step = optim_args.lr_decay_step
        self.lr_patience = optim_args.lr_patience
        self.num_epochs = optim_args.num_epochs
        self.epoch = optim_args.start_epoch
        self.batch_size = batch_size
        self.dataset_len = dataset_len
        self.loss_meter = AverageMeter()

        # Current iteration in epoch
        # (i.e., # examples seen in the current epoch)
        self.iter = 0
        # Current iteration overall (i.e., total # of examples seen)
        self.global_step = round_down((self.epoch - 1) * dataset_len,
                                      batch_size)
        self.iter_start_time = None
        self.epoch_start_time = None

        self.iters_per_print = iters_per_print
        self.iters_per_visual = iters_per_visual
        self.iters_per_eval = iters_per_eval

        self.logger = logger

        self.set_optimizer(parameters)
        self.set_scheduler()
Example #4
class Optimizer(object):
    def __init__(self,
                 parameters,
                 optim_args,
                 batch_size,
                 iters_per_print,
                 iters_per_visual,
                 iters_per_eval,
                 dataset_len,
                 logger=None):

        self.optimizer = optim_args.optimizer
        self.lr = optim_args.lr
        self.lr_scheduler_name = optim_args.lr_scheduler
        self.sgd_momentum = optim_args.sgd_momentum
        self.weight_decay = optim_args.weight_decay
        self.sgd_dampening = optim_args.sgd_dampening
        self.lr_step = 0
        self.lr_decay_step = optim_args.lr_decay_step
        # Assumed to come from optim_args as well; set_scheduler() below reads
        # self.lr_decay_gamma and self.lr_milestones, so they must be set here.
        self.lr_decay_gamma = optim_args.lr_decay_gamma
        self.lr_milestones = optim_args.lr_milestones
        self.lr_patience = optim_args.lr_patience
        self.num_epochs = optim_args.num_epochs
        self.epoch = optim_args.start_epoch
        self.batch_size = batch_size
        self.dataset_len = dataset_len
        self.loss_meter = AverageMeter()

        # Current iteration in epoch
        # (i.e., # examples seen in the current epoch)
        self.iter = 0
        # Current iteration overall (i.e., total # of examples seen)
        self.global_step = round_down((self.epoch - 1) * dataset_len,
                                      batch_size)
        self.iter_start_time = None
        self.epoch_start_time = None

        self.iters_per_print = iters_per_print
        self.iters_per_visual = iters_per_visual
        self.iters_per_eval = iters_per_eval

        self.logger = logger

        self.set_optimizer(parameters)
        self.set_scheduler()

    def load_optimizer(self, ckpt_path, gpu_ids):
        """Load optimizer and LR scheduler state from disk.

        Args:
            ckpt_path: Path to checkpoint to load.
            gpu_ids: GPU IDs for loading the state dict.
        """
        device = f'cuda:{gpu_ids[0]}' if len(gpu_ids) > 0 else 'cpu'
        ckpt_dict = torch.load(ckpt_path, map_location=device)
        self.optimizer.load_state_dict(ckpt_dict['optimizer'])
        if self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(ckpt_dict['lr_scheduler'])

    def set_optimizer(self, parameters):
        """Set `self.optimizer` to the PyTorch optimizer specified by optim_args.

        Args:
            parameters: Iterator of network parameters to
                        optimize (i.e., model.parameters()).
        """
        if self.optimizer == 'sgd':
            self.optimizer = optim.SGD(parameters,
                                       self.lr,
                                       momentum=self.sgd_momentum,
                                       weight_decay=self.weight_decay,
                                       dampening=self.sgd_dampening)
        elif self.optimizer == 'adam':
            self.optimizer = optim.Adam(parameters,
                                        self.lr,
                                        betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay)
        else:
            raise ValueError(f'Unsupported optimizer: {self.optimizer}')

    def set_scheduler(self):
        """Set the PyTorch scheduler which updates the learning rate
        for the optimizer."""
        if self.lr_scheduler_name is None:
            self.lr_scheduler = None
        elif self.lr_scheduler_name == 'step':
            self.lr_scheduler =\
                optim.lr_scheduler.StepLR(self.optimizer,
                                          step_size=self.lr_decay_step,
                                          gamma=self.lr_decay_gamma)
        elif self.lr_scheduler_name == 'multi_step':
            self.lr_scheduler =\
                optim.lr_scheduler.MultiStepLR(self.optimizer,
                                               milestones=self.lr_milestones,
                                               gamma=self.lr_decay_gamma)
        elif self.lr_scheduler_name == 'plateau':
            self.lr_scheduler =\
                optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                     factor=self.lr_decay_gamma,
                                                     patience=self.lr_patience,
                                                     min_lr=[pg['lr'] * 1e-3
                                                             for pg in self.optimizer.param_groups])
        else:
            raise ValueError('Invalid learning rate scheduler: ' +
                             f'{self.lr_scheduler_name}.')

    def step_scheduler(self, metric_value):
        """Step a LR scheduler.

        Args:
            metric_value: Metric value to determine the best checkpoint.
        """
        if self.lr_scheduler is not None:
            self.lr_step += 1

            if isinstance(self.lr_scheduler,
                          optim.lr_scheduler.ReduceLROnPlateau):
                self.lr_scheduler.step(metric_value, epoch=self.lr_step)
            else:
                self.lr_scheduler.step(epoch=self.lr_step)

    def is_finished_training(self):
        """Return True if finished training, otherwise return False."""
        return 0 < self.num_epochs < self.epoch

    def start_epoch(self):
        self.epoch_start_time = time()
        self.iter = 0
        self.logger.log(f'[start of epoch {self.epoch}]')

    def end_epoch(self, metrics, phase='valid'):
        epoch_time = time() - self.epoch_start_time
        lr = self.optimizer.param_groups[0]['lr']
        self.logger.log(f'[end of epoch {self.epoch}, epoch time: ' +
                        f'{epoch_time:.2g}, lr: {lr}]')

        self.logger.log_scalars(metrics,
                                self.global_step,
                                phase=phase,
                                print_to_stdout=True)

        self.epoch += 1

    def start_iter(self):
        """Log info for start of an iteration."""
        self.iter_start_time = time()

    def end_iter(self):
        """Log info for end of an iteration."""
        self.iter += self.batch_size
        self.global_step += self.batch_size

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def state_dict(self):
        return self.optimizer.state_dict()

    def log_iter(self,
                 inputs,
                 logits,
                 targets,
                 unweighted_loss,
                 phase='train'):
        """Log results from a training iteration."""
        loss = unweighted_loss.item()

        self.loss_meter.update(loss, inputs.size(0))

        # Periodically write to the log and TensorBoard
        if self.iter % self.iters_per_print == 0:

            # Write a header for the log entry
            avg_time = (time() - self.iter_start_time) / self.batch_size
            message = (f'[epoch: {self.epoch}, ' +
                       f'iter: {self.iter} / {self.dataset_len}, ' +
                       f'time: {avg_time:.2f}, ' +
                       f'loss: {self.loss_meter.avg:.3g}]')

            # Write all errors as scalars to the graph
            batch_lr = self.optimizer.param_groups[0]['lr']
            self.logger.log_scalars({'batch_lr': batch_lr},
                                    self.global_step,
                                    phase=phase,
                                    print_to_stdout=False)
            self.logger.log_scalars({'batch_loss': self.loss_meter.avg},
                                    self.global_step,
                                    phase=phase,
                                    print_to_stdout=False)
            self.loss_meter.reset()

            self.logger.log(message)
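
The Optimizer class above is only an excerpt, so here is a minimal usage sketch showing the intended calling order (start_epoch / start_iter / zero_grad / step / log_iter / end_iter / step_scheduler / end_epoch). Everything outside the Optimizer API itself, including the toy model, data, PrintLogger stand-in and the optim_args values, is invented for illustration; the sketch also assumes the class's own module-level imports (torch.optim as optim, from time import time) and the AverageMeter / round_down helpers (see the sketches at the end of this section) are in scope.

# Usage sketch (placeholders throughout): a stand-in logger exposing only the
# log() / log_scalars() calls that Optimizer makes.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from argparse import Namespace

class PrintLogger:
    def log(self, msg):
        print(msg)

    def log_scalars(self, scalars, global_step, phase='train', print_to_stdout=False):
        if print_to_stdout:
            print(phase, global_step, scalars)

# All optim_args fields read in Optimizer.__init__, with illustrative values.
optim_args = Namespace(optimizer='sgd', lr=0.01, lr_scheduler='step',
                       sgd_momentum=0.9, weight_decay=1e-4, sgd_dampening=0.0,
                       lr_decay_step=10, lr_decay_gamma=0.1, lr_milestones=[20, 40],
                       lr_patience=3, num_epochs=3, start_epoch=1)

# Toy model and data so the loop below actually runs.
model = nn.Linear(8, 2)
criterion = nn.CrossEntropyLoss()
dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
train_loader = DataLoader(dataset, batch_size=16)

opt = Optimizer(model.parameters(), optim_args,
                batch_size=16, iters_per_print=16, iters_per_visual=64,
                iters_per_eval=64, dataset_len=len(dataset),
                logger=PrintLogger())

while not opt.is_finished_training():
    opt.start_epoch()
    for inputs, targets in train_loader:
        opt.start_iter()
        opt.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        opt.step()
        opt.log_iter(inputs, logits, targets, loss)
        opt.end_iter()
    # StepLR ignores the metric; it matters only for the 'plateau' scheduler.
    opt.step_scheduler(metric_value=loss.item())
    opt.end_epoch({'train_loss': loss.item()}, phase='train')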
Example #5
class Trainer:
    def __init__(self, model, optimizer, scheduler, criterion, evalate, device,
                 config):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.scaler = amp.GradScaler()
        self.device = device
        self.config = config
        self.evalate = evalate
        self.t_losses = AverageMeter()
        self.v_losses = AverageMeter()
        self.writer = SummaryWriter(log_dir="./logs/" + config.f_name)
        self.iter = 0

    def fit(self, train_dl, val_dl):
        best_eval = 0.0
        for i in range(self.config.n_epochs):
            start = time.time()
            self.train_one_epoch(train_dl)
            self.validation(val_dl)
            #lr = self.optimizer.param_groups[0]['lr']
            metric = self.evalate.get_scores()[0]['mIoU']
            self.log(
                f'[RESULT]: Epoch: {i+1}, train_loss: {self.t_losses.avg:.4f}, val_loss: {self.v_losses.avg:.5f}, mIoU: {metric:.6f}, time: {(time.time() - start):.3f}'
            )
            self.writer.add_scalar('train_loss', round(self.t_losses.avg, 5),
                                   i + 1)
            self.writer.add_scalar('val_loss', round(self.v_losses.avg, 5),
                                   i + 1)
            self.writer.add_scalar('mIoU', round(metric, 6), i + 1)

            if best_eval < metric:
                best_eval = metric
                self.save(epoch=i + 1, mIoU=best_eval)
        self.save(epoch=self.config.n_epochs, mIoU=metric, last=True)
        self.writer.close()

    def train_one_epoch(self, train_dl):
        self.model.train()
        self.t_losses.reset()
        for img, target in train_dl:
            self.optimizer.zero_grad()
            with amp.autocast(enabled=True):
                pred = self.model(img.to(self.device))
                loss = self.criterion(pred, target.to(self.device).long())
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.scheduler.step()
            self.t_losses.update(loss.item(), self.config.train_batch_size)

            if (self.iter % 10) == 0:
                print(f'iter : {self.iter}')
            self.iter += 1

    def validation(self, val_dl):
        self.model.eval()
        self.evalate.reset()
        self.v_losses.reset()
        for img, target in val_dl:
            with amp.autocast(enabled=True):
                pred = self.model(img.to(self.device))
                loss = self.criterion(pred, target.to(self.device).long())
            pred = pred['out'].cpu()
            pred = torch.argmax(pred.squeeze(0), dim=1,
                                keepdim=True).squeeze(1).numpy()
            target = target.cpu().numpy()
            self.evalate.update(pred=pred, gt=target)
            self.v_losses.update(loss.item(), self.config.val_batch_size)

    def save(self, epoch, mIoU, last=False):
        if last:
            l_or_b = '_last.bin'
        else:
            l_or_b = '_best.bin'
        torch.save(
            {
                'model_state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'scheduler': self.scheduler.state_dict(),
                'epoch': epoch,
                'mIoU': mIoU,
            }, self.config.weight_path + l_or_b)

    def log(self, message):
        if self.config.verbose:
            print(message)
        with open(self.config.log_path + '.txt', mode='a') as logger:
            logger.write(f'{message}\n')
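
A hedged usage sketch for the Trainer above, assuming the class and its module-level imports (time, torch.cuda.amp as amp, SummaryWriter, AverageMeter) are already in scope. The config values, toy data and StubEvaluator below are placeholders for illustration; any evaluator exposing reset(), update(pred=..., gt=...) and get_scores()[0]['mIoU'] fits the interface the Trainer expects.

# Usage sketch with invented config values and toy data.
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision.models.segmentation import deeplabv3_resnet50
from types import SimpleNamespace

config = SimpleNamespace(n_epochs=2, train_batch_size=2, val_batch_size=2,
                         f_name='run1', weight_path='./run1',
                         log_path='./run1', verbose=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = deeplabv3_resnet50(num_classes=21)

def criterion(pred, target):
    # The model returns {'out': logits}; validation() also indexes pred['out'],
    # so the loss reads the same key.
    return nn.functional.cross_entropy(pred['out'], target)

optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Toy data: 4 RGB images with dense 21-class masks.
images = torch.randn(4, 3, 64, 64)
masks = torch.randint(0, 21, (4, 64, 64))
train_dl = DataLoader(TensorDataset(images, masks), batch_size=config.train_batch_size)
val_dl = DataLoader(TensorDataset(images, masks), batch_size=config.val_batch_size)

# scheduler.step() runs once per batch in train_one_epoch(), so T_max is
# measured in iterations, not epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=config.n_epochs * len(train_dl))

class StubEvaluator:
    """Stand-in for the segmentation evaluator used as `evalate`."""
    def reset(self):
        pass

    def update(self, pred, gt):
        pass

    def get_scores(self):
        # Element 0 must be a dict with 'mIoU', mirroring evalate.get_scores()[0]['mIoU'].
        return {'mIoU': 0.0}, None

trainer = Trainer(model, optimizer, scheduler, criterion, evalate=StubEvaluator(),
                  device=device, config=config)
trainer.fit(train_dl, val_dl)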
Example #6
class TrainLogger(BaseLogger):
    """Class for logging training info to the console and saving model parameters to disk."""
    def __init__(self, logger_args, start_epoch, num_epochs, batch_size,
                 dataset_len, device):
        super(TrainLogger, self).__init__(logger_args,
                                          start_epoch,
                                          num_epochs,
                                          batch_size,
                                          dataset_len,
                                          device,
                                          is_training=True)

        assert logger_args.iters_per_print % batch_size == 0, "iters_per_print must be divisible by batch_size"
        assert logger_args.iters_per_visual % batch_size == 0, "iters_per_visual must be divisible by batch_size"

        self.iters_per_print = logger_args.iters_per_print
        self.iters_per_visual = logger_args.iters_per_visual
        self.experiment_name = logger_args.name
        self.max_eval = logger_args.max_eval
        self.num_epochs = num_epochs
        self.loss_meter = AverageMeter()
        self.w_loss_meter = AverageMeter()

    def start_iter(self):
        """Log info for start of an iteration."""
        self.iter_start_time = time()

    def plot_metrics(self, metrics):
        """Plot a dictionary of metrics to TensorBoard."""
        if metrics is not None:
            self._log_scalars(metrics)

    def log_iter(self, inputs, logits, targets, unweighted_loss, weighted_loss,
                 optimizer):
        """Log results from a training iteration."""
        loss = unweighted_loss.item()
        w_loss = weighted_loss.item() if weighted_loss is not None else None

        self.loss_meter.update(loss, inputs.size(0))
        if w_loss is not None:
            self.w_loss_meter.update(w_loss, inputs.size(0))

        # Periodically write to the log and TensorBoard
        if self.iter % self.iters_per_print == 0:

            # Write a header for the log entry
            avg_time = (time() - self.iter_start_time) / self.batch_size
            message = '[epoch: {}, iter: {} / {}, time: {:.2f}, loss: {:.3g}, wloss: {:.3g}]' \
                .format(self.epoch, self.iter, self.dataset_len, avg_time, self.loss_meter.avg, self.w_loss_meter.avg)

            # Write all errors as scalars to the graph
            self._log_scalars({'batch_lr': optimizer.param_groups[0]['lr']},
                              print_to_stdout=False)
            self._log_scalars({'batch_loss': self.loss_meter.avg},
                              print_to_stdout=False)
            self._log_scalars({'batch_wloss': self.w_loss_meter.avg},
                              print_to_stdout=False)
            self.loss_meter.reset()
            self.w_loss_meter.reset()

            self.write(message)

        # Periodically visualize up to num_visuals training examples from the batch
        if self.iter % self.iters_per_visual == 0:
            self.visualize(inputs, logits, targets, phase='train')

    def end_iter(self):
        """Log info for end of an iteration."""
        self.iter += self.batch_size
        self.global_step += self.batch_size

    def start_epoch(self):
        """Log info for start of an epoch."""
        self.epoch_start_time = time()
        self.iter = 0
        self.write('[start of epoch {}]'.format(self.epoch))

    def end_epoch(self, metrics, optimizer):
        """Log info for end of an epoch.

        Args:
            metrics: Dictionary of metric values. Items have format '{phase}_{metric}': value.
            optimizer: Optimizer for the model.
        """
        self.write('[end of epoch {}, epoch time: {:.2g}, lr: {}]'.format(
            self.epoch,
            time() - self.epoch_start_time, optimizer.param_groups[0]['lr']))
        if metrics is not None:
            self._log_scalars(metrics)

        self.epoch += 1

    def is_finished_training(self):
        """Return True if finished training, otherwise return False."""
        return 0 < self.num_epochs < self.epoch
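
Every example in this section depends on an AverageMeter utility, and Examples #3 and #4 also call round_down; neither is shown. The sketches below implement only the interface the examples actually use (update(val, n), reset(), the .avg attribute, and rounding a count down to a multiple of batch_size); the real project implementations may differ in detail.

class AverageMeter:
    """Tracks a running average; only the interface used above is sketched."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def round_down(x, m):
    """Round x down to the nearest multiple of m (as used for global_step)."""
    return (x // m) * m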