def __init__(self, model, optimizer, scheduler, criterion, evalate, device, config):
    """Bundle the training components; set up AMP scaling and TensorBoard logging.

    Args:
        model: Network to train; moved onto ``device`` here.
        optimizer: Optimizer for the model parameters.
        scheduler: Learning-rate scheduler stepped by the training loop.
        criterion: Loss function.
        evalate: Metric tracker (name kept as spelled by the caller).
        device: Torch device for the model and batches.
        config: Experiment configuration; ``config.f_name`` names the log dir.
    """
    self.device = device
    self.config = config
    # Move the model to the target device once, up front.
    self.model = model.to(device)
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.criterion = criterion
    self.evalate = evalate
    # Gradient scaler for mixed-precision training.
    self.scaler = amp.GradScaler()
    # Running averages of the train / validation losses.
    self.t_losses = AverageMeter()
    self.v_losses = AverageMeter()
    self.writer = SummaryWriter(log_dir="./logs/" + config.f_name)
    self.iter = 0
def __init__(self, logger_args, start_epoch, num_epochs, batch_size, dataset_len, device, normalization='imagenet'):
    """Initialize the training logger.

    Delegates shared setup to the base logger, then records the print and
    visualization cadences and creates the loss meters.
    """
    super(TrainLogger, self).__init__(logger_args, start_epoch, num_epochs,
                                      batch_size, dataset_len, device,
                                      is_training=True,
                                      normalization=normalization)
    # Cadences are counted in examples seen, so they must land exactly on
    # batch boundaries.
    assert logger_args.iters_per_print % batch_size == 0, "iters_per_print must be divisible by batch_size"
    assert logger_args.iters_per_visual % batch_size == 0, "iters_per_visual must be divisible by batch_size"
    self.num_epochs = num_epochs
    self.experiment_name = logger_args.name
    self.max_eval = logger_args.max_eval
    self.iters_per_print = logger_args.iters_per_print
    self.iters_per_visual = logger_args.iters_per_visual
    # Meters for the unweighted and weighted batch losses.
    self.loss_meter = AverageMeter()
    self.w_loss_meter = AverageMeter()
def __init__(self, parameters, optim_args, batch_size, iters_per_print, iters_per_visual, iters_per_eval, dataset_len, logger=None):
    """Store hyperparameters, reset counters, and build the optimizer/scheduler."""
    # Optimizer hyperparameters; the name string is replaced by the actual
    # optimizer instance inside set_optimizer().
    self.optimizer = optim_args.optimizer
    self.lr = optim_args.lr
    self.weight_decay = optim_args.weight_decay
    self.sgd_momentum = optim_args.sgd_momentum
    self.sgd_dampening = optim_args.sgd_dampening
    # Scheduler hyperparameters.
    self.lr_scheduler_name = optim_args.lr_scheduler
    self.lr_decay_step = optim_args.lr_decay_step
    self.lr_patience = optim_args.lr_patience
    self.lr_step = 0
    # Epoch bookkeeping.
    self.num_epochs = optim_args.num_epochs
    self.epoch = optim_args.start_epoch
    self.batch_size = batch_size
    self.dataset_len = dataset_len
    # Current iteration in epoch (i.e., # examples seen in the current epoch).
    self.iter = 0
    # Current iteration overall (i.e., total # of examples seen).
    self.global_step = round_down((self.epoch - 1) * dataset_len, batch_size)
    self.iter_start_time = None
    self.epoch_start_time = None
    # Logging cadences, counted in examples.
    self.iters_per_print = iters_per_print
    self.iters_per_visual = iters_per_visual
    self.iters_per_eval = iters_per_eval
    self.logger = logger
    self.loss_meter = AverageMeter()
    self.set_optimizer(parameters)
    self.set_scheduler()
class Optimizer(object):
    """Wrapper around a torch optimizer and an optional LR scheduler.

    Tracks epoch/iteration counters, logs batch statistics through ``logger``,
    and exposes thin pass-throughs (``zero_grad``/``step``/``state_dict``) so
    training loops only talk to this object.
    """

    def __init__(self, parameters, optim_args, batch_size, iters_per_print,
                 iters_per_visual, iters_per_eval, dataset_len, logger=None):
        """Build the optimizer and scheduler from argument namespaces.

        Args:
            parameters: Iterator of network parameters to optimize.
            optim_args: Namespace of optimizer/scheduler hyperparameters.
            batch_size: Number of examples per batch.
            iters_per_print: Examples between log prints.
            iters_per_visual: Examples between visualizations.
            iters_per_eval: Examples between evaluations.
            dataset_len: Number of examples in one epoch.
            logger: Logger used by log_iter/start_epoch/end_epoch.
        """
        # ``self.optimizer`` starts out as the optimizer *name* and is replaced
        # by the real optim.Optimizer instance in set_optimizer().
        self.optimizer = optim_args.optimizer
        self.lr = optim_args.lr
        self.lr_scheduler_name = optim_args.lr_scheduler
        self.sgd_momentum = optim_args.sgd_momentum
        self.weight_decay = optim_args.weight_decay
        self.sgd_dampening = optim_args.sgd_dampening
        self.lr_step = 0
        self.lr_decay_step = optim_args.lr_decay_step
        self.lr_patience = optim_args.lr_patience
        # BUG FIX: set_scheduler() reads these two attributes, but they were
        # never assigned, so any 'step'/'multi_step'/'plateau' scheduler raised
        # AttributeError. Assumes optim_args carries them — TODO confirm against
        # the argument parser.
        self.lr_decay_gamma = optim_args.lr_decay_gamma
        self.lr_milestones = optim_args.lr_milestones
        self.num_epochs = optim_args.num_epochs
        self.epoch = optim_args.start_epoch
        self.batch_size = batch_size
        self.dataset_len = dataset_len
        self.loss_meter = AverageMeter()
        # Current iteration in epoch
        # (i.e., # examples seen in the current epoch)
        self.iter = 0
        # Current iteration overall (i.e., total # of examples seen)
        self.global_step = round_down((self.epoch - 1) * dataset_len, batch_size)
        self.iter_start_time = None
        self.epoch_start_time = None
        self.iters_per_print = iters_per_print
        self.iters_per_visual = iters_per_visual
        self.iters_per_eval = iters_per_eval
        self.logger = logger
        self.set_optimizer(parameters)
        self.set_scheduler()

    def load_optimizer(self, ckpt_path, gpu_ids):
        """Load optimizer and LR scheduler state from disk.

        Args:
            ckpt_path: Path to checkpoint to load.
            gpu_ids: GPU IDs for loading the state dict.
        """
        device = f'cuda:{gpu_ids[0]}' if len(gpu_ids) > 0 else 'cpu'
        ckpt_dict = torch.load(ckpt_path, map_location=device)
        self.optimizer.load_state_dict(ckpt_dict['optimizer'])
        if self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(ckpt_dict['lr_scheduler'])

    def set_optimizer(self, parameters):
        """Replace the optimizer name with a real PyTorch optimizer.

        Args:
            parameters: Iterator of network parameters to optimize
                (i.e., model.parameters()).

        Raises:
            ValueError: If the configured optimizer name is not 'sgd' or 'adam'.
        """
        if self.optimizer == 'sgd':
            self.optimizer = optim.SGD(parameters, self.lr,
                                       momentum=self.sgd_momentum,
                                       weight_decay=self.weight_decay,
                                       dampening=self.sgd_dampening)
        elif self.optimizer == 'adam':
            self.optimizer = optim.Adam(parameters, self.lr,
                                        betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay)
        else:
            raise ValueError(f'Unsupported optimizer: {self.optimizer}')

    def set_scheduler(self):
        """Set the PyTorch scheduler which updates the learning rate
        for the optimizer.

        Raises:
            ValueError: If the configured scheduler name is unknown.
        """
        if self.lr_scheduler_name is None:
            self.lr_scheduler = None
        elif self.lr_scheduler_name == 'step':
            self.lr_scheduler = optim.lr_scheduler.StepLR(
                self.optimizer,
                step_size=self.lr_decay_step,
                gamma=self.lr_decay_gamma)
        elif self.lr_scheduler_name == 'multi_step':
            self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
                self.optimizer,
                milestones=self.lr_milestones,
                gamma=self.lr_decay_gamma)
        elif self.lr_scheduler_name == 'plateau':
            # Floor each param group's LR at 1/1000 of its starting value.
            self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                factor=self.lr_decay_gamma,
                patience=self.lr_patience,
                min_lr=[pg['lr'] * 1e-3 for pg in self.optimizer.param_groups])
        else:
            raise ValueError('Invalid learning rate scheduler: ' +
                             f'{self.lr_scheduler_name}.')

    def step_scheduler(self, metric_value):
        """Step the LR scheduler (no-op when no scheduler is configured).

        Args:
            metric_value: Metric value to determine the best checkpoint
                (used only by ReduceLROnPlateau).
        """
        if self.lr_scheduler is not None:
            self.lr_step += 1
            # NOTE(review): passing epoch= to step() is deprecated in recent
            # PyTorch; kept as-is to preserve existing behavior.
            if isinstance(self.lr_scheduler,
                          optim.lr_scheduler.ReduceLROnPlateau):
                self.lr_scheduler.step(metric_value, epoch=self.lr_step)
            else:
                self.lr_scheduler.step(epoch=self.lr_step)

    def is_finished_training(self):
        """Return True if finished training, otherwise return False."""
        return 0 < self.num_epochs < self.epoch

    def start_epoch(self):
        """Reset the per-epoch counters and log the epoch start."""
        self.epoch_start_time = time()
        self.iter = 0
        self.logger.log(f'[start of epoch {self.epoch}]')

    def end_epoch(self, metrics, phase='valid'):
        """Log epoch timing, LR, and metrics, then advance the epoch counter.

        Args:
            metrics: Dictionary of metric values for the finished epoch.
            phase: Phase label used when logging the scalars.
        """
        epoch_time = time() - self.epoch_start_time
        lr = self.optimizer.param_groups[0]['lr']
        self.logger.log(f'[end of epoch {self.epoch}, epoch time: ' +
                        f'{epoch_time:.2g}, lr: {lr}]')
        self.logger.log_scalars(metrics, self.global_step, phase=phase,
                                print_to_stdout=True)
        self.epoch += 1

    def start_iter(self):
        """Log info for start of an iteration."""
        self.iter_start_time = time()

    def end_iter(self):
        """Log info for end of an iteration."""
        self.iter += self.batch_size
        self.global_step += self.batch_size

    def zero_grad(self):
        """Pass-through to the wrapped optimizer."""
        self.optimizer.zero_grad()

    def step(self):
        """Pass-through to the wrapped optimizer."""
        self.optimizer.step()

    def state_dict(self):
        """Pass-through to the wrapped optimizer."""
        return self.optimizer.state_dict()

    def log_iter(self, inputs, logits, targets, unweighted_loss, phase='train'):
        """Log results from a training iteration."""
        loss = unweighted_loss.item()
        self.loss_meter.update(loss, inputs.size(0))
        # Periodically write to the log and TensorBoard
        if self.iter % self.iters_per_print == 0:
            # Write a header for the log entry
            avg_time = (time() - self.iter_start_time) / self.batch_size
            message = (f'[epoch: {self.epoch}, ' +
                       f'iter: {self.iter} / {self.dataset_len}, ' +
                       f'time: {avg_time:.2f}, ' +
                       f'loss: {self.loss_meter.avg:.3g}]')
            # Write all errors as scalars to the graph
            batch_lr = self.optimizer.param_groups[0]['lr']
            self.logger.log_scalars({'batch_lr': batch_lr}, self.global_step,
                                    phase=phase, print_to_stdout=False)
            self.logger.log_scalars({'batch_loss': self.loss_meter.avg},
                                    self.global_step, phase=phase,
                                    print_to_stdout=False)
            self.loss_meter.reset()
            self.logger.log(message)
class Trainer:
    """Mixed-precision training/validation loop for a segmentation model.

    Logs per-epoch losses and mIoU to stdout, a text log, and TensorBoard,
    and checkpoints the best model (by mIoU) plus the final model.
    """

    def __init__(self, model, optimizer, scheduler, criterion, evalate, device, config):
        """Store components, move the model to ``device``, set up AMP and TensorBoard."""
        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        # Gradient scaler for mixed-precision training.
        self.scaler = amp.GradScaler()
        self.device = device
        self.config = config
        self.evalate = evalate
        # Running averages of train / validation losses.
        self.t_losses = AverageMeter()
        self.v_losses = AverageMeter()
        self.writer = SummaryWriter(log_dir="./logs/" + config.f_name)
        self.iter = 0

    def fit(self, train_dl, val_dl):
        """Train for config.n_epochs, validating each epoch; save best and last weights."""
        best_eval = 0.0
        for i in range(self.config.n_epochs):
            start = time.time()
            self.train_one_epoch(train_dl)
            self.validation(val_dl)
            metric = self.evalate.get_scores()[0]['mIoU']
            self.log(
                f'[RESULT]: Epoch: {i+1}, train_loss: {self.t_losses.avg:.4f}, val_loss: {self.v_losses.avg:.5f}, mIoU: {metric:.6f}, time: {(time.time() - start):.3f}'
            )
            self.writer.add_scalar('train_loss', round(self.t_losses.avg, 5), i + 1)
            self.writer.add_scalar('val_loss', round(self.v_losses.avg, 5), i + 1)
            self.writer.add_scalar('mIoU', round(metric, 6), i + 1)
            # Checkpoint whenever validation mIoU improves.
            if best_eval < metric:
                best_eval = metric
                self.save(epoch=i + 1, mIoU=best_eval)
        self.save(epoch=self.config.n_epochs, mIoU=metric, last=True)
        self.writer.close()

    def train_one_epoch(self, train_dl):
        """Run one optimization pass over train_dl under autocast + GradScaler."""
        self.model.train()
        self.t_losses.reset()
        for img, target in train_dl:
            self.optimizer.zero_grad()
            with amp.autocast(enabled=True):
                # BUG FIX: was img.to(device) / target.to(device), relying on a
                # module-level global; use the device stored on the trainer.
                pred = self.model(img.to(self.device))
                loss = self.criterion(pred, target.to(self.device).long())
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.scheduler.step()
            self.t_losses.update(loss.item(), self.config.train_batch_size)
            if (self.iter % 10) == 0:
                print(f'iter : {self.iter}')
            self.iter += 1

    def validation(self, val_dl):
        """Evaluate on val_dl, updating the metric tracker and validation loss meter."""
        self.model.eval()
        self.evalate.reset()
        self.v_losses.reset()
        # BUG FIX: evaluation previously ran with autograd enabled; disable
        # gradient tracking to save memory and time.
        with torch.no_grad():
            for img, target in val_dl:
                with amp.autocast(enabled=True):
                    # BUG FIX: was img.to(device) / target.to(device) with a
                    # global `device`; use self.device.
                    pred = self.model(img.to(self.device))
                    loss = self.criterion(pred, target.to(self.device).long())
                pred = pred['out'].cpu()
                # NOTE(review): for a batch of size 1, squeeze(0) drops the
                # batch dim so argmax runs over the wrong axis — confirm
                # val_batch_size > 1. Kept as in the original.
                pred = torch.argmax(pred.squeeze(0), dim=1, keepdim=True).squeeze(1).numpy()
                target = target.cpu().numpy()
                self.evalate.update(pred=pred, gt=target)
                self.v_losses.update(loss.item(), self.config.val_batch_size)

    def save(self, epoch, mIoU, last=False):
        """Write a checkpoint; suffix '_last.bin' when last=True, else '_best.bin'."""
        l_or_b = '_last.bin' if last else '_best.bin'
        torch.save(
            {
                'model_state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'scheduler': self.scheduler.state_dict(),
                'epoch': epoch,
                'mIoU': mIoU,
            },
            # BUG FIX: was the bare global `config`; use the stored config.
            self.config.weight_path + l_or_b)

    def log(self, message):
        """Print the message (when verbose) and append it to the run's text log."""
        if self.config.verbose:
            print(message)
        # BUG FIX: was the bare global `config`; use the stored config.
        with open(self.config.log_path + '.txt', mode='a') as logger:
            logger.write(f'{message}\n')
class TrainLogger(BaseLogger):
    """Class for logging training info to the console and saving model parameters to disk."""

    def __init__(self, logger_args, start_epoch, num_epochs, batch_size,
                 dataset_len, device):
        """Set up the training logger on top of the base logger."""
        super(TrainLogger, self).__init__(logger_args, start_epoch, num_epochs,
                                          batch_size, dataset_len, device,
                                          is_training=True)
        # Cadences are counted in examples seen, so they must land exactly on
        # batch boundaries.
        assert logger_args.iters_per_print % batch_size == 0, "iters_per_print must be divisible by batch_size"
        assert logger_args.iters_per_visual % batch_size == 0, "iters_per_visual must be divisible by batch_size"
        self.iters_per_print = logger_args.iters_per_print
        self.iters_per_visual = logger_args.iters_per_visual
        self.experiment_name = logger_args.name
        self.max_eval = logger_args.max_eval
        self.num_epochs = num_epochs
        # Meters for the unweighted and weighted batch losses.
        self.loss_meter = AverageMeter()
        self.w_loss_meter = AverageMeter()

    def start_iter(self):
        """Log info for start of an iteration."""
        self.iter_start_time = time()

    def plot_metrics(self, metrics):
        """Plot a dictionary of metrics to TensorBoard."""
        if metrics is not None:
            self._log_scalars(metrics)

    def log_iter(self, inputs, logits, targets, unweighted_loss, weighted_loss, optimizer):
        """Log results from a training iteration."""
        loss = unweighted_loss.item()
        # BUG FIX: `if weighted_loss` truth-tested the tensor itself, silently
        # dropping a legitimate 0-valued loss (and raising for multi-element
        # tensors); compare against None explicitly.
        w_loss = weighted_loss.item() if weighted_loss is not None else None
        self.loss_meter.update(loss, inputs.size(0))
        if w_loss is not None:
            self.w_loss_meter.update(w_loss, inputs.size(0))
        # Periodically write to the log and TensorBoard
        if self.iter % self.iters_per_print == 0:
            # Write a header for the log entry
            avg_time = (time() - self.iter_start_time) / self.batch_size
            # BUG FIX: the weighted-loss spec was '{:3g}' (minimum width 3);
            # use '{:.3g}' (3 significant digits) to match the loss field.
            message = '[epoch: {}, iter: {} / {}, time: {:.2f}, loss: {:.3g}, wloss {:.3g}]' \
                .format(self.epoch, self.iter, self.dataset_len, avg_time,
                        self.loss_meter.avg, self.w_loss_meter.avg)
            # Write all errors as scalars to the graph
            self._log_scalars({'batch_lr': optimizer.param_groups[0]['lr']},
                              print_to_stdout=False)
            self._log_scalars({'batch_loss': self.loss_meter.avg},
                              print_to_stdout=False)
            self._log_scalars({'batch_wloss': self.w_loss_meter.avg},
                              print_to_stdout=False)
            self.loss_meter.reset()
            self.w_loss_meter.reset()
            self.write(message)
        # Periodically visualize up to num_visuals training examples from the batch
        if self.iter % self.iters_per_visual == 0:
            self.visualize(inputs, logits, targets, phase='train')

    def end_iter(self):
        """Log info for end of an iteration."""
        self.iter += self.batch_size
        self.global_step += self.batch_size

    def start_epoch(self):
        """Log info for start of an epoch."""
        self.epoch_start_time = time()
        self.iter = 0
        self.write('[start of epoch {}]'.format(self.epoch))

    def end_epoch(self, metrics, optimizer):
        """Log info for end of an epoch.

        Args:
            metrics: Dictionary of metric values. Items have format
                '{phase}_{metric}': value.
            optimizer: Optimizer for the model.
        """
        self.write('[end of epoch {}, epoch time: {:.2g}, lr: {}]'.format(
            self.epoch, time() - self.epoch_start_time,
            optimizer.param_groups[0]['lr']))
        if metrics is not None:
            self._log_scalars(metrics)
        self.epoch += 1

    def is_finished_training(self):
        """Return True if finished training, otherwise return False."""
        return 0 < self.num_epochs < self.epoch