import time

import torch
from tqdm import tqdm


def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer):
    """Train for one epoch with gradient accumulation: the optimizer steps and the
    LR scheduler advances once every `configs.subdivisions` iterations."""
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_dataloader), [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs))
    num_iters_per_epoch = len(train_dataloader)
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, batch_data in enumerate(tqdm(train_dataloader)):
        data_time.update(time.time() - start_time)
        _, imgs, targets = batch_data
        global_step = num_iters_per_epoch * (epoch - 1) + batch_idx + 1
        batch_size = imgs.size(0)
        targets = targets.to(configs.device, non_blocking=True)
        imgs = imgs.to(configs.device, non_blocking=True)
        total_loss, outputs = model(imgs, targets)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # compute gradient and perform backpropagation
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            # Adjust learning rate
            lr_scheduler.step()
            # zero the parameter gradients
            optimizer.zero_grad()

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time
        # torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)

        if tb_writer is not None:
            if (global_step % configs.tensorboard_freq) == 0:
                tensorboard_log = get_tensorboard_log(model)
                tensorboard_log['lr'] = lr_scheduler.get_lr()[0] * configs.batch_size * configs.subdivisions
                tensorboard_log['avg_loss'] = losses.avg
                tb_writer.add_scalars('Train', tensorboard_log, global_step)

        # Log message
        if logger is not None:
            if (global_step % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()
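# The reduce_tensor and to_python_float helpers called in both train_one_epoch
# variants are not defined in this section. The definitions below are an assumption,
# following the common NVIDIA Apex / DistributedDataParallel idiom (sum-all-reduce,
# then average over world_size); the repo's actual implementations may differ.
import torch.distributed as dist


def reduce_tensor(tensor, world_size):
    # Average a (loss) tensor across all distributed processes.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt


def to_python_float(t):
    # Convert a 0-dim tensor (or a 1-element tensor) to a plain Python float for logging.
    if hasattr(t, 'item'):
        return t.item()
    return t[0]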
def train_one_epoch(train_loader, model, optimizer, epoch, configs, logger):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_loader), [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs))
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, (resized_imgs, org_ball_pos_xy, global_ball_pos_xy, target_events, target_seg) in enumerate(
            tqdm(train_loader)):
        data_time.update(time.time() - start_time)
        batch_size = resized_imgs.size(0)
        target_seg = target_seg.to(configs.device, non_blocking=True)
        resized_imgs = resized_imgs.to(configs.device, non_blocking=True).float()
        pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
            resized_imgs, org_ball_pos_xy, global_ball_pos_xy, target_events, target_seg)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # zero the parameter gradients
        optimizer.zero_grad()
        # compute gradient and perform backpropagation
        total_loss.backward()
        optimizer.step()

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time
        torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)

        # Log message
        if logger is not None:
            if ((batch_idx + 1) % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()

    return losses.avg
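# Both loops also assume AverageMeter / ProgressMeter helpers in the style of the
# official PyTorch ImageNet example. The sketch below is consistent with how they are
# used above; get_message() is assumed to be a variant of the ImageNet example's
# display() that returns the formatted string instead of printing it.


class AverageMeter(object):
    """Computes and stores the current value, running sum, and average."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    """Formats progress lines such as 'Train - Epoch: [1/30][ 10/500] Time ... Loss ...'."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def get_message(self, batch_idx):
        entries = [self.prefix + self.batch_fmtstr.format(batch_idx)]
        entries += [str(meter) for meter in self.meters]
        return '\t'.join(entries)

    @staticmethod
    def _get_batch_fmtstr(num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'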