def train(train_loader, model, optimizer, lr_scheduler, epoch, writer):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # switch to train mode
    model.train()

    world_size = args.world_size
    rank = args.rank

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        lr_scheduler.update(i, epoch)
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input.cuda())
        target_var = torch.autograd.Variable(target)

        # compute output: the model returns the loss directly in training mode
        loss = model(input_var, target_var, extract_mode=False) / world_size

        # all-reduce a copy of the loss across ranks for logging
        reduced_loss = loss.data.clone()
        dist.all_reduce_multigpu([reduced_loss])
        losses.update(reduced_loss.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        average_gradients(model)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and rank == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses))

            niter = epoch * len(train_loader) + i
            writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], niter)
            writer.add_scalar('Train/Avg_Loss', losses.avg, niter)
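
# Both variants of train() call an average_gradients() helper that is not shown
# in this section. The sketch below is an assumption, following the standard
# PyTorch distributed pattern: sum every parameter's gradient across ranks with
# an all-reduce. Since the loss above is already divided by world_size, the
# summed gradients equal the average over ranks. The real helper in this
# codebase may differ in detail.
def average_gradients(model):
    """All-reduce (sum) each parameter's gradient across all ranks."""
    for param in model.parameters():
        if param.grad is not None:
            # the default reduction op for all_reduce is SUM
            dist.all_reduce(param.grad.data)
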
# Variant with an external criterion plus top-1 / top-5 accuracy tracking.
def train(train_loader, model, criterion, optimizer, lr_scheduler, epoch, writer):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    world_size = args.world_size
    rank = args.rank

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        lr_scheduler.update(i, epoch)
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input.cuda())
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        # all-reduce copies of the metrics across ranks for logging
        reduced_loss = loss.data.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size
        dist.all_reduce_multigpu([reduced_loss])
        dist.all_reduce_multigpu([reduced_prec1])
        dist.all_reduce_multigpu([reduced_prec5])

        losses.update(reduced_loss.item(), input.size(0))
        top1.update(reduced_prec1.item(), input.size(0))
        top5.update(reduced_prec5.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        average_gradients(model)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and rank == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))

            niter = epoch * len(train_loader) + i
            writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], niter)
            writer.add_scalar('Train/Avg_Loss', losses.avg, niter)
            writer.add_scalar('Train/Avg_Top1', top1.avg / 100.0, niter)
            writer.add_scalar('Train/Avg_Top5', top5.avg / 100.0, niter)
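
# The AverageMeter and accuracy helpers used above are not defined in this
# section. The sketches below assume the conventional implementations from the
# PyTorch ImageNet example; the actual definitions in this repository may vary.
class AverageMeter(object):
    """Tracks the most recent value and a running average."""
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k, in percent."""
    maxk = max(topk)
    batch_size = target.size(0)

    # top-k predicted class indices per sample, shape (maxk, batch_size)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
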