# --- Single-process versions ---

import datetime
import time

import torch
import torch.distributed as dist
from tqdm import tqdm

import lib  # project-local module providing Metric (assumed to exist in the repo)

# accuracy, adjust_learning_rate, save_model and dist_save_model are assumed
# to be defined elsewhere in the repo.

best_val_acc = 0.0  # best validation accuracy seen so far, updated by val()


def val(model, val_loader, criterion, epoch, args, log_writer=False):
    global best_val_acc
    model.eval()
    val_loss = lib.Metric('val_loss')
    val_accuracy = lib.Metric('val_accuracy')
    # epoch == -1 marks a final, post-training evaluation.
    if epoch == -1:
        epoch = args.epochs - 1
    with tqdm(total=len(val_loader),
              desc='Validate Epoch #{}'.format(epoch + 1)) as t:
        with torch.no_grad():
            for data, target in val_loader:
                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                val_loss.update(criterion(output, target))
                val_accuracy.update(accuracy(output, target))
                t.update(1)
    print("\nloss: {}, accuracy: {:.2f}, best acc: {:.2f}\n".format(
        val_loss.avg.item(), 100. * val_accuracy.avg.item(),
        100. * max(best_val_acc, val_accuracy.avg)))
    # Checkpoint whenever validation accuracy improves.
    if val_accuracy.avg > best_val_acc and log_writer:
        save_model(model, None, -1, args)
    if log_writer:
        log_writer.add_scalar('val/loss', val_loss.avg, epoch)
        log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)
        best_val_acc = max(best_val_acc, val_accuracy.avg)
        log_writer.add_scalar('val/best_acc', best_val_acc, epoch)
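# The helpers above are assumed to live elsewhere in the repo. Below is a
# minimal sketch of what lib.Metric and accuracy likely look like, inferred
# from how they are used here (running average via .update()/.avg, top-1
# accuracy); names and implementations are assumptions, not the repo's code.


class Metric:
    """Running average of a scalar tensor (sketch of the assumed lib.Metric)."""

    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.0)
        self.n = torch.tensor(0.0)

    def update(self, val):
        # Detach and move to CPU so logged values don't hold the autograd graph.
        self.sum += val.detach().cpu()
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n


def accuracy(output, target):
    """Top-1 accuracy for one batch (sketch matching the call sites above)."""
    pred = output.max(1, keepdim=True)[1]
    return pred.eq(target.view_as(pred)).float().mean()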
def train(model, train_loader, optimizer, criterion, epoch, log_writer, args):
    train_loss = lib.Metric('train_loss')
    train_accuracy = lib.Metric('train_accuracy')
    model.train()
    N = len(train_loader)
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        lr_cur = adjust_learning_rate(args, optimizer, epoch, batch_idx, N,
                                      type=args.lr_scheduler)
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss.update(loss)
        train_accuracy.update(accuracy(output, target))
        # Print progress every 20 batches.
        if (batch_idx + 1) % 20 == 0:
            memory = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
            used_time = time.time() - start_time
            eta = used_time / (batch_idx + 1) * (N - batch_idx)
            eta = str(datetime.timedelta(seconds=int(eta)))
            training_state = ' '.join(
                ['Epoch: {}', '[{} / {}]', 'eta: {}', 'lr: {:.9f}',
                 'max_mem: {:.0f}', 'loss: {:.3f}', 'accuracy: {:.3f}'])
            training_state = training_state.format(
                epoch + 1, batch_idx + 1, N, eta, lr_cur, memory,
                train_loss.avg.item(), 100. * train_accuracy.avg.item())
            print(training_state)
    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
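# adjust_learning_rate is another assumed repo helper. A plausible sketch,
# assuming args.lr_scheduler selects between cosine and step decay and that
# args carries base_lr and epochs fields (all of this is an assumption, not
# the repo's actual schedule):

import math


def adjust_learning_rate(args, optimizer, epoch, batch_idx, num_batches,
                         type='cosine'):
    """Set the per-batch learning rate and return it (sketch)."""
    # Fraction of total training completed, at per-batch granularity.
    progress = (epoch + batch_idx / num_batches) / args.epochs
    if type == 'cosine':
        lr = 0.5 * args.base_lr * (1.0 + math.cos(math.pi * progress))
    else:
        # Step decay every 30 epochs, as in common ImageNet recipes.
        lr = args.base_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr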
# --- Distributed versions (one process per GPU) ---


def val(model, val_loader, val_sampler, criterion, epoch, args,
        log_writer=False, verbose=False):
    global best_val_acc
    model.eval()
    val_loss = lib.Metric('val_loss')
    val_accuracy = lib.Metric('val_accuracy')
    # epoch == -1 marks a final, post-training evaluation.
    if epoch == -1:
        epoch = args.epochs
    if args.distributed:
        val_sampler.set_epoch(epoch)
    with tqdm(total=len(val_loader),
              desc='Validate Epoch #{}'.format(epoch + 1)) as t:
        with torch.no_grad():
            for data, target in val_loader:
                if args.cuda:
                    data = data.cuda(args.gpu, non_blocking=True)
                    target = target.cuda(args.gpu, non_blocking=True)
                output = model(data)
                loss = criterion(output, target)
                dist.all_reduce(loss)
                pred = output.max(1, keepdim=True)[1]
                acc = pred.eq(target.view_as(pred)).float().mean()
                dist.all_reduce(acc)
                # all_reduce sums across ranks; divide to recover the mean
                # (assumes a single node, so world size == ngpus_per_node).
                val_loss.update(loss * 1.0 / args.ngpus_per_node)
                val_accuracy.update(acc * 1.0 / args.ngpus_per_node)
                t.update(1)
    if verbose:
        print("\nloss: {}, accuracy: {:.2f}, best acc: {:.2f}\n".format(
            val_loss.avg.item(), 100. * val_accuracy.avg.item(),
            100. * max(best_val_acc, val_accuracy.avg)))
    # Checkpoint whenever validation accuracy improves.
    if val_accuracy.avg > best_val_acc and log_writer:
        dist_save_model(model, None, -1, args.ngpus_per_node, args)
    if verbose and log_writer:
        log_writer.add_scalar('val/loss', val_loss.avg, epoch)
        log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)
        best_val_acc = max(best_val_acc, val_accuracy.avg)
        log_writer.add_scalar('val/best_acc', best_val_acc, epoch)
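# dist.all_reduce sums a tensor in place across all ranks, which is why the
# code above divides by args.ngpus_per_node to recover a mean. That divisor
# is only correct on a single node; a helper that also works for multi-node
# runs could divide by the world size instead (a sketch, not the repo's code):


def all_reduce_mean(tensor):
    """Average a tensor across all ranks in place (sketch)."""
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    tensor /= dist.get_world_size()
    return tensor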
def train(model, train_sampler, train_loader, optimizer, criterion, epoch,
          log_writer, args, verbose):
    train_loss = lib.Metric('train_loss')
    train_accuracy = lib.Metric('train_accuracy')
    model.train()
    # Reshuffle the sampler each epoch so every process sees a fresh shard.
    if args.distributed:
        train_sampler.set_epoch(epoch)
    N = len(train_loader)
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        lr_cur = adjust_learning_rate(args, optimizer, epoch, batch_idx, N,
                                      type=args.lr_scheduler)
        if args.cuda:
            data = data.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Aggregate loss/accuracy across ranks for logging (sum, then divide).
        dist.all_reduce(loss)
        pred = output.max(1, keepdim=True)[1]
        acc = pred.eq(target.view_as(pred)).float().mean()
        dist.all_reduce(acc)
        train_loss.update(loss * 1.0 / args.ngpus_per_node)
        train_accuracy.update(acc.cpu() * 1.0 / args.ngpus_per_node)
        # Print progress every 20 batches, on the verbose (rank-0) process only.
        if (batch_idx + 1) % 20 == 0 and verbose:
            memory = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
            used_time = time.time() - start_time
            eta = used_time / (batch_idx + 1) * (N - batch_idx)
            eta = str(datetime.timedelta(seconds=int(eta)))
            training_state = ' '.join(
                ['Epoch: {}', '[{} / {}]', 'eta: {}', 'lr: {:.9f}',
                 'max_mem: {:.0f}', 'loss: {:.3f}', 'accuracy: {:.3f}'])
            training_state = training_state.format(
                epoch + 1, batch_idx + 1, N, eta, lr_cur, memory,
                train_loss.avg.item(), 100. * train_accuracy.avg.item())
            print(training_state)
    if log_writer and verbose:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
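# A minimal sketch of how the distributed train/val pair above might be driven
# from each worker process. Everything here is illustrative: the args field
# names (dist_url, batch_size, workers, log_dir, base_lr) and the build_model /
# build_datasets helpers are assumptions, not the repo's actual entry point.

from torch.utils.tensorboard import SummaryWriter


def run_worker(gpu, args):
    args.gpu = gpu
    rank = gpu  # single-node assumption: rank == local GPU index
    dist.init_process_group(backend='nccl', init_method=args.dist_url,
                            world_size=args.ngpus_per_node, rank=rank)
    torch.cuda.set_device(gpu)

    model = build_model(args).cuda(gpu)  # assumed helper
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    criterion = torch.nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.base_lr,
                                momentum=0.9, weight_decay=1e-4)

    train_set, val_set = build_datasets(args)  # assumed helper
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=args.batch_size, sampler=train_sampler,
        num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=args.batch_size, sampler=val_sampler,
        num_workers=args.workers, pin_memory=True)

    verbose = rank == 0  # only rank 0 prints and logs
    log_writer = SummaryWriter(args.log_dir) if verbose else False
    for epoch in range(args.epochs):
        train(model, train_sampler, train_loader, optimizer, criterion,
              epoch, log_writer, args, verbose)
        val(model, val_loader, val_sampler, criterion, epoch, args,
            log_writer, verbose)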