import datetime
import time

import torch
import torch.distributed as dist
from tqdm import tqdm

import lib  # project-local module providing lib.Metric

# Other project-local helpers assumed in scope: accuracy, save_model,
# dist_save_model, and adjust_learning_rate.
best_val_acc = 0.


def val(model, val_loader, criterion, epoch, args, log_writer=False):
    global best_val_acc
    model.eval()
    val_loss = lib.Metric('val_loss')
    val_accuracy = lib.Metric('val_accuracy')

    if epoch == -1:
        epoch = args.epochs - 1

    with tqdm(total=len(val_loader),
              desc='Validate Epoch #{}'.format(epoch + 1)) as t:
        with torch.no_grad():
            for data, target in val_loader:
                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)

                val_loss.update(criterion(output, target))
                val_accuracy.update(accuracy(output, target))
                t.update(1)

    print("\nloss: {}, accuracy: {:.2f}, best acc: {:.2f}\n".format(val_loss.avg.item(), 100. * val_accuracy.avg.item(),
                                                                    100. * max(best_val_acc, val_accuracy.avg)))

    # Checkpoint a new best model (only from the process that logs).
    if val_accuracy.avg > best_val_acc and log_writer:
        save_model(model, None, -1, args)

    if log_writer:
        log_writer.add_scalar('val/loss', val_loss.avg, epoch)
        log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)
        best_val_acc = max(best_val_acc, val_accuracy.avg)
        log_writer.add_scalar('val/best_acc', best_val_acc, epoch)
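accuracy and lib.Metric above are project-local and not shown in these examples. A minimal sketch of what they might provide, assuming Metric keeps a running average over 0-dim tensors (illustrative, not the project's actual code):

def accuracy(output, target):
    # Top-1 accuracy as a 0-dim tensor, matching how val()/train() use it.
    pred = output.max(1, keepdim=True)[1]
    return pred.eq(target.view_as(pred)).float().mean()


class Metric:
    # Running average; .avg stays a tensor so callers can do .avg.item().
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        self.sum += val.detach().cpu()
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n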
def train(model, train_loader, optimizer, criterion, epoch, log_writer, args):
    train_loss = lib.Metric('train_loss')
    train_accuracy = lib.Metric('train_accuracy')
    model.train()
    N = len(train_loader)
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        lr_cur = adjust_learning_rate(args, optimizer, epoch, batch_idx, N, type=args.lr_scheduler)
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss.update(loss)
        train_accuracy.update(accuracy(output, target))
        if (batch_idx + 1) % 20 == 0:
            # Peak GPU memory in MiB.
            memory = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
            used_time = time.time() - start_time
            # Remaining batches are N - (batch_idx + 1).
            eta = used_time / (batch_idx + 1) * (N - batch_idx - 1)
            eta = str(datetime.timedelta(seconds=int(eta)))
            training_state = '  '.join(['Epoch: {}', '[{} / {}]', 'eta: {}', 'lr: {:.9f}', 'max_mem: {:.0f}',
                                        'loss: {:.3f}', 'accuracy: {:.3f}'])
            training_state = training_state.format(epoch + 1, batch_idx + 1, N, eta, lr_cur, memory,
                                                   train_loss.avg.item(), 100. * train_accuracy.avg.item())
            print(training_state)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
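adjust_learning_rate is also project-local; its signature implies a per-iteration schedule selected by args.lr_scheduler. A hypothetical implementation consistent with that call (the cosine/step formulas and the args.lr field are assumptions, not the project's actual scheduler):

import math

def adjust_learning_rate(args, optimizer, epoch, batch_idx, N, type='cosine'):
    # Progress through training in [0, 1), at per-batch granularity.
    t = (epoch + batch_idx / N) / args.epochs
    if type == 'cosine':
        lr = 0.5 * args.lr * (1. + math.cos(math.pi * t))
    else:
        # Step decay: divide the base rate by 10 every 30 epochs.
        lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr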
Example #3
def val(model,
        val_loader,
        val_sampler,
        criterion,
        epoch,
        args,
        log_writer=False,
        verbose=False):
    global best_val_acc
    model.eval()
    val_loss = lib.Metric('val_loss')
    val_accuracy = lib.Metric('val_accuracy')

    if epoch == -1:
        epoch = args.epochs - 1

    if args.distributed:
        val_sampler.set_epoch(epoch)

    with tqdm(total=len(val_loader),
              desc='Validate Epoch #{}'.format(epoch + 1)) as t:
        with torch.no_grad():
            for data, target in val_loader:
                if args.cuda:
                    data = data.cuda(args.gpu, non_blocking=True)
                    target = target.cuda(args.gpu, non_blocking=True)
                output = model(data)

                loss = criterion(output, target)
                # all_reduce sums in place across processes; dividing by
                # the process count (one per GPU on a single node) then
                # recovers the mean.
                dist.all_reduce(loss)
                pred = output.max(1, keepdim=True)[1]
                acc = pred.eq(target.view_as(pred)).float().mean()
                dist.all_reduce(acc)

                val_loss.update(loss / args.ngpus_per_node)
                val_accuracy.update(acc / args.ngpus_per_node)
                t.update(1)

    if verbose:
        print("\nloss: {}, accuracy: {:.2f}, best acc: {:.2f}\n".format(
            val_loss.avg.item(), 100. * val_accuracy.avg.item(),
            100. * max(best_val_acc, val_accuracy.avg)))

    # Checkpoint a new best model (only from the process that logs).
    if val_accuracy.avg > best_val_acc and log_writer:
        dist_save_model(model, None, -1, args.ngpus_per_node, args)

    if verbose and log_writer:
        log_writer.add_scalar('val/loss', val_loss.avg, epoch)
        log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)
        best_val_acc = max(best_val_acc, val_accuracy.avg)
        log_writer.add_scalar('val/best_acc', best_val_acc, epoch)
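The manual division above works because dist.all_reduce defaults to ReduceOp.SUM, so sum / world_size yields the mean; these examples assume one process per GPU on a single node, making args.ngpus_per_node the world size. A small illustrative helper (not in the original code) that makes the pattern explicit:

def reduce_mean(tensor):
    # all_reduce is in-place and defaults to ReduceOp.SUM, so clone first,
    # then divide by the number of participating processes.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt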
Example #4
def train(model, train_sampler, train_loader, optimizer, criterion, epoch,
          log_writer, args, verbose):
    train_loss = lib.Metric('train_loss')
    train_accuracy = lib.Metric('train_accuracy')
    model.train()
    if args.distributed:
        train_sampler.set_epoch(epoch)
    N = len(train_loader)
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        lr_cur = adjust_learning_rate(args,
                                      optimizer,
                                      epoch,
                                      batch_idx,
                                      N,
                                      type=args.lr_scheduler)
        if args.cuda:
            data = data.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Average loss/accuracy across processes (all_reduce sums; one
        # process per GPU on a single node is assumed).
        dist.all_reduce(loss)
        pred = output.max(1, keepdim=True)[1]
        acc = pred.eq(target.view_as(pred)).float().mean()
        dist.all_reduce(acc)
        train_loss.update(loss / args.ngpus_per_node)
        train_accuracy.update(acc.cpu() / args.ngpus_per_node)
        if (batch_idx + 1) % 20 == 0 and verbose:
            # Peak GPU memory in MiB.
            memory = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
            used_time = time.time() - start_time
            # Remaining batches are N - (batch_idx + 1).
            eta = used_time / (batch_idx + 1) * (N - batch_idx - 1)
            eta = str(datetime.timedelta(seconds=int(eta)))
            training_state = '  '.join([
                'Epoch: {}', '[{} / {}]', 'eta: {}', 'lr: {:.9f}',
                'max_mem: {:.0f}', 'loss: {:.3f}', 'accuracy: {:.3f}'
            ])
            training_state = training_state.format(
                epoch + 1, batch_idx + 1, N, eta, lr_cur, memory,
                train_loss.avg.item(), 100. * train_accuracy.avg.item())
            print(training_state)

    if log_writer and verbose:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
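A minimal single-node launch sketch tying the distributed train()/val() together. This is an assumption-heavy illustration: build_model, build_dataset, and fields such as args.batch_size are hypothetical stand-ins, and a fixed TCP rendezvous address is used for brevity:

import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def worker(gpu, args):
    args.gpu = gpu
    dist.init_process_group('nccl', init_method='tcp://127.0.0.1:23456',
                            world_size=args.ngpus_per_node, rank=gpu)
    torch.cuda.set_device(gpu)
    model = build_model().cuda(gpu)     # hypothetical constructor
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    dataset = build_dataset('train')    # hypothetical constructor
    train_sampler = DistributedSampler(dataset)
    train_loader = DataLoader(dataset, batch_size=args.batch_size,
                              sampler=train_sampler, num_workers=4,
                              pin_memory=True)
    criterion = torch.nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    for epoch in range(args.epochs):
        # Only rank 0 prints and logs.
        train(model, train_sampler, train_loader, optimizer, criterion,
              epoch, None, args, verbose=(gpu == 0))

# mp.spawn(worker, nprocs=args.ngpus_per_node, args=(args,))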