Example #1
0
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on a validation set and return its mean accuracy.

    The integer class targets of every batch are expanded to one-hot rows
    before being handed to ``criterion``; the raw integer targets are what
    ``accuracy`` receives. Progress is reported through the module-level
    ``log`` object.

    Args:
        val_loader: Iterable yielding ``(features, target)`` batches.
        model: Network to evaluate. It is switched to eval mode here and is
            not switched back.
        criterion: Loss callable invoked as ``criterion(output, one_hot)``.

    Returns:
        The ``avg`` attribute of the accuracy meter after the last batch.
    """
    losses = Meter(ptag='Loss')
    acc = Meter(ptag='Accuracy')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for features, target in val_loader:
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(features)

            # One-hot encode the targets. The row width follows the model
            # output (dim 1) instead of a fixed 1000, so the same code works
            # for any number of classes; the tensor is created on the
            # target's device because scatter_ needs index and destination
            # to live on the same device.
            kl_target = torch.zeros(
                target.shape[0], output.size(1),
                device=target.device).scatter_(1, target.view(-1, 1), 1)
            loss = criterion(output, kl_target)

            # measure accuracy and record loss, weighting by batch size
            batch_size = features.size(0)
            acc_val = accuracy(output, target)
            losses.update(loss.item(), batch_size)
            acc.update(acc_val, batch_size)

        log.info(' * Accuracy {acc.avg:.3f}'.format(acc=acc))

    return acc.avg
Example #2
0
def validate(val_loader, model, criterion):
    """Run ``model`` over the validation set and log top-1/top-5 precision.

    The model is put into eval mode and ``model.disable_gossip()`` is called
    before the pass; neither is undone here. The summary line goes to the
    module-level ``log`` object.

    Args:
        val_loader: Iterable yielding ``(features, target)`` batches.
        model: Network to evaluate; must expose ``disable_gossip()``.
        criterion: Loss callable invoked as ``criterion(output, target)``.

    Returns:
        The ``avg`` attribute of the top-1 precision meter.
    """
    loss_meter = Meter(ptag='Loss')
    top1_meter = Meter(ptag='Prec@1')
    top5_meter = Meter(ptag='Prec@5')

    # evaluation mode, with gossip turned off for the whole pass
    model.eval()
    model.disable_gossip()

    with torch.no_grad():
        for features, target in val_loader:
            target = target.cuda(non_blocking=True)

            # forward pass and loss
            output = model(features)
            loss = criterion(output, target)

            # record loss and precision, each weighted by the batch size
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            n_samples = features.size(0)
            recorded = (
                (loss_meter, loss),
                (top1_meter, prec1),
                (top5_meter, prec5),
            )
            for meter, value in recorded:
                meter.update(value.item(), n_samples)

        summary = ' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(
            top1=top1_meter, top5=top5_meter)
        log.info(summary)

    return top1_meter.avg
Example #3
0
def validate(val_loader, model, criterion, log):
    """Evaluate ``model`` on a validation set; return loss and precision.

    The integer class targets of every batch are expanded to one-hot rows
    before being handed to ``criterion``; the raw integer targets are what
    ``accuracy`` receives.

    Args:
        val_loader: Iterable yielding ``(features, target)`` batches.
        model: Network to evaluate. It is switched to eval mode here and is
            not switched back.
        criterion: Loss callable invoked as ``criterion(output, one_hot)``.
        log: Logger-like object; only ``log.info`` is used.

    Returns:
        A tuple ``(losses.avg, top1.avg, top5.avg)`` read from the loss,
        top-1 and top-5 meters after the last batch.
    """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for features, target in val_loader:
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(features)

            # One-hot encode the targets. The row width follows the model
            # output (dim 1) instead of a fixed 1000, so the same code works
            # for any number of classes; the tensor is created on the
            # target's device because scatter_ needs index and destination
            # to live on the same device.
            kl_target = torch.zeros(
                target.shape[0], output.size(1),
                device=target.device).scatter_(1, target.view(-1, 1), 1)
            loss = criterion(output, kl_target)

            # measure accuracy and record loss, weighting by batch size
            batch_size = features.size(0)
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

        log.info(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Loss {losses.avg:.3f}'
                 .format(top1=top1, top5=top5, losses=losses))

    return losses.avg, top1.avg, top5.avg
Example #4
0
def train(model, criterion, optimizer, batch_meter, data_meter, nn_meter,
          loader, epoch, itr, begin_time, num_itr_ignore):
    """Train ``model`` for one epoch and append CSV progress rows to a file.

    Reads the module-level ``args`` (``overlap``, ``all_reduce``,
    ``print_freq``, ``out_fname``, ``num_iterations_per_training_epoch``)
    and logs through the module-level ``log``.

    Args:
        model: Network to train; must expose ``transfer_params()``.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer stepped and zeroed once per batch.
        batch_meter: Meter fed the wall time of each whole iteration.
        data_meter: Meter fed the time spent waiting for each batch.
        nn_meter: Meter fed the forward/backward/step time of each batch.
        loader: Data loader; its iterator must expose ``sample_iter``.
        epoch: Epoch number, used for logging and learning-rate updates.
        itr: Iteration to resume from; that many sampler entries are skipped.
        begin_time: Not used by this function.
        num_itr_ignore: Number of leading iterations whose timings are not
            recorded in the three timing meters.

    Returns:
        None, both on normal completion and when skipping ahead fails.
    """
    loss_meter = Meter(ptag='Loss')
    acc_meter = Meter(ptag="Accuracy")

    # One CSV row: epoch, iteration, the three timing meters, loss val/avg,
    # accuracy val/avg and a constant -1 final column.
    row_fmt = ('{ep},{itr},{bt},{nt},{dt},'
               '{loss.val:.4f},{loss.avg:.4f},'
               '{acc.val:.3f},{acc.avg:.3f},'
               '-1')

    model.train()

    # Advance only the sampler so a run resumed from a checkpoint does not
    # load the already-consumed batches again. `step` is deliberately the
    # same name as the main loop index: if the main loop yields nothing, the
    # final row below reports the value left here.
    batches = iter(loader)
    for step in range(itr):
        try:
            next(batches.sample_iter)
        except Exception:
            # epoch had finished but the job was preempted before the state
            # was updated
            log.info('Loader spoof error attempt {}/{}'.format(
                step, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))

    tick = time.time()
    for step, (batch, target) in enumerate(batches, start=itr):
        target = target.cuda(non_blocking=True)

        # Timings are recorded only once the warm-up budget is used up; the
        # budget is decremented at the end of the iteration, so one check
        # per iteration covers all three meters.
        record_timing = num_itr_ignore == 0
        if record_timing:
            data_meter.update(time.time() - tick)

        # --- forward / backward / optimizer step ---------------------- #
        nn_start = time.time()
        output = model(batch)
        loss = criterion(output, target)
        loss.backward()

        # the learning rate is refreshed every 100 iterations
        if step % 100 == 0:
            update_learning_rate(optimizer,
                                 epoch,
                                 itr=step,
                                 itr_per_epoch=len(loader))
        optimizer.step()
        optimizer.zero_grad()
        if not (args.overlap or args.all_reduce):
            log.debug('Transferring params')
            model.transfer_params()
        if record_timing:
            nn_meter.update(time.time() - nn_start)
            batch_meter.update(time.time() - tick)
        tick = time.time()
        # -------------------------------------------------------------- #

        log_start = time.time()
        # measure accuracy and record loss, weighting by batch size
        acc_val = accuracy(output, target)
        n_samples = batch.size(0)
        loss_meter.update(loss.item(), n_samples)
        acc_meter.update(acc_val, n_samples)

        if step % args.print_freq == 0:
            with open(args.out_fname, '+a') as f:
                print(row_fmt.format(ep=epoch,
                                     itr=step,
                                     bt=batch_meter,
                                     dt=data_meter,
                                     nt=nn_meter,
                                     loss=loss_meter,
                                     acc=acc_meter),
                      file=f)
        if num_itr_ignore > 0:
            num_itr_ignore -= 1
        log.debug(time.time() - log_start)

        # optional cap on the number of iterations per epoch (-1 = no cap)
        iteration_cap = args.num_iterations_per_training_epoch
        if iteration_cap != -1 and step + 1 == iteration_cap:
            break

    # always emit a closing row for the epoch
    with open(args.out_fname, '+a') as f:
        print(row_fmt.format(ep=epoch,
                             itr=step,
                             bt=batch_meter,
                             dt=data_meter,
                             nt=nn_meter,
                             loss=loss_meter,
                             acc=acc_meter),
              file=f)
Example #5
0
def train(model, criterion, optimizer, batch_meter, data_meter, nn_meter,
          loader, epoch, itr, begin_time):
    """Train ``model`` for one epoch with gossip enabled, logging CSV rows.

    Reads the module-level ``args`` (``rank``, ``print_freq``,
    ``global_epoch``, ``global_itr``, ``world_size``, ``out_fname``) and
    logs through the module-level ``log``.

    Args:
        model: Network to train; must expose ``enable_gossip()``.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer stepped and zeroed once per batch.
        batch_meter: Meter fed the wall time of each whole iteration.
        data_meter: Meter fed the time spent waiting for each batch.
        nn_meter: Meter fed the forward/backward/step time of each batch.
        loader: Data loader; its iterator must expose ``sample_iter``.
        epoch: Epoch number, used for logging and learning-rate updates.
        itr: Iteration to resume from; that many sampler entries are skipped.
        begin_time: Not used by this function.

    Returns:
        None, both on normal completion and when skipping ahead fails.
    """
    loss_meter = Meter(ptag='Loss')
    top1_meter = Meter(ptag='Prec@1')
    top5_meter = Meter(ptag='Prec@5')

    # One CSV row: epoch, iteration, the three timing meters, loss val/avg,
    # top-1 val/avg, top-5 val/avg and a constant -1 final column.
    row_fmt = ('{ep},{itr},{bt},{nt},{dt},'
               '{loss.val:.4f},{loss.avg:.4f},'
               '{top1.val:.3f},{top1.avg:.3f},'
               '{top5.val:.3f},{top5.avg:.3f},-1')

    # interval, in iterations, between bilateral learning-rate refreshes
    bilat_freq = 100

    model.train()

    # Advance only the sampler so a run resumed from a checkpoint does not
    # load the already-consumed batches again. `step` is deliberately the
    # same name as the main loop index: if the main loop yields nothing, the
    # final row below reports the value left here.
    batches = iter(loader)
    for step in range(itr):
        try:
            next(batches.sample_iter)
        except Exception:
            # epoch had finished but the job was preempted before the state
            # was updated
            log.info('Loader spoof error attempt {}/{}'.format(
                step, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))

    model.enable_gossip()

    tick = time.time()
    for step, (batch, target) in enumerate(batches, start=itr):
        target = target.cuda(non_blocking=True)
        data_meter.update(time.time() - tick)

        # --- forward / backward / optimizer step ---------------------- #
        nn_start = time.time()
        output = model(batch)
        loss = criterion(output, target)

        # Advance the global iteration counter by 1 on the very first
        # iteration, and by bilat_freq whenever (step + rank) hits a
        # multiple of bilat_freq; otherwise leave it alone.
        if step == 0:
            counter_increment = 1
        elif (step + args.rank) % bilat_freq == 0:
            counter_increment = bilat_freq
        else:
            counter_increment = None
        if counter_increment is not None:
            update_global_iteration_counter(itr=counter_increment,
                                            itr_per_epoch=len(loader))
            update_bilat_learning_rate(model, itr_per_epoch=len(loader))

        loss.backward()
        update_learning_rate(optimizer,
                             epoch,
                             itr=step,
                             itr_per_epoch=len(loader))
        optimizer.step()
        optimizer.zero_grad()
        nn_meter.update(time.time() - nn_start)
        # -------------------------------------------------------------- #

        batch_meter.update(time.time() - tick)
        tick = time.time()

        log_start = time.time()
        # measure accuracy and record loss, weighting by batch size
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        n_samples = batch.size(0)
        loss_meter.update(loss.item(), n_samples)
        top1_meter.update(prec1.item(), n_samples)
        top5_meter.update(prec5.item(), n_samples)

        if step % args.print_freq == 0:
            # In-loop rows report the global epoch/iteration kept on `args`.
            # NOTE(review): the closing row after the loop reports the local
            # epoch/step instead -- confirm the mismatch is intended.
            global_ep = args.global_epoch
            global_step = args.global_itr % (len(loader) * args.world_size)
            with open(args.out_fname, '+a') as f:
                print(row_fmt.format(ep=global_ep,
                                     itr=global_step,
                                     bt=batch_meter,
                                     dt=data_meter,
                                     nt=nn_meter,
                                     loss=loss_meter,
                                     top1=top1_meter,
                                     top5=top5_meter),
                      file=f)
        log.debug(time.time() - log_start)

    # always emit a closing row for the epoch
    with open(args.out_fname, '+a') as f:
        print(row_fmt.format(ep=epoch,
                             itr=step,
                             bt=batch_meter,
                             dt=data_meter,
                             nt=nn_meter,
                             loss=loss_meter,
                             top1=top1_meter,
                             top5=top5_meter),
              file=f)
Example #6
0
def train(config, model, criterion, optimizer, batch_meter, data_meter, nn_meter,
          loader, epoch, itr, begin_time, num_itr_ignore, log):
    """Train ``model`` for one epoch against one-hot targets, logging CSV rows.

    The integer class targets of every batch are expanded to one-hot rows
    before being handed to ``criterion``; the raw integer targets are what
    ``accuracy`` receives.

    Args:
        config: Mapping read for ``'overlap'``, ``'all_reduce'``,
            ``'print_freq'``, ``'out_fname'`` and
            ``'num_iterations_per_training_epoch'``.
        model: Network to train; must expose ``transfer_params()``.
        criterion: Loss callable invoked as ``criterion(output, one_hot)``.
        optimizer: Optimizer stepped and zeroed once per batch.
        batch_meter: Meter fed the wall time of each whole iteration.
        data_meter: Meter fed the time spent waiting for each batch.
        nn_meter: Meter fed the forward/backward/step time of each batch.
        loader: Data loader; its iterator must expose ``sample_iter``.
        epoch: Epoch number, used for logging and learning-rate updates.
        itr: Iteration to resume from; that many sampler entries are skipped.
        begin_time: Not used by this function.
        num_itr_ignore: Number of leading iterations whose timings are not
            recorded in the three timing meters.
        log: Logger-like object; ``log.info`` and ``log.debug`` are used.

    Returns:
        A tuple ``(losses.avg, top1.avg, top5.avg)`` on normal completion.
        NOTE(review): the skip-ahead failure path returns ``None`` instead,
        so callers that unpack the result must handle that case.
    """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # One CSV row: epoch, iteration, the three timing meters, loss val/avg,
    # top-1 val/avg, top-5 val/avg and a constant -1 final column.
    row_fmt = ('{ep},{itr},{bt},{nt},{dt},'
               '{loss.val:.4f},{loss.avg:.4f},'
               '{top1.val:.3f},{top1.avg:.3f},'
               '{top5.val:.3f},{top5.avg:.3f},-1')

    # switch to train mode
    model.train()

    # Spoof the sampler to continue from a checkpoint without loading the
    # already-consumed data again. `i` is shared with the main loop on
    # purpose: if the main loop yields nothing, the final row below reports
    # the value left here.
    _train_loader = loader.__iter__()
    for i in range(itr):
        try:
            next(_train_loader.sample_iter)
        except Exception:
            # finished epoch but preempted before state was updated
            log.info('Loader spoof error attempt {}/{}'.format(i, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))

    batch_time = time.time()
    for i, (batch, target) in enumerate(_train_loader, start=itr):
        target = target.cuda(non_blocking=True)

        # Timings are recorded only once the warm-up budget is used up; the
        # budget is decremented at the end of the iteration, so one check
        # per iteration covers all three meters.
        record_timing = num_itr_ignore == 0
        if record_timing:
            data_meter.update(time.time() - batch_time)

        # ----------------------------------------------------------- #
        # Forward/Backward pass
        # ----------------------------------------------------------- #
        nn_time = time.time()
        output = model(batch)

        # One-hot encode the targets. The row width follows the model
        # output (dim 1) instead of a fixed 1000, so the same code works for
        # any number of classes. This has to happen after the forward pass,
        # so its (small) cost is counted in nn_meter rather than data_meter.
        # The tensor is created on the target's device because scatter_
        # needs index and destination to live on the same device.
        kl_target = torch.zeros(
            target.shape[0], output.size(1),
            device=target.device).scatter_(1, target.view(-1, 1), 1)
        loss = criterion(output, kl_target)
        loss.backward()

        # the learning rate is refreshed every 100 iterations
        if i % 100 == 0:
            update_learning_rate(config, optimizer, epoch, log, itr=i,
                                 itr_per_epoch=len(loader))
        optimizer.step()  # optimization update
        optimizer.zero_grad()
        if not config['overlap'] and not config['all_reduce']:
            log.debug('Transferring params')
            model.transfer_params()
        if record_timing:
            nn_meter.update(time.time() - nn_time)
        # ----------------------------------------------------------- #

        if record_timing:
            batch_meter.update(time.time() - batch_time)
        batch_time = time.time()

        log_time = time.time()
        # measure accuracy and record loss, weighting by batch size
        batch_size = batch.size(0)
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), batch_size)
        top1.update(prec1.item(), batch_size)
        top5.update(prec5.item(), batch_size)
        if i % config['print_freq'] == 0:
            with open(config['out_fname'], '+a') as f:
                print(row_fmt.format(ep=epoch, itr=i,
                                     bt=batch_meter,
                                     dt=data_meter, nt=nn_meter,
                                     loss=losses, top1=top1,
                                     top5=top5), file=f)
        if num_itr_ignore > 0:
            num_itr_ignore -= 1
        log_time = time.time() - log_time
        log.debug(log_time)

        # optional cap on the number of iterations per epoch (-1 = no cap)
        if (config['num_iterations_per_training_epoch'] != -1 and
                i + 1 == config['num_iterations_per_training_epoch']):
            break

    # always emit a closing row for the epoch
    with open(config['out_fname'], '+a') as f:
        print(row_fmt.format(ep=epoch, itr=i,
                             bt=batch_meter,
                             dt=data_meter, nt=nn_meter,
                             loss=losses, top1=top1,
                             top5=top5), file=f)

    return losses.avg, top1.avg, top5.avg