def validate(val_loader, model, criterion):
    """Evaluate model on the validation set using a one-hot (KL-style) target.

    Args:
        val_loader: iterable yielding (features, target) batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss taking (output, one_hot_target).

    Returns:
        Average accuracy over the validation set (acc.avg).
    """
    losses = Meter(ptag='Loss')
    acc = Meter(ptag='Accuracy')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for i, (features, target) in enumerate(val_loader):
            target = target.cuda(non_blocking=True)

            # compute output first so the class count can be read from it
            output = model(features)

            # create one-hot vector from target; the number of classes is
            # taken from the model output rather than hard-coded to 1000,
            # and the device follows the target instead of a literal 'cuda'
            kl_target = torch.zeros(
                target.shape[0], output.size(1),
                device=target.device).scatter_(1, target.view(-1, 1), 1)

            loss = criterion(output, kl_target)

            # measure accuracy and record loss
            acc_val = accuracy(output, target)
            losses.update(loss.item(), features.size(0))
            acc.update(acc_val, features.size(0))

    log.info(' * Accuracy {acc.avg:.3f}'.format(acc=acc))

    return acc.avg
def validate(val_loader, model, criterion):
    """ Evaluate model using criterion on validation set """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # put the network into inference mode and stop gossip exchanges
    model.eval()
    model.disable_gossip()

    with torch.no_grad():
        for features, target in val_loader:
            target = target.cuda(non_blocking=True)

            # forward pass and loss on this batch
            output = model(features)
            loss = criterion(output, target)

            # accumulate loss and top-1/top-5 precision, weighted by batch size
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            batch_size = features.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

    log.info(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(
        top1=top1, top5=top5))

    return top1.avg
def validate(val_loader, model, criterion, log):
    """Evaluate model on the validation set using a one-hot (KL-style) target.

    Args:
        val_loader: iterable yielding (features, target) batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss taking (output, one_hot_target).
        log: logger used for the summary line.

    Returns:
        Tuple of (losses.avg, top1.avg, top5.avg).
    """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for i, (features, target) in enumerate(val_loader):
            target = target.cuda(non_blocking=True)

            # compute output first so the class count can be read from it
            output = model(features)

            # create one-hot vector from target; the number of classes is
            # taken from the model output rather than hard-coded to 1000,
            # and the device follows the target instead of a literal 'cuda'
            kl_target = torch.zeros(
                target.shape[0], output.size(1),
                device=target.device).scatter_(1, target.view(-1, 1), 1)

            loss = criterion(output, kl_target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), features.size(0))
            top1.update(prec1.item(), features.size(0))
            top5.update(prec5.item(), features.size(0))

    log.info(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Loss {losses.avg:.3f}'
             .format(top1=top1, top5=top5, losses = losses))

    return losses.avg, top1.avg, top5.avg
def train(model, criterion, optimizer, batch_meter, data_meter, nn_meter,
          loader, epoch, itr, begin_time, num_itr_ignore):
    """Run one training epoch, resuming mid-epoch from iteration `itr`.

    Records loss/accuracy meters and appends per-iteration CSV rows to
    args.out_fname. batch_meter/data_meter/nn_meter collect timing stats;
    the first `num_itr_ignore` iterations are excluded from them (warmup).
    Returns early (None) if the sampler spoof below runs out of data.
    """
    losses = Meter(ptag='Loss')
    acc = Meter(ptag="Accuracy")
    # top1 = Meter(ptag='Prec@1')
    # top5 = Meter(ptag='Prec@5')

    # switch to train mode
    model.train()

    # spoof sampler to continue from checkpoint w/o loading data all over again
    # (advance the underlying sample iterator `itr` times instead of re-reading
    #  batches; assumes the iterator exposes `sample_iter` — TODO confirm)
    _train_loader = loader.__iter__()
    for i in range(itr):
        try:
            next(_train_loader.sample_iter)
        except Exception:
            # finished epoch but prempted before state was updated
            log.info('Loader spoof error attempt {}/{}'.format(i, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))

    batch_time = time.time()
    for i, (batch, target) in enumerate(_train_loader, start=itr):
        target = target.cuda(non_blocking=True)
        # create one-hot vector from target
        # kl_target = torch.zeros(target.shape[0], 1000, device='cuda').scatter_(
        #     1, target.view(-1, 1), 1)
        # only record data-loading time once the warmup iterations are done
        if num_itr_ignore == 0:
            data_meter.update(time.time() - batch_time)

        # ----------------------------------------------------------- #
        # Forward/Backward pass
        # ----------------------------------------------------------- #
        nn_time = time.time()
        output = model(batch)
        loss = criterion(output, target)
        loss.backward()
        # learning-rate schedule is refreshed every 100 iterations
        if i % 100 == 0:
            update_learning_rate(optimizer, epoch, itr=i,
                                 itr_per_epoch=len(loader))
        optimizer.step()  # optimization update
        optimizer.zero_grad()
        # without overlap or all-reduce, push params to peers explicitly
        if not args.overlap and not args.all_reduce:
            log.debug('Transferring params')
            model.transfer_params()
        if num_itr_ignore == 0:
            nn_meter.update(time.time() - nn_time)
        # ----------------------------------------------------------- #

        if num_itr_ignore == 0:
            batch_meter.update(time.time() - batch_time)
        batch_time = time.time()

        log_time = time.time()
        # measure accuracy and record loss
        acc_val = accuracy(output, target)
        losses.update(loss.item(), batch.size(0))
        acc.update(acc_val, batch.size(0))
        # top1.update(prec1.item(), batch.size(0))
        # top5.update(prec5.item(), batch.size(0))
        # append a CSV row (epoch, iter, timings, loss, accuracy) to the log file
        if i % args.print_freq == 0:
            with open(args.out_fname, '+a') as f:
                print('{ep},{itr},{bt},{nt},{dt},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{acc.val:.3f},{acc.avg:.3f},'
                      '-1'.format(ep=epoch, itr=i,
                                  bt=batch_meter, dt=data_meter,
                                  nt=nn_meter, loss=losses,
                                  acc=acc), file=f)
        # count down the warmup iterations excluded from the timing meters
        if num_itr_ignore > 0:
            num_itr_ignore -= 1
        log_time = time.time() - log_time
        log.debug(log_time)
        # optional hard cap on iterations per epoch (-1 disables it)
        if (args.num_iterations_per_training_epoch != -1
                and i + 1 == args.num_iterations_per_training_epoch):
            break

    # final CSV row with the epoch's closing meter values
    with open(args.out_fname, '+a') as f:
        print('{ep},{itr},{bt},{nt},{dt},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{acc.val:.3f},{acc.avg:.3f},'
              '-1'.format(ep=epoch, itr=i,
                          bt=batch_meter, dt=data_meter,
                          nt=nn_meter, loss=losses,
                          acc=acc), file=f)
def train(model, criterion, optimizer, batch_meter, data_meter, nn_meter,
          loader, epoch, itr, begin_time):
    """Run one training epoch with gossip enabled, resuming from iteration `itr`.

    Tracks loss/top-1/top-5 meters and appends per-iteration CSV rows to
    args.out_fname. Periodically refreshes the bilateral-gossip learning rate
    and a global iteration counter. Returns early (None) if the sampler spoof
    below runs out of data.
    """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # switch to train mode
    model.train()

    # spoof sampler to continue from checkpoint w/o loading data all over again
    # (advance the underlying sample iterator `itr` times instead of re-reading
    #  batches; assumes the iterator exposes `sample_iter` — TODO confirm)
    _train_loader = loader.__iter__()
    for i in range(itr):
        try:
            next(_train_loader.sample_iter)
        except Exception:
            # finished epoch but prempted before state was updated
            log.info('Loader spoof error attempt {}/{}'.format(i, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))
    model.enable_gossip()

    batch_time = time.time()
    for i, (batch, target) in enumerate(_train_loader, start=itr):
        target = target.cuda(non_blocking=True)
        data_meter.update(time.time() - batch_time)

        # ----------------------------------------------------------- #
        # Forward/Backward pass
        # ----------------------------------------------------------- #
        nn_time = time.time()
        output = model(batch)
        loss = criterion(output, target)
        # refresh the bilateral-gossip LR and the global iteration counter on
        # the first iteration and then every `bilat_freq` iterations; the rank
        # offset staggers the refresh across workers
        bilat_freq = 100
        if i == 0:
            update_global_iteration_counter(itr=1, itr_per_epoch=len(loader))
            update_bilat_learning_rate(model, itr_per_epoch=len(loader))
        elif (i + args.rank) % (bilat_freq) == 0:
            update_global_iteration_counter(itr=bilat_freq,
                                            itr_per_epoch=len(loader))
            update_bilat_learning_rate(model, itr_per_epoch=len(loader))
        loss.backward()
        update_learning_rate(optimizer, epoch, itr=i,
                             itr_per_epoch=len(loader))
        optimizer.step()  # optimization update
        optimizer.zero_grad()
        nn_meter.update(time.time() - nn_time)
        # ----------------------------------------------------------- #

        batch_meter.update(time.time() - batch_time)
        batch_time = time.time()

        log_time = time.time()
        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), batch.size(0))
        top1.update(prec1.item(), batch.size(0))
        top5.update(prec5.item(), batch.size(0))
        # append a CSV row to the log file; note this uses the globally
        # synchronized epoch/iteration counters (and rebinds `itr` locally)
        if i % args.print_freq == 0:
            ep = args.global_epoch
            itr = args.global_itr % (len(loader) * args.world_size)
            with open(args.out_fname, '+a') as f:
                print('{ep},{itr},{bt},{nt},{dt},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},'
                      '{top5.val:.3f},{top5.avg:.3f},-1'.format(
                          ep=ep, itr=itr,
                          bt=batch_meter, dt=data_meter,
                          nt=nn_meter, loss=losses,
                          top1=top1, top5=top5), file=f)
        log_time = time.time() - log_time
        log.debug(log_time)

    # final CSV row with the epoch's closing meter values (local epoch/iter)
    with open(args.out_fname, '+a') as f:
        print('{ep},{itr},{bt},{nt},{dt},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},'
              '{top5.val:.3f},{top5.avg:.3f},-1'.format(
                  ep=epoch, itr=i,
                  bt=batch_meter, dt=data_meter,
                  nt=nn_meter, loss=losses,
                  top1=top1, top5=top5), file=f)
def train(config, model, criterion, optimizer, batch_meter, data_meter,
          nn_meter, loader, epoch, itr, begin_time, num_itr_ignore, log):
    """Run one training epoch with a one-hot (KL-style) target, resuming from
    iteration `itr`.

    Settings come from the `config` mapping (print_freq, out_fname, overlap,
    all_reduce, num_iterations_per_training_epoch). The first `num_itr_ignore`
    iterations are excluded from the timing meters (warmup). Appends
    per-iteration CSV rows to config['out_fname'].

    Returns:
        (losses.avg, top1.avg, top5.avg), or None if the sampler spoof
        below runs out of data.
    """
    losses = Meter(ptag='Loss')
    top1 = Meter(ptag='Prec@1')
    top5 = Meter(ptag='Prec@5')

    # switch to train mode
    model.train()

    # spoof sampler to continue from checkpoint w/o loading data all over again
    # (advance the underlying sample iterator `itr` times instead of re-reading
    #  batches; assumes the iterator exposes `sample_iter` — TODO confirm)
    _train_loader = loader.__iter__()
    for i in range(itr):
        try:
            next(_train_loader.sample_iter)
        except Exception:
            # finished epoch but prempted before state was updated
            log.info('Loader spoof error attempt {}/{}'.format(i, len(loader)))
            return

    log.debug('Training (epoch {})'.format(epoch))

    batch_time = time.time()
    for i, (batch, target) in enumerate(_train_loader, start=itr):
        # if args.fp16:
        #     batch = batch.cuda(non_blocking=True).half()
        target = target.cuda(non_blocking=True)
        # create one-hot vector from target (1000 classes assumed, ImageNet-sized
        # output presumably — verify against the model)
        kl_target = torch.zeros(target.shape[0], 1000, device='cuda').scatter_(
            1, target.view(-1, 1), 1)
        # only record data-loading time once the warmup iterations are done
        if num_itr_ignore == 0:
            data_meter.update(time.time() - batch_time)

        # ----------------------------------------------------------- #
        # Forward/Backward pass
        # ----------------------------------------------------------- #
        nn_time = time.time()
        output = model(batch)
        loss = criterion(output, kl_target)
        # if args.fp16:
        #     if args.amp:
        #         with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
        #             scaled_loss.backward()
        #     else:
        #         optimizer.backward(loss)
        # else:
        #     loss.backward()
        loss.backward()
        # learning-rate schedule is refreshed every 100 iterations
        if i % 100 == 0:
            update_learning_rate(config, optimizer, epoch, log, itr=i,
                                 itr_per_epoch=len(loader))
        optimizer.step()  # optimization update
        optimizer.zero_grad()
        # without overlap or all-reduce, push params to peers explicitly
        if not config['overlap'] and not config['all_reduce']:
            log.debug('Transferring params')
            model.transfer_params()
        if num_itr_ignore == 0:
            nn_meter.update(time.time() - nn_time)
        # ----------------------------------------------------------- #

        if num_itr_ignore == 0:
            batch_meter.update(time.time() - batch_time)
        batch_time = time.time()

        log_time = time.time()
        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), batch.size(0))
        top1.update(prec1.item(), batch.size(0))
        top5.update(prec5.item(), batch.size(0))
        # append a CSV row (epoch, iter, timings, loss, top-1/top-5) to the file
        if i % config['print_freq'] == 0:
            with open(config['out_fname'], '+a') as f:
                print('{ep},{itr},{bt},{nt},{dt},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},'
                      '{top5.val:.3f},{top5.avg:.3f},-1'
                      .format(ep=epoch, itr=i,
                              bt=batch_meter, dt=data_meter,
                              nt=nn_meter, loss=losses,
                              top1=top1, top5=top5), file=f)
        # count down the warmup iterations excluded from the timing meters
        if num_itr_ignore > 0:
            num_itr_ignore -= 1
        log_time = time.time() - log_time
        log.debug(log_time)
        # optional hard cap on iterations per epoch (-1 disables it)
        if (config['num_iterations_per_training_epoch'] != -1
                and i+1 == config['num_iterations_per_training_epoch']):
            break

    # final CSV row with the epoch's closing meter values
    with open(config['out_fname'], '+a') as f:
        print('{ep},{itr},{bt},{nt},{dt},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},'
              '{top5.val:.3f},{top5.avg:.3f},-1'
              .format(ep=epoch, itr=i,
                      bt=batch_meter, dt=data_meter,
                      nt=nn_meter, loss=losses,
                      top1=top1, top5=top5), file=f)

    return losses.avg, top1.avg, top5.avg