def validate(loader, model, criterion, optimizer, device, log):
    """Run one validation pass over `loader`.

    Prints/logs the mean 20-unit and 50-unit accuracies every 100 batches and
    returns the mean 20-unit accuracy over the whole loader.
    """
    model.eval()
    size_batch, size_data = loader.batch_size, len(loader)
    running_acc_20, iteration_acc_20, iteration_acc_50 = 0, 0, 0
    for index, data in enumerate(loader):
        inputs = data['frame'].to(device)
        # NOTE(review): values are shifted by 976 and clamped to [0, 999] —
        # presumably mapping raw steering readings onto 1000 class bins; confirm.
        labels = misc.limit_value_tensor(data['noise_label'] - 976, 0, 999).to(device)
        real_label = misc.limit_value_tensor(data['steer'] - 976, 0, 999).to(device)
        optimizer.zero_grad()  # no backward pass occurs here; kept for parity with the train loop
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            loss = criterion(outputs, labels)  # computed for parity; not accumulated
            # BUG FIX: the two tolerances were swapped — acc_50 was computed
            # with tolerance 20 and acc_20 with tolerance 50. Each accuracy now
            # uses the tolerance its name promises.
            acc_20 = misc.accuracy(predicted, real_label, size_batch, 20)
            acc_50 = misc.accuracy(predicted, real_label, size_batch, 50)
        running_acc_20 += acc_20
        iteration_acc_20 += acc_20
        iteration_acc_50 += acc_50
        if index % 100 == 99:
            out = 'Iteration: {:>5}/{:<5} {:5} || Acc_20: {:.4f} Acc_50: {:.4f}'.format(
                index, size_data, 'val', iteration_acc_20 / 100, iteration_acc_50 / 100)
            print(out)
            log.write(out)
            iteration_acc_20, iteration_acc_50 = 0, 0
    return running_acc_20 / size_data
def test():
    """Evaluate the (module-global) model on the test set.

    Returns a tuple of (average top-1 accuracy, gate sparsity).
    """
    loss_meter = misc.AverageMeter()
    acc1_meter = misc.AverageMeter()
    acc5_meter = misc.AverageMeter()
    model.eval()
    prefetcher = datasets.DataPrefetcher(test_loader)
    with torch.no_grad():
        batch, labels = prefetcher.next()
        while batch is not None:
            default_graph.clear_all_tensors()
            batch, labels = batch.to(args.device), labels.to(args.device)
            logits = model(batch)
            batch_loss = criterion(logits, labels)
            top1, top5 = misc.accuracy(logits, labels, topk=(1, 5))
            n = batch.size(0)
            loss_meter.update(batch_loss.item(), n)
            acc1_meter.update(top1.item(), n)
            acc5_meter.update(top5.item(), n)
            batch, labels = prefetcher.next()
    # Fraction of gate parameters that are non-zero across all layers.
    test_sparsity = (torch.cat(gates_params) != 0).float().mean().item()
    print(' * Test set: Loss_CE: %.4f, '
          'Sparsity: %.4f, Top1 acc: %.4f, Top5 acc: %.4f\n' %
          (loss_meter.avg, test_sparsity, acc1_meter.avg, acc5_meter.avg))
    return acc1_meter.avg, test_sparsity
def loss_labels(self, outputs, targets, indices, num_segments, log=True):
    """Classification loss (NLL)
    targets dicts must contain the key "labels" containing a tensor of dim [nb_target_segments]
    """
    assert 'pred_logits' in outputs
    logits = outputs['pred_logits']
    permutation = self._get_src_permutation_idx(indices)
    matched_labels = torch.cat(
        [t['labels'][tgt] for t, (_, tgt) in zip(targets, indices)]).long()
    # Default every query to the "no-object" class, then overwrite matched ones.
    classes = torch.full(logits.shape[:2], self.num_classes,
                         dtype=torch.int64, device=logits.device)
    classes[permutation] = matched_labels
    ce = F.cross_entropy(logits.transpose(1, 2), classes, self.empty_weight)
    result = {'loss_ce': ce}
    if log:
        # TODO this should probably be a separate loss, not hacked in this one here
        result['class_error'] = 100 - accuracy(logits[permutation], matched_labels)[0]
    return result
def validate(val_loader, model, criterion, epoch):
    """Distributed validation pass over `val_loader`.

    Accuracy and loss are averaged across workers via reduce_tensor; prints the
    epoch summary and returns the averaged top-1 accuracy.
    """
    loss_avg = misc.AverageMeter()
    acc1_avg = misc.AverageMeter()
    acc5_avg = misc.AverageMeter()
    # switch to evaluate mode
    model.eval()
    prefetcher = datasets.DataPrefetcher(val_loader)
    step = -1
    batch, labels = prefetcher.next()
    while batch is not None:
        step += 1
        with torch.no_grad():
            logits = model(batch)
            loss = criterion(logits, labels)
        # measure accuracy, then average the metrics across processes
        acc1, acc5 = misc.accuracy(logits.data, labels, topk=(1, 5))
        loss_reduced = reduce_tensor(loss.data)
        acc1 = reduce_tensor(acc1)
        acc5 = reduce_tensor(acc5)
        n = batch.size(0)
        loss_avg.update(to_python_float(loss_reduced), n)
        acc1_avg.update(to_python_float(acc1), n)
        acc5_avg.update(to_python_float(acc5), n)
        batch, labels = prefetcher.next()
    print(' * Test Epoch {0}, Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}\n'.
          format(epoch, top1=acc1_avg, top5=acc5_avg))
    return acc1_avg.avg
def train(train_loader, model, criterion, optimizer, use_cuda):
    """One training epoch with a progress bar; returns (avg loss, avg top-1)."""
    # Switch to train mode
    model.train()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    end = time.time()
    bar = Bar('Processing', max=len(train_loader))
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Measure data loading time
        data_time.update(time.time() - end)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        # Drop the singleton label dimension so criterion sees shape (N,).
        targets = targets.squeeze(
            1)  # pytorch 0.4.0 merged Variable and Tensor
        # inputs, targets = V(inputs), V(targets.squeeze(1))
        # Compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Measure accuracy and record loss
        prec1 = accuracy(outputs.data, targets.data, topk=(1, ))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1[0], inputs.size(0))
        # Compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # Plot progress
        bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | Top1: {top1:.4f}'.format(
            batch=batch_idx + 1,
            size=len(train_loader),
            data=data_time.val,
            bt=batch_time.val,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg)
        bar.next()
    bar.finish()
    return (losses.avg, top1.avg)
def validation(val_loader, model, criterion, use_cuda):
    """One validation epoch with a progress bar; returns (avg loss, avg top-1)."""
    # Switch to evaluate mode
    model.eval()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    end = time.time()
    bar = Bar('Processing', max=len(val_loader))
    for batch_idx, (inputs, targets) in enumerate(val_loader):
        # Measure data loading time
        data_time.update(time.time() - end)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        # inputs, targets = V(inputs, volatile=True), V(targets.squeeze(1), volatile=True)
        # UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.
        with torch.no_grad():
            # Drop the singleton label dimension so criterion sees shape (N,).
            targets = targets.squeeze(1)
            # Compute output
            outputs = model(inputs)
            loss = criterion(outputs, targets)
        # Measure accuracy and record loss
        prec1 = accuracy(outputs.data, targets.data, topk=(1,))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1[0], inputs.size(0))
        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # Plot progress
        bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f}'.format(
            batch=batch_idx + 1,
            size=len(val_loader),
            data=data_time.avg,
            bt=batch_time.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
        )
        bar.next()
    bar.finish()
    return (losses.avg, top1.avg)
def train(train_loader, model, criterion, optimizer, epoch):
    """One distributed training epoch using a DataPrefetcher.

    Metrics are only computed/reduced every `args.log_interval` steps to avoid
    per-batch host<->device synchronization. Returns nothing.
    """
    losses = misc.AverageMeter()
    top1 = misc.AverageMeter()
    top5 = misc.AverageMeter()
    # switch to train mode
    prefetcher = datasets.DataPrefetcher(train_loader)
    model.train()
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1
        output = model(input)
        loss = criterion(output, target)
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % args.log_interval == 0:
            prec1, prec5 = misc.accuracy(output.data, target, topk=(1, 5))
            # Average loss and accuracy across processes for logging
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
            # to_python_float incurs a host<->device sync
            losses.update(to_python_float(reduced_loss), input.size(0))
            top1.update(to_python_float(prec1), input.size(0))
            top5.update(to_python_float(prec5), input.size(0))
            torch.cuda.synchronize()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.10f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), loss=losses, top1=top1,
                      top5=top5))
        input, target = prefetcher.next()
def train_step(model, optimizer, images, label, train=True):
    """Run one forward pass; when `train` is True also backprop and update.

    Prints the mean cross-entropy loss and accuracy for this batch.
    """
    with tf.GradientTape() as tape:
        pred = model(images, train)
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label, logits=pred)
        mean_loss = tf.reduce_mean(per_example_loss)
        accu = accuracy(pred, label)
        print("Current Training Loss : %f, Accuracy : %f" % (mean_loss, accu))
    if train:
        grads = tape.gradient(per_example_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
def test_wrapper(data_loader, data_size, model, use_gpu=True):
    """Evaluate `model` on data_loader['test'], logging top-1/top-5 accuracy.

    `data_size` is accepted for interface compatibility but is not used here.
    """
    model.eval()
    accumulator = inst_meter_dict(
        ['top_1_acc', 'top_5_acc', 'data_time', 'batch_time'])
    tic = time.time()
    with no_grad():  # close all grads, operations inside don't track history
        toc = time.time()
        for batch_index, (inputs, labels) in enumerate(data_loader['test']):
            batch_size = inputs.size(0)
            accumulator['data_time'].update(time.time() - toc)
            if use_gpu:
                try:
                    # BUG FIX: `async=True` is a SyntaxError on Python 3.7+
                    # (`async` became a keyword); PyTorch's replacement kwarg
                    # is `non_blocking`.
                    inputs, labels = Variable(inputs.float().cuda()), Variable(
                        labels.long().cuda(non_blocking=True))
                except Exception:
                    # BUG FIX: was `logging.error(inputs, labels)`, which treats
                    # `inputs` as a %-format string. Log both values explicitly.
                    logging.error('inputs: %s, labels: %s', inputs, labels)
            else:
                inputs, labels = Variable(inputs), Variable(labels)
            outputs = model(inputs)
            acc_1, acc_5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            accumulator['top_1_acc'].update(acc_1.item(), batch_size)
            accumulator['top_5_acc'].update(acc_5.item(), batch_size)
            accumulator['batch_time'].update(time.time() - toc)
            toc = time.time()
            if (batch_index + 1) % 10 == 0:
                logging.info('[{}/{}] data: {:.4f}s | batch: {:.4f}s'.format(
                    batch_index + 1, len(data_loader["test"]),
                    accumulator['data_time'].val,
                    accumulator['batch_time'].val))
                # Reset the timing meters so each report covers 10 batches.
                accumulator['data_time'].reset()
                accumulator['batch_time'].reset()
    logging.info('top-1: {:.4f} | top-5: {:.4f} | time: {:.4f}'.format(
        accumulator['top_1_acc'].avg, accumulator['top_5_acc'].avg,
        time.time() - tic))
loss_ce = criterion(output, target) loss_reg = args.lambd * (torch.cat(gates_params).abs().mean() - args.sparsity_level)**2 loss = loss_ce + loss_reg loss.backward() optimizer.step() for p in gates_params: p.data.clamp_(0, 1) if i % args.log_interval == 0: concat_channels = torch.cat(gates_params) sparsity = (concat_channels != 0).float().mean() mean_gate = concat_channels.mean() prec1, prec5 = misc.accuracy(output, target, topk=(1, 5)) top1.update(prec1.item(), data.size(0)) top5.update(prec5.item(), data.size(0)) print( 'Train Iter [%d/%d]\tLoss: %.4f, Loss_CE: %.4f, Loss_REG: %.4f, ' 'Sparsity: %.4f, Mean gate: %.4f, Top1 acc: %.4f, Top5 acc: %.4f' % (i, len(train_loader), loss.item(), loss_ce.item(), loss_reg.item(), sparsity.item(), mean_gate.item(), top1.avg, top5.avg)) if i % args.eval_interval == 0 and i > 0: acc, test_sparsity = test() if test_sparsity <= args.sparsity_level and acc > best_acc: best_acc = acc torch.save(model.state_dict(),
def train(train_loader, model, criterion, optimizer, epoch, ows_state, args):
    """One training epoch with the sandwich rule (max / min / random width
    configurations) plus self-distillation, feeding per-image reward signals
    back into the OWS path-selection state. Returns a dict of per-step history.
    """
    meters = defaultdict(misc.AverageMeter)
    model.train()
    # Width-configurable layers live on the bare model or under DataParallel.
    filters = model.filters if hasattr(model, 'filters') else model.module.filters
    history = defaultdict(list)
    end = time.time()
    for iteration, (input, target) in enumerate(train_loader):
        if "mini" in args.debug and iteration > 20:
            break
        # Ask the OWS solver for the current best path / annealing state.
        best_path, temperature, gamma_max, best_perf, timing = solve_ows(
            model, epoch, len(train_loader), iteration, ows_state, args)
        # measure data loading time
        meters["data_time"].update(time.time() - end)
        if not args.no_cuda:
            target = target.cuda(non_blocking=True)
        compute_results = misc.SWDefaultDict(misc.SWDict)
        # Smallest and largest width configuration for every filter.
        minconf = [F.configurations[0] for F in filters]
        maxconf = [F.configurations[-1] for F in filters]
        optimizer.zero_grad()
        # sandwich rule: train maximum configuration
        outp = model(input, configuration=maxconf)
        loss = criterion(outp['x'], target)
        loss.mean().backward()
        compute_results['max']['x'] = outp['x'].detach()
        compute_results['max']['loss_numpy'] = loss.detach().cpu().numpy()
        # Teacher distribution for self-distillation of the smaller configs.
        compute_results['max']['prob'] = torch.nn.functional.softmax(
            compute_results['max']['x'], dim=1)
        # sandwich rule: train minimum and random configuration with self-distillation
        for kind in ('min', 'rand'):
            # configuration=None lets the model sample a random path.
            conf = None if kind == 'rand' else minconf
            outp = model(input, configuration=conf)
            loss = misc.soft_cross_entropy(
                outp['x'], compute_results['max']['prob'].detach())
            compute_results[kind]['soft_loss_numpy'] = loss.detach().cpu().numpy()
            with torch.no_grad():
                # Hard (ground-truth) loss is tracked for logging only.
                hard_loss_numpy = criterion(outp['x'], target).detach().cpu().numpy()
                compute_results[kind]['loss_numpy'] = hard_loss_numpy
                compute_results[kind]['x'] = outp['x'].detach()
                if kind == 'rand':
                    compute_results['rand']['decision'] = outp['decision'].cpu().numpy()
            loss.mean().backward()
        # Per-image reward: how much worse the sampled path is vs. the max
        # config, credited equally to every per-layer decision on the path.
        for path, image_loss, image_refloss in zip(
                compute_results['rand']['decision'],
                compute_results['rand']['loss_numpy'],
                compute_results['max']['loss_numpy']):
            for i, pi in enumerate(path):
                ows_state.histories[i][pi].update(
                    -(image_loss - image_refloss) / len(path), epoch, iteration)
        for refname in ('min', 'max', 'rand'):
            # NOTE(review): `kind` here is left over from the previous loop and
            # is always 'rand' — this updates meters['loss_rand'] three times
            # per step; `refname` was probably intended. Left unchanged.
            meters['loss_' + kind].update(
                compute_results[kind]['loss_numpy'].mean(), input.size(0))
            refloss = compute_results[refname]['loss_numpy']
            (prec1, prec5), refcorrect_ks = misc.accuracy(
                compute_results[refname]['x'].data, target, topk=(1, 5),
                return_correct_k=True)
            refcorrect1, refcorrect5 = [
                a.cpu().numpy().astype(bool) for a in refcorrect_ks]
            history['loss_' + refname].append(refloss)
            history['top1_' + refname].append(refcorrect1)
            history['top5_' + refname].append(refcorrect5)
            meters['top1_' + refname].update(prec1.item(), input.size(0))
            meters['top5_' + refname].update(prec5.item(), input.size(0))
            if 'soft_loss_numpy' in compute_results[refname]:
                # NOTE(review): same stale `kind` issue as above — meter key and
                # value use 'rand' regardless of `refname`. Left unchanged.
                meters['loss_soft_' + kind].update(
                    compute_results[kind]['soft_loss_numpy'].mean(), input.size(0))
                history['loss_soft_' + refname].append(
                    compute_results[refname]['soft_loss_numpy'])
        # NOTE(review): both the decisions and the losses are appended under
        # the same 'configuration' key (alternating entries) — possibly the
        # second append was meant for a separate key. Left unchanged.
        history['configuration'].append(compute_results['rand']['decision'])
        history['configuration'].append(compute_results['rand']['loss_numpy'])
        # Single optimizer step applies the accumulated max/min/rand gradients.
        optimizer.step()
        # measure elapsed time
        meters["batch_time"].update(time.time() - end)
        end = time.time()
        if iteration % args.print_freq == 0:
            toprint = f"Epoch: [{epoch}][{iteration}/{len(train_loader)}]\t"
            toprint += ('Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Prec@1 {top1_rand.val:.3f} ({top1_rand.avg:.3f})\t'
                        'Prec@5 {top5_rand.val:.3f} ({top5_rand.avg:.3f})\t'.format(**meters))
            for key, meter in meters.items():
                if key.startswith('loss'):
                    toprint += f'{key} {meter.val:.4f} ({meter.avg:.4f})\t'
            logger.info(toprint)
            # prints a string summarizing the sampling probabilities for each filter
            probas_str = ""
            for i, F in enumerate(filters):
                if F.probability is not None:
                    probas_str += '|{} '.format(i)
                    for p in F.probability:
                        probas_str += str(int(100 * p)) + ' '
            probas_log = None
            if any(F.probability is not None for F in filters):
                # NOTE(review): trailing comma makes probas_log a 1-tuple
                # wrapping the tuple of probabilities — possibly unintentional.
                probas_log = tuple(F.probability for F in filters),
            history['OWS'].append(dict(best_path=best_path,
                                       temperature=temperature,
                                       gamma_max=gamma_max,
                                       best_pref=best_perf,
                                       pred_latency=timing,
                                       probas_log=probas_log))
            if probas_str:
                probas_str = '\n' + probas_str
            ows_str = f"predicted latency: {timing}, perf: {best_perf}, T: {temperature}, gamma: {gamma_max}"
            logger.info('best_path: ' + ','.join(map(str, best_path)) + ows_str + probas_str)
    return history
def generic_train(data_loader,
                  data_size,
                  model,
                  criterion,
                  optimizer,
                  lr_scheduler,
                  max_epoch=100,
                  use_gpu=True,
                  pre_eval=False):
    """Full train/validate loop with optional mix-up and periodic checkpoints.

    Expects data_loader to be a dict with 'train' and 'dev' loaders. Returns 0
    on success (and on the early-exit error paths, matching the original
    contract). Fixes applied: `.cuda(async=True)` -> `.cuda(non_blocking=True)`
    (SyntaxError on Python 3.7+), `xrange` -> `range` (Python 3), and bare
    `except:` -> `except Exception:`.
    """
    tic = time.time()
    best_model = model  # kept for interface parity; see commented deepcopy below
    best_acc = 0.0
    temporary = inst_meter_dict(
        ['batch_time', 'data_time', 'losses', 'top_1_acc', 'top_5_acc'])
    accumulator = inst_meter_dict(['losses', 'top_1_acc', 'top_5_acc'])
    # pre-evaluation phase to check cuda memory
    if pre_eval:
        logging.info('Validation [0/{}]:'.format(max_epoch))
        model.eval()
        toc = time.time()
        with no_grad(
        ):  # close all grads, operations inside don't track history
            batch_size = 0
            for batch_index, (inputs, labels) in enumerate(data_loader['dev']):
                if batch_size == 0:
                    batch_size = inputs.size(0)
                temporary['data_time'].update(time.time() - toc)
                # wrap in Variable
                if use_gpu:
                    try:
                        inputs, labels = Variable(
                            inputs.float().cuda()), Variable(
                                labels.long().cuda(non_blocking=True))
                    except Exception:
                        logging.error('inputs: %s, labels: %s', inputs, labels)
                else:
                    inputs, labels = Variable(inputs), Variable(labels)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                acc_1, acc_5 = accuracy(outputs.data,
                                        labels.data,
                                        topk=(1, cfg.TRAIN.METRICS_TOP_K_ACC))
                accumulator['losses'].update(loss.item(), batch_size)
                accumulator['top_1_acc'].update(acc_1.item(), batch_size)
                accumulator['top_5_acc'].update(acc_5.item(), batch_size)
        logging.info(
            '[{}/{}] loss: {:.4f} | top-1: {:.4f} | top-5: {:.4f}'.format(
                0, max_epoch, accumulator['losses'].avg,
                accumulator['top_1_acc'].avg, accumulator['top_5_acc'].avg))
        logging.info(
            'Pre-evaluation done, validation batch-size:{}, everything is ok'
            .format(batch_size))
    # training and validation
    for epoch in range(max_epoch):
        is_best = False
        use_mixup = cfg.TRAIN.MIXUP
        if use_mixup:
            logging.info('Mix-up used during training')
            # Mix-up is only active inside the configured epoch window.
            if epoch not in range(cfg.TRAIN.MU.ACTIVE_EPOCH_RANGE[0],
                                  cfg.TRAIN.MU.ACTIVE_EPOCH_RANGE[1]):
                use_mixup = False
                logging.info('Mix-up switch OFF')
            else:
                logging.info('Mix-up switch ON')
        # Each epoch has a training and validation phase
        # ---- training phase ----
        optimizer = update_lr(optimizer, epoch, lr_scheduler)
        logging.info('Training epoch [{}/{}]: learning rate {}'.format(
            epoch + 1, max_epoch, optimizer.param_groups[0]['lr']))
        model.train()  # Set model to training mode
        # Iterate over data.
        toc = time.time()
        for batch_index, (inputs, labels) in enumerate(data_loader["train"]):
            batch_size = inputs.size(0)
            temporary['data_time'].update(time.time() - toc)
            # wrap in Variable
            if use_gpu:
                try:
                    inputs, labels = Variable(inputs.float().cuda()), Variable(
                        labels.long().cuda(non_blocking=True))
                    if use_mixup:
                        inputs, targets_a, targets_b, lam = mixup_data(
                            inputs, labels, cfg.TRAIN.MU.ALPHA)
                        inputs, targets_a, targets_b = map(
                            Variable, (inputs, targets_a, targets_b))
                except Exception:
                    logging.error('\n==> inputs:\n{}\n==> labels:\n{}'.format(
                        inputs, labels))
                    return 0
            else:
                inputs, labels = Variable(inputs), Variable(labels)
            # Set gradient to zero to delete history of computations in previous epoch. Track operations so that differentiation can be done automatically.
            optimizer.zero_grad()
            outputs = model(inputs)
            if use_mixup:
                loss = mixup_criterion(criterion, outputs, targets_a,
                                       targets_b, lam)
                acc_1, acc_5 = mixup_accuracy(
                    outputs.data,
                    targets_a,
                    targets_b,
                    lam,
                    topk=(1, cfg.TRAIN.METRICS_TOP_K_ACC))
            else:
                loss = criterion(outputs, labels)
                acc_1, acc_5 = accuracy(outputs.data,
                                        labels.data,
                                        topk=(1, cfg.TRAIN.METRICS_TOP_K_ACC))
            # losses.update(loss.data[0], inputs.size(0))
            temporary['losses'].update(loss.item(), batch_size)
            temporary['top_1_acc'].update(acc_1.item(), batch_size)
            temporary['top_5_acc'].update(acc_5.item(), batch_size)
            accumulator['losses'].update(loss.item(), batch_size)
            accumulator['top_1_acc'].update(acc_1.item(), batch_size)
            accumulator['top_5_acc'].update(acc_5.item(), batch_size)
            # backward + optimize only if in training phase
            optimizer.zero_grad()  # redundant (already zeroed above) but harmless
            loss.backward()
            optimizer.step()
            # print evaluation statistics
            temporary['batch_time'].update(time.time() - toc)
            toc = time.time()
            if (batch_index + 1) % cfg.TRAIN.LOG_INTERVAL == 0:
                logging.info(
                    '[{}/{}] [{}/{}] data: {:.4f}s | batch: {:.4f}s | loss: {:.4f} | top-1: {:.4f} | top-5: {:.4f}'
                    .format(
                        epoch + 1,
                        max_epoch,
                        batch_index + 1,
                        len(data_loader["train"]),
                        temporary['data_time'].val,
                        temporary['batch_time'].val,
                        temporary['losses'].avg,
                        temporary['top_1_acc'].avg,
                        temporary['top_5_acc'].avg,
                    ))
                temporary['data_time'].reset()
                temporary['batch_time'].reset()
                temporary['losses'].reset()
                temporary['top_1_acc'].reset()
                temporary['top_5_acc'].reset()
        logging.info(
            '[{}/{}] loss: {:.4f} | top-1: {:.4f} | top-5: {:.4f}'.format(
                epoch + 1, max_epoch, accumulator['losses'].avg,
                accumulator['top_1_acc'].avg, accumulator['top_5_acc'].avg))
        accumulator['losses'].reset()
        accumulator['top_1_acc'].reset()
        accumulator['top_5_acc'].reset()
        # -------------------------
        # ---- validation phase ----
        logging.info('Validation [{}/{}]:'.format(epoch + 1, max_epoch))
        model.eval()
        # Iterate over data.
        toc = time.time()
        with no_grad(
        ):  # close all grads, operations inside don't track history
            for batch_index, (inputs, labels) in enumerate(data_loader["dev"]):
                batch_size = inputs.size(0)
                temporary['data_time'].update(time.time() - toc)
                # wrap in Variable
                if use_gpu:
                    try:
                        inputs, labels = Variable(
                            inputs.float().cuda()), Variable(
                                labels.long().cuda(non_blocking=True))
                    except Exception:
                        logging.error(
                            '\n==> inputs:\n{}\n==> labels:\n{}'.format(
                                inputs, labels))
                        return 0
                else:
                    inputs, labels = Variable(inputs), Variable(labels)
                optimizer.zero_grad()  # no backward pass here; kept for parity
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                acc_1, acc_5 = accuracy(outputs.data,
                                        labels.data,
                                        topk=(1, cfg.TRAIN.METRICS_TOP_K_ACC))
                # losses.update(loss.data[0], inputs.size(0))
                temporary['losses'].update(loss.item(), batch_size)
                temporary['top_1_acc'].update(acc_1.item(), batch_size)
                temporary['top_5_acc'].update(acc_5.item(), batch_size)
                accumulator['losses'].update(loss.item(), batch_size)
                accumulator['top_1_acc'].update(acc_1.item(), batch_size)
                accumulator['top_5_acc'].update(acc_5.item(), batch_size)
        # check if current model is best
        logging.info('Current validation accuracy: {:.4f}'.format(
            accumulator['top_1_acc'].avg))
        if accumulator['top_1_acc'].avg > best_acc:
            is_best = True
            best_acc = accumulator['top_1_acc'].avg
            # best_model = copy.deepcopy(model)
            logging.info('New best accuracy: {:.4f}'.format(best_acc))
        accumulator['losses'].reset()
        accumulator['top_1_acc'].reset()
        accumulator['top_5_acc'].reset()
        # --------------------------
        # ---- save checkpoint ----
        if (epoch + 1) % cfg.TRAIN.SAVE_INTERVAL == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'acc': accumulator['top_1_acc'].avg,
                    # 'best_acc': best_acc,
                    'optimizer': optimizer.state_dict()
                },
                cfg.TRAIN.OUTPUT_MODEL_PREFIX,
                is_best=is_best)
            logging.info('Checkpoint saved to {}-{:0>4}.pth.tar'.format(
                cfg.TRAIN.OUTPUT_MODEL_PREFIX, epoch + 1))
        # ------------------------
    time_elapsed = int(time.time() - tic)
    logging.info('Training job complete in {:d}:{:0>2d}:{:d}'.format(
        time_elapsed // 3600,
        (time_elapsed - 3600 * (time_elapsed // 3600)) // 60,
        (time_elapsed - 60 * (time_elapsed // 60))))
    logging.info('Best val Acc: {:4f}'.format(best_acc))
    return 0
def classify(self, x, y, train=True, mode='base'):
    """Forward `x` through the network, then return (cross-entropy vs y, accuracy)."""
    logits = self.__call__(x, train=train, mode=mode)
    return softmax_cross_entropy(logits, y), accuracy(logits, y)