def run_training(args):
    writer_path = os.path.join(
        'runs', args.exp_desc + '-' + time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime()))
    writer = SummaryWriter(writer_path)

    signsgd_config = {
        'num_bits': args.num_bits,
        'num_bits_weight': args.num_bits_weight,
        'num_bits_grad': args.num_bits_grad,
        'biprecision': args.biprecision,
        'predictive_forward': args.predictive_forward,
        'predictive_backward': args.predictive_backward,
        'msb_bits': args.msb_bits,
        'msb_bits_weight': args.msb_bits_weight,
        'msb_bits_grad': args.msb_bits_grad,
        'threshold': args.threshold,
        'sparsify': args.sparsify,
        'sign': args.sign,
        'writer': writer,
    }

    # create model
    model = models.__dict__[args.arch](args.pretrained, **signsgd_config)
    model.install_gate()
    model = torch.nn.DataParallel(model).cuda()
    best_prec1 = 0

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info('=> loading checkpoint `{}`'.format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_iter = 0
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('=> loaded checkpoint `{}` (iter: {})'.format(
                args.resume, checkpoint['iter']))
        else:
            logging.info('=> no checkpoint found at `{}`'.format(args.resume))

    cudnn.benchmark = True

    train_loader = prepare_train_data(dataset=args.dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=args.workers)
    test_loader = prepare_test_data(dataset=args.dataset,
                                    batch_size=args.batch_size,
                                    shuffle=False,
                                    num_workers=args.workers)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    cp_energy_record = AverageMeter()
    skip_ratios = ListAverageMeter()

    end = time.time()
    dataloader_iterator = iter(train_loader)
    for i in range(0, args.iters):
        rand_flag = random.uniform(0, 1) > 0.5
        model.train()
        adjust_learning_rate(args, optimizer, i)

        try:
            input, target = next(dataloader_iterator)
        except StopIteration:
            dataloader_iterator = iter(train_loader)
            input, target = next(dataloader_iterator)

        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda()
        input_var = Variable(input, requires_grad=True).cuda()
        target_var = Variable(target).cuda()

        # randomly skip roughly half of the iterations
        if rand_flag:
            optimizer.zero_grad()
            global skip_count
            skip_count += 1
            continue

        # compute output
        output, masks, _, has_ds = model(input_var)

        # per-layer energy weights: layers with downsampling are weighted 0.75
        energy_parameter = np.ones(len(masks),)
        for iii, flag in enumerate(has_ds):
            if flag:
                energy_parameter[iii] = 0.75
        energy_parameter /= energy_parameter.max()

        # compare the energy actually used (executed positions in the masks)
        # against the energy of executing every layer in full
        energy_cost = 0
        energy_all = 0
        for layer in range(len(energy_parameter)):
            energy_cost += masks[layer].sum() * energy_parameter[layer]
            energy_all += reduce((lambda x, y: x * y), masks[layer].shape) * energy_parameter[layer]
        cp_energy = (energy_cost.item() / energy_all.item()) * 100

        global training_cost
        training_cost += (cp_energy / 100) * 0.51 * args.batch_size
        energy_cost *= args.beta

        # reward (negative regularization) once the energy ratio drops below the target
        if cp_energy <= args.minimum:
            reg = -1
        else:
            reg = 1

        if args.energy:
            loss = criterion(output, target_var) + energy_cost * reg
        else:
            loss = criterion(output, target_var)

        # collect skip ratio of each layer
        skips = [mask.data.le(0.5).float().mean() for mask in masks]
        if skip_ratios.len != len(skips):
            skip_ratios.set_len(len(skips))

        # measure accuracy and record loss
        prec1, = accuracy(output.data, target, topk=(1,))
        writer.add_scalar('data/train_error', 100 - prec1, i - skip_count)
        writer.add_scalar('data/train_comp_using', cp_energy, i - skip_count)
        writer.add_scalar('data/train_cost_Gops', training_cost, i - skip_count)
        losses.update(loss.data.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        cp_energy_record.update(cp_energy, 1)
        skip_ratios.update(skips, input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # repackage hidden units for the RNN gate
        model.module.control.repackage_hidden()

        batch_time.update(time.time() - end)
        end = time.time()

        # print log
        if i % args.print_freq == 0 or i == (args.iters - 1):
            logging.info("Iter: [{0}/{1}]\t"
                         "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                         "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                         "Loss {loss.val:.3f} ({loss.avg:.3f})\t"
                         "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                         "Energy_ratio: {cp_energy_record.val:.3f} ({cp_energy_record.avg:.3f})\t".format(
                             i, args.iters,
                             batch_time=batch_time,
                             data_time=data_time,
                             loss=losses,
                             top1=top1,
                             cp_energy_record=cp_energy_record))

        # evaluate every args.eval_every iterations and at the last iteration
        if (i % args.eval_every == 0 and i > 0) or (i == (args.iters - 1)):
            prec1 = validate(args, test_loader, model, criterion)
            writer.add_scalar('data/test_error', 100 - prec1, i - skip_count)

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            checkpoint_path = os.path.join(args.save_path,
                                           'checkpoint_{:05d}.pth.tar'.format(i))
            save_checkpoint({
                'iter': i,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, filename=checkpoint_path)
            shutil.copyfile(checkpoint_path,
                            os.path.join(args.save_path, 'checkpoint_latest.pth.tar'))
def validate(args, test_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    skip_ratios = ListAverageMeter()
    cp_energy_record = AverageMeter()

    # switch to evaluation mode
    model.eval()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(test_loader):
            if i == len(test_loader) - 1:
                break
            target = target.cuda()
            input_var = Variable(input).cuda()
            target_var = Variable(target).cuda()

            # compute output
            output, masks, logprobs, has_ds = model(input_var)

            # per-layer energy weights: layers with downsampling are weighted 0.75
            energy_parameter = np.ones(len(masks),)
            for iii, flag in enumerate(has_ds):
                if flag:
                    energy_parameter[iii] = 0.75
            energy_parameter /= energy_parameter.max()

            energy_cost = 0
            energy_all = 0
            for layer in range(len(energy_parameter)):
                energy_cost += masks[layer].sum() * energy_parameter[layer]
                energy_all += reduce((lambda x, y: x * y), masks[layer].shape) * energy_parameter[layer]
            cp_energy = (energy_cost.item() / energy_all.item()) * 100

            # collect skip ratio of each layer
            skips = [mask.data.le(0.5).float().mean().item() for mask in masks]
            if skip_ratios.len != len(skips):
                skip_ratios.set_len(len(skips))

            loss = criterion(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))
            skip_ratios.update(skips, input.size(0))
            losses.update(loss.data.item(), input.size(0))
            batch_time.update(time.time() - end)
            cp_energy_record.update(cp_energy, 1)
            end = time.time()

            if i % args.print_freq == 0 or (i == (len(test_loader) - 1)):
                logging.info(
                    'Test: [{}/{}]\t'
                    'Time: {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
                    'Loss: {loss.val:.3f} ({loss.avg:.3f})\t'
                    'Prec@1: {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5: {top5.val:.3f} ({top5.avg:.3f})\t'
                    'Energy_ratio: {cp_energy_record.val:.3f} ({cp_energy_record.avg:.3f})\t'.format(
                        i, len(test_loader),
                        batch_time=batch_time,
                        loss=losses,
                        top1=top1,
                        top5=top5,
                        cp_energy_record=cp_energy_record))

    logging.info(' * Prec@1 {top1.avg:.3f}, Loss {loss.avg:.3f}'.format(
        top1=top1, loss=losses))

    skip_summaries = []
    for idx in range(skip_ratios.len):
        skip_summaries.append(1 - skip_ratios.avg[idx])
    # compute `computational percentage`
    cp = ((sum(skip_summaries) + 1) / (len(skip_summaries) + 1)) * 100
    logging.info('*** Computation Percentage: {:.3f} %'.format(cp))

    return top1.avg
def forward(data_loader, model, criterion, epoch, training, model_type,
            optimizer=None, writer=None):
    if training:
        model.train()
    else:
        model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()
    total_steps = len(data_loader)
    for i, (inputs, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        inputs = inputs.to('cuda:0')
        target = target.to('cuda:0')

        # compute output
        output = model(inputs)
        if model_type == 'int':
            # the integer model also returns the output exponent; rescale before the loss
            output, output_exp = output
            output = output.float()
            loss = criterion(output * (2 ** output_exp.float()), target)
        else:
            output_exp = 0
            loss = criterion(output, target)

        # measure accuracy and record loss
        losses.update(float(loss), inputs.size(0))
        prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
        top1.update(float(prec1), inputs.size(0))
        top5.update(float(prec5), inputs.size(0))

        if training:
            if model_type == 'int':
                model.backward(target)
            elif model_type == 'hybrid':
                # float backward
                optimizer.update(epoch, epoch * len(data_loader) + i)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # int8 backward
                model.backward()
            else:
                optimizer.update(epoch, epoch * len(data_loader) + i)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.log_interval == 0 and training:
            logging.info('{model_type} [{0}][{1}/{2}] '
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                         'Data {data_time.val:.2f} '
                         'loss {loss.val:.3f} ({loss.avg:.3f}) '
                         'e {output_exp:d} '
                         '@1 {top1.val:.3f} ({top1.avg:.3f}) '
                         '@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                             epoch, i, len(data_loader),
                             model_type=model_type,
                             batch_time=batch_time,
                             data_time=data_time,
                             loss=losses,
                             output_exp=output_exp,
                             top1=top1,
                             top5=top5))

            if args.grad_hist:
                # log weight-gradient histograms at every logging interval
                if args.model_type == 'int':
                    for idx, l in enumerate(model.forward_layers):
                        if hasattr(l, 'weight'):
                            grad = l.grad_int32acc
                            writer.add_histogram(
                                'Grad/' + l.__class__.__name__ + '_' + str(idx),
                                grad, epoch * total_steps + i)
                elif args.model_type == 'float':
                    for idx, l in enumerate(model.layers):
                        if hasattr(l, 'weight'):
                            writer.add_histogram(
                                'Grad/' + l.__class__.__name__ + '_' + str(idx),
                                l.weight.grad, epoch * total_steps + i)
                    for idx, l in enumerate(model.classifier):
                        if hasattr(l, 'weight'):
                            writer.add_histogram(
                                'Grad/' + l.__class__.__name__ + '_' + str(idx),
                                l.weight.grad, epoch * total_steps + i)

    return losses.avg, top1.avg, top5.avg