def train(train_queue, model, criterion, optimizer, epoch, init_lr,
          warmup_epochs, global_step):
    """Run one training epoch with linear lr warmup and an auxiliary head.

    Args:
        train_queue: iterable of (data, target) batches.
        model: network returning (logits, logits_aux).
        criterion: classification loss.
        optimizer: optimizer whose lr is overwritten during warmup.
        epoch: current epoch index (0-based).
        init_lr: target lr reached at the end of warmup.
        warmup_epochs: number of epochs over which lr ramps linearly.
        global_step: running step counter, incremented once per batch.

    Returns:
        (top1.avg, objs.avg, global_step) — epoch-average top-1 accuracy,
        epoch-average loss, and the updated step counter.
    """
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.train()

    # Hoisted out of the loop: the batch count is loop-invariant and only
    # needed by the warmup schedule.
    len_epoch = len(train_queue)

    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Linear warmup: scale lr from ~0 up to init_lr over warmup_epochs,
        # advancing a little every batch.
        if epoch < warmup_epochs:
            scale = float(1 + step + epoch * len_epoch) / \
                (warmup_epochs * len_epoch)
            lr = init_lr * scale
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Forward.
        optimizer.zero_grad()
        logits, logits_aux = model(data)
        loss = criterion(logits, target)
        if args.auxiliary:
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        ############# APEX #############
        # Calculate the accuracy; metrics are averaged across workers.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)
        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)
        ################################

        if step % args.report_freq == 0:
            # param_groups is already a list; read the lr directly instead of
            # re-wrapping it or materializing a full state_dict.
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
            writer.add_scalar('train/loss', objs.avg, global_step)
            writer.add_scalar('train/acc_top1', top1.avg, global_step)
            writer.add_scalar('train/acc_top5', top5.avg, global_step)
            writer.add_scalar('train/lr', current_lr, global_step)
        global_step += 1
    return top1.avg, objs.avg, global_step
def train(train_queue, model, criterion, optimizer, global_step):
    """One training epoch without warmup or auxiliary head.

    Returns (top1.avg, top5.avg, loss.avg, global_step).
    """
    loss_meter = utils.AverageMeter()
    acc1_meter = utils.AverageMeter()
    acc5_meter = utils.AverageMeter()
    model.train()

    for step, (data, target) in enumerate(train_queue):
        batch_size = data.size(0)
        data, target = data.cuda(), target.cuda()

        # Forward.
        optimizer.zero_grad()
        logits = model(data)
        loss = criterion(logits, target)

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # Record loss/accuracy, averaged across distributed workers.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)
        loss_meter.update(to_python_float(reduced_loss), batch_size)
        acc1_meter.update(to_python_float(prec1), batch_size)
        acc5_meter.update(to_python_float(prec5), batch_size)

        if (step + 1) % args.report_freq == 0:
            current_lr = list(optimizer.param_groups)[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, loss_meter.avg,
                         acc1_meter.avg, acc5_meter.avg, current_lr)
        global_step += 1

    return acc1_meter.avg, acc5_meter.avg, loss_meter.avg, global_step
def train(epoch):
    """Train the module-level `model` on `train_loader` for one epoch."""
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data = data.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

        # Only rank 0 prints, at the configured interval.
        should_log = (batch_idx % args.log_interval == 0
                      and args.local_rank == 0)
        if should_log:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                to_python_float(loss.data)))
def test():
    """Evaluate the module-level `model` on `test_loader`.

    Prints the per-sample average NLL loss and the top-1 accuracy over the
    whole test set.
    """
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        with torch.no_grad():
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            # The deprecated Variable(...) wrapper was a no-op here (torch
            # >= 0.4, which this file already requires) and has been removed.
            output = model(data)
            # Sum (not average) the batch losses so the final division by the
            # dataset size yields a correct per-sample mean; reduction='sum'
            # replaces the deprecated size_average=False.
            test_loss += to_python_float(
                F.nll_loss(output, target, reduction='sum').data)
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
def train(train_loader, model, criterion, optimizer, epoch, use_cuda, logger):
    """Train `model` for one epoch with progress-bar reporting.

    Uses the module-level meters `batch_time_global` / `data_time_global`
    (skipping epoch 0, presumably to exclude warmup/compile overhead — NOTE
    (review): confirm the intent).

    Returns (losses.avg, top1.avg, top5.avg) — plus `iter_count` when a
    scheduled-restart optimizer ('srsgd'/'sradam'/'sradamw'/'srradam') is used.
    """
    global batch_time_global, data_time_global
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    train_loader_len = len(train_loader)
    # print('Length of train loader = %i\n'%train_loader_len)
    bar = Bar('Processing', max=train_loader_len)
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        data_time_lap = time.time() - end
        data_time.update(data_time_lap)
        if epoch > 0:
            data_time_global.update(data_time_lap)
        n = inputs.size(0)
        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        # print('input size = %i, device %s\n'%(inputs.size(0), inputs.device))

        # compute output
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward and step.
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss (all-reduced across workers)
        prec1, prec5 = accuracy(outputs, targets, topk=(1, 5))
        reduced_loss = reduce_tensor(loss.data, args.world_size)
        prec1 = reduce_tensor(prec1, args.world_size)
        prec5 = reduce_tensor(prec5, args.world_size)
        losses.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)

        # for restarting: scheduled-restart optimizers expose update_iter().
        # NOTE(review): iter_count stays unbound if train_loader is empty and
        # one of these optimizers is selected — the final return would raise.
        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            iter_count, iter_total = optimizer.update_iter()

        # measure elapsed time
        batch_time_lap = time.time() - end
        batch_time.update(batch_time_lap)
        if epoch > 0:
            batch_time_global.update(batch_time_lap)
        end = time.time()

        # plot progress
        bar.suffix = '(Epoch {epoch}, {batch}/{size}) Data: {data:.3f}s/{data_global:.3f}s | Batch: {bt:.3f}s/{bt_global:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            epoch=epoch,
            batch=batch_idx + 1,
            size=train_loader_len,
            data=data_time.val,
            data_global=data_time_global.avg,
            bt=batch_time.val,
            bt_global=batch_time_global.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
            top5=top5.avg,
        )
        bar.next()
        # Mirror the progress line into the text logger on rank 0 only.
        if args.local_rank == 0:
            logger.file.write(bar.suffix)
    bar.finish()
    if args.optimizer.lower(
    ) == 'srsgd' or args.optimizer.lower() == 'sradam' or args.optimizer.lower(
    ) == 'sradamw' or args.optimizer.lower() == 'srradam':
        return (losses.avg, top1.avg, top5.avg, iter_count)
    else:
        return (losses.avg, top1.avg, top5.avg)
def step(self,
         input_valid,
         target_valid,
         global_step,
         weights,
         input_valid2=None,
         target_valid2=None,
         model_opt=None):
    """Optimizer for the architecture params.

    One meta-optimization step on the architecture distribution `self.alpha`.
    Three estimators are supported via `self.meta_loss`:
      - 'default':   direct differentiable loss from self.training_obj.
      - 'rebar':     REBAR-style estimator — discrete-weight loss minus a
                     continuous-relaxation baseline, score-function term
                     log_prob * reward.
      - 'reinforce': REINFORCE with an exponential-moving-average baseline.
    When `self.latency_cost` is set, an additional latency penalty is added
    (surrogate-predicted under 'rebar', measured under 'reinforce').
    """
    self.arch_optimizer.zero_grad()
    if self.meta_loss == 'default':
        loss, accuracy, loss1, loss2 = self.training_obj(
            input_valid, target_valid, weights, model_opt, input_valid2,
            target_valid2, global_step)
        loss, loss1, loss2 = torch.mean(loss), torch.mean(
            loss1), torch.mean(loss2)
    elif self.meta_loss == 'rebar':
        # compute loss with discrete weights
        with torch.no_grad():
            disc_weights = {
                'normal': weights['dis_normal'],
                'reduce': weights['dis_reduce']
            }
            loss_disc, accuracy, loss1, loss2 = self.training_obj(
                input_valid, target_valid, disc_weights, model_opt,
                input_valid2, target_valid2, global_step)
        # compute baseline (continuous relaxation, with gradients)
        loss_cont, _, _, _ = self.training_obj(input_valid, target_valid,
                                               weights, model_opt,
                                               input_valid2, target_valid2,
                                               global_step)
        # Score-function term: reward is the (detached) advantage of the
        # discrete sample over the continuous baseline.
        reward = (loss_disc - loss_cont).detach()
        log_q_d = self.alpha.module.log_prob(weights)
        loss = torch.mean(log_q_d * reward) + torch.mean(loss_cont)
        loss1, loss2 = torch.mean(loss1), torch.mean(loss2)

        if self.latency_cost:
            # train the surrogate function initially.
            if self.surrogate_not_train:
                self.train_surrogate(input_valid)
            # sample a single architecture sample
            weight_lat = self.alpha(1)
            disc_weights_lat = {
                'normal': weight_lat['dis_normal'],
                'reduce': weight_lat['dis_reduce']
            }
            # compute latency for the discrete weights.
            elapsed_time = self.compute_latency(input_valid, disc_weights_lat)
            # latency prediction for continuous weights
            self.surrogate.eval()
            alphas = self.alpha.module.get_arch_sample(weight_lat)
            latency_cont = self.surrogate(alphas)
            # latency prediction for discrete weights
            alphas = self.alpha.module.get_arch_sample(disc_weights_lat)
            latency_discrete = self.surrogate(alphas)
            # L1 error of the surrogate vs the measured latency.
            surrogate_loss = torch.mean(
                torch.abs(elapsed_time - latency_discrete.squeeze(1)))
            # Latency coefficient is ramped linearly over latency_iter steps.
            self.latency_coeff_curr = self.latency_coeff * max(
                min(global_step / self.args.latency_iter, 1.0), 0.)
            # Hinge penalties: only latency above target_latency is punished.
            loss_disc_lat = self.latency_coeff_curr * torch.relu(
                torch.Tensor([elapsed_time]).cuda() - self.target_latency)
            loss_cont_lat = self.latency_coeff_curr * torch.relu(
                latency_cont[0] - self.target_latency)
            # collect latency information
            self.latency_pred_loss.update(
                utils.reduce_tensor(surrogate_loss.data,
                                    self.args.world_size))
            self.latency_value.update(elapsed_time)
            self.latency_actual.append(elapsed_time)
            self.latency_estimate.append(
                latency_discrete.squeeze(1).data.cpu().numpy()[0])
            if global_step % 50 == 0:
                self.logging.info('latency_pred_loss %f' % np.mean(
                    np.abs(
                        np.array(self.latency_actual)[-50:] -
                        np.array(self.latency_estimate)[-50:])))
            # saving some latency info
            if global_step % 1000 == 100 and self.args.local_rank == 0:
                import pickle
                print('saving')
                with open(os.path.join(self.args.save, 'latency.pkl'),
                          'wb') as f:
                    pickle.dump([
                        self.latency_actual, self.latency_estimate,
                        global_step
                    ], f)
            # REBAR term for the latency objective, added to the main loss.
            reward = (loss_disc_lat - loss_cont_lat).detach()
            log_q_d = self.alpha.module.log_prob(weight_lat)
            loss = loss + torch.mean(
                log_q_d * reward) + torch.mean(loss_cont_lat)
    elif self.meta_loss == 'reinforce':
        # compute loss with discrete weights
        with torch.no_grad():
            disc_weights = self.alpha.module.discretize(weights)
            loss_disc, accuracy, loss1, loss2 = self.training_obj(
                input_valid, target_valid, disc_weights, model_opt,
                input_valid2, target_valid2, global_step)
            reduce_loss_disc = utils.reduce_tensor(loss_disc.data,
                                                   self.args.world_size)
            avg = torch.mean(reduce_loss_disc).detach()
        # EMA baseline for variance reduction.
        baseline = self.exp_avg1.avg
        # update the moving average
        self.exp_avg1.update(avg)
        reward = (loss_disc - baseline).detach()
        log_q_d = self.alpha.module.log_prob(weights)
        # NOTE: adding the constant baseline leaves gradients unchanged; it
        # only shifts the reported loss value.
        loss = torch.mean(log_q_d * reward) + baseline
        loss1, loss2 = torch.mean(loss1), torch.mean(loss2)
        if self.latency_cost:
            weight_lat = self.alpha(1)
            # NOTE(review): discretize(weights) here, while log_prob below
            # uses weight_lat — the rebar branch discretizes the freshly
            # sampled weight_lat instead. Possibly intentional, verify.
            disc_weights_lat = self.alpha.module.discretize(weights)
            elapsed_time = self.compute_latency(input_valid,
                                                disc_weights_lat)
            self.latency_coeff_curr = self.latency_coeff * min(
                global_step / self.args.latency_iter, 1.0)
            loss_disc_lat = self.latency_coeff_curr * elapsed_time
            self.latency_value.update(elapsed_time)
            # Separate EMA baseline for the latency reward.
            baseline = self.exp_avg2.avg
            # update the moving average
            self.exp_avg2.update(float(loss_disc_lat))
            reward = loss_disc_lat - baseline
            log_q_d = self.alpha.module.log_prob(weight_lat)
            loss = loss + torch.mean(log_q_d * reward) + baseline
            loss1, loss2 = torch.mean(loss1), torch.mean(loss2)

    # Entropy of the current architecture distribution (logged below).
    entropy_loss = self.alpha.module.entropy_loss(weights)

    # Backward pass and update.
    loss.backward()
    self.arch_optimizer.step()

    # Logging.
    reduced_loss = utils.reduce_tensor(loss.data, self.args.world_size)
    accuracy = utils.reduce_tensor(accuracy, self.args.world_size)
    self.loss.update(to_python_float(reduced_loss), 1)
    self.accuracy.update(to_python_float(accuracy), 1)
    self.count += 1
    if self.count % self.report_freq == 0:
        self.logging.info('Meta Loss:%s %03d %e %f', self.meta_loss,
                          self.count, self.loss.avg, self.accuracy.avg)
        self.writer.add_scalar('meta/loss', self.loss.avg, global_step)
        self.writer.add_scalar('meta/acc', self.accuracy.avg, global_step)
        self.writer.add_scalar(
            'meta/lr',
            self.arch_optimizer.state_dict()['param_groups'][0]['lr'],
            global_step)
        self.writer.add_scalar('meta/entropy', entropy_loss, global_step)
        if self.gen_error_alpha:
            self.writer.add_scalar('meta/loss_val', loss1, global_step)
            self.writer.add_scalar('meta/loss_cov', loss2, global_step)
            self.writer.add_scalar('meta/loss_diff_sign',
                                   self.loss_diff_sign.avg, global_step)
        if self.latency_cost:
            self.writer.add_scalar('meta/latency_time',
                                   self.latency_value.avg, global_step)
            self.writer.add_scalar('meta/latency_prediction_loss',
                                   self.latency_pred_loss.avg, global_step)
            self.writer.add_scalar('meta/latency_coeff',
                                   self.latency_coeff_curr, global_step)