def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters, criterions, end):
    top1_meter, top5_meter, loss_meter, data_time = meters
    criterion = criterions[0]
    world_size = dist.get_world_size()

    lr_scheduler.step(self.cur_step)
    self.cur_step += 1
    data_time.update(time.time() - end)

    self.model.zero_grad()
    out = self.model(x)
    loss = criterion(out, y)
    # scale by the world size so that summing gradients across ranks averages them
    loss /= world_size

    top1, top5 = accuracy(out, y, top_k=(1, 5))
    reduced_loss = dist.all_reduce(loss.clone())
    reduced_top1 = dist.all_reduce(top1.clone(), div=True)
    reduced_top5 = dist.all_reduce(top5.clone(), div=True)

    loss_meter.update(reduced_loss.item())
    top1_meter.update(reduced_top1.item())
    top5_meter.update(reduced_top5.item())

    loss.backward()
    # manual gradient synchronisation across ranks, then the optimizer step
    dist.average_gradient(self.model.parameters())
    optimizer.step()
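# The training and validation routines in this section call an `accuracy`
# helper that is not defined here. A minimal sketch, assuming the usual
# "precision@k" semantics; note the call sites are inconsistent about the
# keyword name (`top_k=` vs `topk=`), so the real helper may differ:
def accuracy(output, target, top_k=(1,)):
    """Compute the precision@k for each k in `top_k`, as percentages."""
    max_k = max(top_k)
    batch_size = target.size(0)

    # indices of the max_k largest logits per sample, transposed to (max_k, batch)
    _, pred = output.topk(max_k, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in top_k:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res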
def validate(model, loss_func, val_loader):
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(val_loader), [losses, top1, top5],
                             prefix='Validation: ')

    model.eval()
    with torch.no_grad():
        for i, data in enumerate(val_loader, 1):
            imgs = data['image'].to(device)
            label = data['label'].to(device)

            out = model(imgs)
            loss = loss_func(out, label)
            acc1, acc5 = accuracy(out, label, topk=(1, 5))

            losses.update(loss.item())
            top1.update(acc1[0])
            top5.update(acc5[0])
            progress.display(i)

    return losses.avg
def train(model, optimizer, loss_func, train_loader, epoch):
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(train_loader), [losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch + 1))

    model.train()
    for i, data in enumerate(train_loader, 1):
        imgs = data['image'].to(device)
        label = data['label'].to(device)

        out = model(imgs)
        loss = loss_func(out, label)
        acc1, acc5 = accuracy(out, label, topk=(1, 5))

        losses.update(loss.item())
        top1.update(acc1[0])
        top5.update(acc5[0])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % PRINT_FREQ == 0:
            progress.display(i)

    timer = time() - TIME
    print("Total time Elapsed (H:m:s):", timedelta(seconds=timer))
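# `AverageMeter` and `ProgressMeter` are assumed rather than defined in this
# section. A minimal sketch following the common PyTorch example-code pattern;
# the runners below that construct `AverageMeter(0)` presumably use a different
# constructor, so treat this as illustrative:
class AverageMeter:
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self, name='', fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter:
    """Prints one progress line per call, assembled from a list of meters."""

    def __init__(self, num_batches, meters, prefix=''):
        num_digits = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(num_digits) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))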
def validate(self, val_loader, tb_logger=None):
    batch_time = AverageMeter(0)
    loss_meter = AverageMeter(0)
    top1_meter = AverageMeter(0)
    top5_meter = AverageMeter(0)

    self.model.eval()
    criterion = nn.CrossEntropyLoss()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(val_loader):
            x, y = x.cuda(), y.cuda()
            num = x.size(0)

            out = self.model(x)
            loss = criterion(out, y)
            top1, top5 = accuracy(out, y, top_k=(1, 5))

            loss_meter.update(loss.item(), num)
            top1_meter.update(top1.item(), num)
            top5_meter.update(top5.item(), num)

            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % self.config.logging.print_freq == 0:
                self._info(
                    'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(batch_idx, len(val_loader), batch_time=batch_time))

    total_num = torch.tensor([loss_meter.count]).cuda()
    loss_sum = torch.tensor([loss_meter.avg * loss_meter.count]).cuda()
    top1_sum = torch.tensor([top1_meter.avg * top1_meter.count]).cuda()
    top5_sum = torch.tensor([top5_meter.avg * top5_meter.count]).cuda()
    dist.all_reduce(total_num)
    dist.all_reduce(loss_sum)
    dist.all_reduce(top1_sum)
    dist.all_reduce(top5_sum)

    val_loss = loss_sum.item() / total_num.item()
    val_top1 = top1_sum.item() / total_num.item()
    val_top5 = top5_sum.item() / total_num.item()

    self._info('Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.format(
        val_top1, val_top5, val_loss, loss_meter.count))

    if dist.is_master():
        if val_top1 > self.best_top1:
            self.best_top1 = val_top1

        if tb_logger is not None:
            tb_logger.add_scalar('loss_val', val_loss, self.cur_step)
            tb_logger.add_scalar('acc1_val', val_top1, self.cur_step)
            tb_logger.add_scalar('acc5_val', val_top5, self.cur_step)
def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters, criterions, end):
    top1_meter, top5_meter, loss_meter, data_time = meters
    criterion, distill_loss = criterions
    world_size = dist.get_world_size()
    max_width = self.config.training.sandwich.max_width

    lr_scheduler.step(self.cur_step)
    self.cur_step += 1
    data_time.update(time.time() - end)

    self.model.zero_grad()

    max_pred = None
    for idx in range(self.config.training.sandwich.num_sample):
        # sandwich rule: sample a width for each pass and accumulate gradients
        top1_m, top5_m, loss_m = self._set_width(idx, top1_meter, top5_meter, loss_meter)

        out = self.model(x)
        if self.config.training.distillation.enable:
            if idx == 0:
                # max-width pass supervises the narrower widths
                max_pred = out.detach()
                loss = criterion(out, y)
            else:
                loss = self.config.training.distillation.loss_weight * \
                    distill_loss(out, max_pred)
                if self.config.training.distillation.hard_label:
                    loss += criterion(out, y)
        else:
            loss = criterion(out, y)
        loss /= world_size

        top1, top5 = accuracy(out, y, top_k=(1, 5))
        reduced_loss = dist.all_reduce(loss.clone())
        reduced_top1 = dist.all_reduce(top1.clone(), div=True)
        reduced_top5 = dist.all_reduce(top5.clone(), div=True)

        loss_m.update(reduced_loss.item())
        top1_m.update(reduced_top1.item())
        top5_m.update(reduced_top5.item())

        loss.backward()

    # one synchronisation and optimizer step over the accumulated gradients
    dist.average_gradient(self.model.parameters())
    optimizer.step()
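# The sandwich-rule batch above passes sub-network outputs together with the
# detached max-width prediction to `distill_loss`. A minimal sketch, assuming
# the usual in-place distillation choice of KL divergence on temperature-
# softened logits; the loss actually used in the codebase may differ:
import torch.nn.functional as F

def inplace_distill_loss(student_out, teacher_out, temperature=1.0):
    """KL(teacher || student) on softened logits, scaled by T^2."""
    log_p = F.log_softmax(student_out / temperature, dim=1)
    q = F.softmax(teacher_out / temperature, dim=1)
    # 'batchmean' matches the mathematical definition of KL divergence
    return F.kl_div(log_p, q, reduction='batchmean') * (temperature ** 2)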
def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters, criterions, end):
    lr_scheduler, arch_lr_scheduler = lr_scheduler
    optimizer, arch_optimizer = optimizer
    top1_meter, top5_meter, loss_meter, arch_loss_meter, \
        floss_meter, eflops_meter, arch_top1_meter, data_time = meters
    criterion, _ = criterions

    self.model.module.set_alpha_training(False)
    super(DMCPRunner, self)._train_one_batch(
        x, y, optimizer, lr_scheduler,
        [top1_meter, top5_meter, loss_meter, data_time], criterions, end)

    arch_lr_scheduler.step(self.cur_step)
    world_size = dist.get_world_size()

    # train architecture params
    if self.cur_step >= self.config.arch.start_train \
            and self.cur_step % self.config.arch.train_freq == 0:
        self._set_width(0, top1_meter, top5_meter, loss_meter)
        self.model.module.set_alpha_training(True)

        self.model.zero_grad()
        arch_out = self.model(x)
        arch_loss = criterion(arch_out, y)
        arch_loss /= world_size

        floss, eflops = flop_loss(self.config, self.model)
        floss /= world_size
        arch_top1 = accuracy(arch_out, y, top_k=(1,))[0]

        reduced_arch_loss = dist.all_reduce(arch_loss.clone())
        reduced_floss = dist.all_reduce(floss.clone())
        reduced_eflops = dist.all_reduce(eflops.clone(), div=True)
        reduced_arch_top1 = dist.all_reduce(arch_top1.clone(), div=True)

        arch_loss_meter.update(reduced_arch_loss.item())
        floss_meter.update(reduced_floss.item())
        eflops_meter.update(reduced_eflops.item())
        arch_top1_meter.update(reduced_arch_top1.item())

        floss.backward()
        arch_loss.backward()
        dist.average_gradient(self.model.module.arch_parameters())
        arch_optimizer.step()
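# The `dist` module used by these runners is clearly a thin project-specific
# wrapper around torch.distributed: its all_reduce returns the reduced tensor
# (optionally divided by the world size), and average_gradient synchronises
# gradients by hand instead of relying on DistributedDataParallel. A minimal
# sketch under those assumptions:
import torch.distributed as torch_dist

def get_world_size():
    return torch_dist.get_world_size()

def is_master():
    return torch_dist.get_rank() == 0

def all_reduce(tensor, div=False):
    """Sum `tensor` across ranks and return it, optionally dividing by world size."""
    torch_dist.all_reduce(tensor)
    if div:
        tensor.div_(torch_dist.get_world_size())
    return tensor

def average_gradient(parameters):
    """All-reduce (sum) gradients across ranks. The callers above already divide
    the loss by the world size, so summing per-rank gradients yields the average."""
    for p in parameters:
        if p.grad is not None:
            torch_dist.all_reduce(p.grad.data)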
def guided_train(summary_writer, log_per_epoch=100, print_freq=20):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    low_prec_losses = AverageMeter()
    low_prec_top1 = AverageMeter()
    low_prec_top5 = AverageMeter()
    distance_meter = AverageMeter()

    # switch the low-precision model to train mode, keep the full-precision one in eval mode
    low_prec_model.train()
    full_prec_model.eval()

    end = time.time()

    # controls how often results are written to tensorboard
    interval = len(train_loader) // log_per_epoch
    summary_point = [interval * split for split in torch.arange(log_per_epoch)]

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)

        # target must be moved to CUDA as well.
        # If ``non_blocking=True`` and the source is in pinned memory,
        # the copy will be asynchronous with respect to the host.
        target = target.cuda(args.gpu, non_blocking=True)

        full_prec_feature_map1.clear()
        low_prec_feature_map1.clear()
        full_prec_feature_map2.clear()
        low_prec_feature_map2.clear()

        # compute outputs of both models
        low_pre_output = low_prec_model(input)
        full_pre_output = full_prec_model(input)

        """Guided Key Point start"""
        # keep `distance` and the feature maps on the same GPU
        distance = torch.tensor([0.0]).cuda(args.gpu, non_blocking=True)

        num_layer3_features = 1
        for dim in full_prec_feature_map1[0].size():
            num_layer3_features *= dim

        num_layer4_features = 1
        for dim in full_prec_feature_map2[0].size():
            num_layer4_features *= dim

        for cudaid in full_prec_feature_map1:
            # manually move every feature map onto the same GPU
            full_prec_feature_map1[cudaid] = full_prec_feature_map1[cudaid].cuda(args.gpu, non_blocking=True)
            low_prec_feature_map1[cudaid] = low_prec_feature_map1[cudaid].cuda(args.gpu, non_blocking=True)
            full_prec_feature_map2[cudaid] = full_prec_feature_map2[cudaid].cuda(args.gpu, non_blocking=True)
            low_prec_feature_map2[cudaid] = low_prec_feature_map2[cudaid].cuda(args.gpu, non_blocking=True)

        for cudaid in low_prec_feature_map1:
            # Avoids "RuntimeError: arguments are located on different GPUs";
            # the fix is moving all feature maps onto the same GPU (done above).
            layer3 = (quantize_activations_gemm(low_prec_feature_map1[cudaid]) -
                      quantize_activations_gemm(full_prec_feature_map1[cudaid])).norm(p=args.norm) / num_layer3_features
            layer4 = (quantize_activations_gemm(low_prec_feature_map2[cudaid]) -
                      quantize_activations_gemm(full_prec_feature_map2[cudaid])).norm(p=args.norm) / num_layer4_features
            distance += (layer3 + layer4) / len(low_prec_feature_map1)

        distance *= args.balance
        """Guided Key Point end"""

        low_prec_loss = criterion(low_pre_output, target)
        low_prec_prec1, low_prec_prec5 = accuracy(low_pre_output, target, topk=(1, 5))

        low_prec_losses.update(low_prec_loss.item(), input.size(0))
        low_prec_top1.update(low_prec_prec1[0], input.size(0))
        low_prec_top5.update(low_prec_prec5[0], input.size(0))
        distance_meter.update(distance[0], 1)

        # compute gradient and do SGD step
        low_prec_optimizer.zero_grad()
        low_prec_loss.backward()
        low_prec_optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {low_prec_loss.val:.4f} ({low_prec_loss.avg:.4f})\t'
                  'Prec@1 {low_prec_top1.val:.3f} ({low_prec_top1.avg:.3f})\t'
                  'Prec@5 {low_prec_top5.val:.3f} ({low_prec_top5.avg:.3f})\t'
                  'distance {distance.val:.3f} ({distance.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, low_prec_loss=low_prec_losses,
                      low_prec_top1=low_prec_top1, low_prec_top5=low_prec_top5,
                      distance=distance_meter))

        if summary_writer is not None and (i in summary_point):
            step = i / interval + (epoch - 1) * log_per_epoch
            summary_writer.add_scalar("distance", distance_meter.avg, step)
            summary_writer.add_scalar("loss/low_prec_loss", low_prec_loss, step)
            summary_writer.add_scalar("train_low_prec/top-1", low_prec_top1.avg, step)
            summary_writer.add_scalar("train_low_prec/top-5", low_prec_top5.avg, step)
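# guided_train reads intermediate activations from dictionaries such as
# full_prec_feature_map1, keyed by GPU id, but never shows how they are
# populated. A minimal sketch of how such dictionaries could be filled with
# forward hooks; the layer choice and the per-device keying are assumptions,
# not taken from the original code:
full_prec_feature_map1 = {}
low_prec_feature_map1 = {}

def make_feature_hook(storage):
    def hook(module, inputs, output):
        # key by CUDA device index so DataParallel replicas on different GPUs
        # do not overwrite each other's activations
        storage[output.device.index] = output
    return hook

# e.g. for ResNet-style models:
# full_prec_model.module.layer3.register_forward_hook(make_feature_hook(full_prec_feature_map1))
# low_prec_model.module.layer3.register_forward_hook(make_feature_hook(low_prec_feature_map1))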
def validate(model, val_loader, criterion, gpu=0, epoch=0, summary_writer=None,
             name_prefix=None, print_freq=20):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    loss_name = "val/loss"
    prec1_name = "val/top-1"
    prec5_name = "val/top-5"
    if name_prefix is not None:
        name_prefix = ''.join((name_prefix, '-'))
        loss_name = ''.join((name_prefix, loss_name))
        prec1_name = ''.join((name_prefix, prec1_name))
        prec5_name = ''.join((name_prefix, prec5_name))

    # switch to eval mode
    model.eval()

    # if not full_precision:
    #     qw = QuantizeWeightOrActivation()  # step 1: create the quantizer
    #     model.apply(qw.quantize_tanh)      # step 2: quantize weights, keep full-precision copies and quantized gradients

    with torch.no_grad():
        start = time.time()
        for i, (data, target) in enumerate(val_loader):
            if gpu is not None:
                data = data.cuda(gpu, non_blocking=True)

            # with batch_size 128, target has size torch.Size([128])
            target = target.cuda(gpu, non_blocking=True)

            output = model(data)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), data.size(0))
            top1.update(prec1[0], data.size(0))
            top5.update(prec5[0], data.size(0))

            # measure elapsed time
            batch_time.update(time.time() - start)
            start = time.time()

            if i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, top1=top1, top5=top5))

    if summary_writer is not None:
        summary_writer.add_scalar(loss_name, losses.avg, epoch)
        summary_writer.add_scalar(prec1_name, top1.avg, epoch)
        summary_writer.add_scalar(prec5_name, top5.avg, epoch)

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    # if not full_precision:
    #     model.apply(qw.restore)  # step 3: restore the full-precision weights

    return top1.avg
def train(model, train_loader, criterion, optimizer, gpu, epoch=0,
          summary_writer=None, log_per_epoch=100, print_freq=30):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    # if not full_precision:
    #     qw = QuantizeWeightOrActivation()  # step 1: create the quantizer

    end = time.time()

    # controls how often results are written to tensorboard
    interval = len(train_loader) // log_per_epoch
    summary_point = [interval * split for split in torch.arange(log_per_epoch)]

    for i, (data, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if gpu is not None:
            data = data.cuda(gpu, non_blocking=True)
            target = target.cuda(gpu, non_blocking=True)

        # if not full_precision:
        #     model.apply(qw.quantize_tanh)  # step 2: quantize weights, keep full-precision copies and quantized gradients

        output = model(data)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(prec1[0], data.size(0))
        top5.update(prec5[0], data.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        # if not full_precision:
        #     model.apply(qw.restore)      # step 3: after backprop, restore the full-precision weights
        #     model.apply(qw.update_grad)  # step 4: multiply the stored quantization gradients into the backpropagated gradients

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # console logging
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))

        if summary_writer and (i in summary_point):
            step = i // interval + epoch * log_per_epoch
            summary_writer.add_scalar("loss/train_loss", loss, step)
            summary_writer.add_scalar("train/top-1", top1.avg, step)
            summary_writer.add_scalar("train/top-5", top5.avg, step)
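# A sketch of how the train/validate pair above is typically driven from an
# epoch loop; `num_epochs`, `writer`, and the best-accuracy bookkeeping are
# illustrative placeholders, not code from the original project:
def run_training(model, train_loader, val_loader, criterion, optimizer,
                 num_epochs, gpu=0, writer=None):
    best_prec1 = 0.0
    for epoch in range(num_epochs):
        train(model, train_loader, criterion, optimizer, gpu, epoch=epoch,
              summary_writer=writer)
        prec1 = validate(model, val_loader, criterion, gpu=gpu, epoch=epoch,
                         summary_writer=writer)
        best_prec1 = max(prec1, best_prec1)
    return best_prec1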
def validate(self, val_loader, train_loader=None, val_width=None, tb_logger=None):
    assert train_loader is not None
    assert val_width is not None

    batch_time = AverageMeter(0)
    loss_meter = [AverageMeter(0) for _ in range(len(val_width))]
    top1_meter = [AverageMeter(0) for _ in range(len(val_width))]
    top5_meter = [AverageMeter(0) for _ in range(len(val_width))]
    val_loss, val_top1, val_top5 = [], [], []

    # switch to evaluate mode
    self.model.eval()
    criterion = nn.CrossEntropyLoss()

    end = time.time()
    with torch.no_grad():
        for idx, width in enumerate(val_width):
            top1_m, top5_m, loss_m = self._set_width(
                idx, top1_meter, top5_meter, loss_meter, width=width)

            self._info('-' * 80)
            self._info('Evaluating [{}/{}]@{}'.format(idx + 1, len(val_width), width))

            self.calibrate(train_loader)

            for j, (x, y) in enumerate(val_loader):
                x, y = x.cuda(), y.cuda()
                num = x.size(0)

                out = self.model(x)
                loss = criterion(out, y)
                top1, top5 = accuracy(out.data, y, top_k=(1, 5))

                loss_m.update(loss.item(), num)
                top1_m.update(top1.item(), num)
                top5_m.update(top5.item(), num)

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if j % self.config.logging.print_freq == 0:
                    self._info(
                        'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                        .format(j, len(val_loader), batch_time=batch_time))

            total_num = torch.tensor([loss_m.count]).cuda()
            loss_sum = torch.tensor([loss_m.avg * loss_m.count]).cuda()
            top1_sum = torch.tensor([top1_m.avg * top1_m.count]).cuda()
            top5_sum = torch.tensor([top5_m.avg * top5_m.count]).cuda()
            dist.all_reduce(total_num)
            dist.all_reduce(loss_sum)
            dist.all_reduce(top1_sum)
            dist.all_reduce(top5_sum)

            val_loss.append(loss_sum.item() / total_num.item())
            val_top1.append(top1_sum.item() / total_num.item())
            val_top5.append(top5_sum.item() / total_num.item())

            self._info('Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.format(
                val_top1[-1], val_top5[-1], val_loss[-1], loss_m.count))

    if dist.is_master() and tb_logger is not None:
        for i in range(len(val_loss)):
            tb_logger.add_scalar('loss_val@{}'.format(val_width[i]), val_loss[i], self.cur_step)
            tb_logger.add_scalar('acc1_val@{}'.format(val_width[i]), val_top1[i], self.cur_step)
            tb_logger.add_scalar('acc5_val@{}'.format(val_width[i]), val_top5[i], self.cur_step)
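# The width-by-width validation above calls self.calibrate(train_loader)
# before evaluating each width; for slimmable-style models this usually means
# re-estimating BatchNorm running statistics at the currently active width.
# A minimal sketch of such a calibration pass, assuming a fixed number of
# calibration batches (the real method may differ):
import torch
import torch.nn as nn

def calibrate_bn(model, loader, num_batches=100):
    # reset running stats so they are re-accumulated at the active width
    for m in model.modules():
        if isinstance(m, nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
            m.momentum = None  # None => cumulative moving average in PyTorch BN
    model.train()
    with torch.no_grad():
        for i, (x, _) in enumerate(loader):
            if i >= num_batches:
                break
            model(x.cuda())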