Example #1
    def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters, criterions, end):
        top1_meter, top5_meter, loss_meter, data_time = meters
        criterion = criterions[0]
        world_size = dist.get_world_size()

        lr_scheduler.step(self.cur_step)
        self.cur_step += 1
        data_time.update(time.time() - end)

        self.model.zero_grad()
        out = self.model(x)
        loss = criterion(out, y)
        loss /= world_size

        top1, top5 = accuracy(out, y, top_k=(1, 5))
        reduced_loss = dist.all_reduce(loss.clone())
        reduced_top1 = dist.all_reduce(top1.clone(), div=True)
        reduced_top5 = dist.all_reduce(top5.clone(), div=True)

        loss_meter.update(reduced_loss.item())
        top1_meter.update(reduced_top1.item())
        top5_meter.update(reduced_top5.item())

        loss.backward()
        dist.average_gradient(self.model.parameters())
        optimizer.step()
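
All of these snippets call an accuracy helper that is defined elsewhere in their respective repositories. A minimal sketch of the usual top-k accuracy computation is below; the keyword is spelled top_k here to match Example #1, while Examples #2, #3 and #7-#9 spell it topk, so adapt the name to the codebase at hand.

import torch

def accuracy(output, target, top_k=(1,)):
    """Return the top-k accuracies (in percent) as 1-element tensors."""
    with torch.no_grad():
        maxk = max(top_k)
        batch_size = target.size(0)

        # indices of the maxk highest-scoring classes per sample
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in top_k:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res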
Example #2
def validate(model, loss_func, val_loader):
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(val_loader), [losses, top1, top5],
                             prefix='Validation: ')

    model.eval()

    with torch.no_grad():
        for i, data in enumerate(val_loader, 1):
            imgs = data['image'].to(device)
            label = data['label'].to(device)

            out = model(imgs)
            loss = loss_func(out, label)

            acc1, acc5 = accuracy(out, label, topk=(1, 5))
            losses.update(loss.item())
            top1.update(acc1[0])
            top5.update(acc5[0])

        progress.display(i)

    return losses.avg
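
Examples #2 and #3 rely on AverageMeter and ProgressMeter without showing them. A minimal sketch compatible with the calls above follows; it mirrors the meters from the official PyTorch ImageNet example, which is an assumption about the original codebase. (Other examples construct AverageMeter() or AverageMeter(0) with different signatures, so this sketch applies only to #2 and #3.)

class AverageMeter:
    """Track the current value, running sum, count, and average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter:
    """Print a prefixed progress line such as 'Epoch: [1][ 10/391] Loss ...'."""

    def __init__(self, num_batches, meters, prefix=''):
        num_digits = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(num_digits) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))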
Example #3
def train(model, optimizer, loss_func, train_loader, epoch):
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(train_loader), [losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch + 1))

    model.train()

    for i, data in enumerate(train_loader, 1):

        imgs = data['image'].to(device)
        label = data['label'].to(device)

        out = model(imgs)
        loss = loss_func(out, label)

        acc1, acc5 = accuracy(out, label, topk=(1, 5))
        losses.update(loss.item())
        top1.update(acc1[0])
        top5.update(acc5[0])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % PRINT_FREQ == 0:
            progress.display(i)

    timer = time() - TIME
    print("Total time Elapsed (H:m:s):", timedelta(seconds=timer))
Example #4
    def validate(self, val_loader, tb_logger=None):
        batch_time = AverageMeter(0)
        loss_meter = AverageMeter(0)
        top1_meter = AverageMeter(0)
        top5_meter = AverageMeter(0)

        self.model.eval()
        criterion = nn.CrossEntropyLoss()
        end = time.time()

        with torch.no_grad():
            for batch_idx, (x, y) in enumerate(val_loader):
                x, y = x.cuda(), y.cuda()
                num = x.size(0)

                out = self.model(x)
                loss = criterion(out, y)
                top1, top5 = accuracy(out, y, top_k=(1, 5))

                loss_meter.update(loss.item(), num)
                top1_meter.update(top1.item(), num)
                top5_meter.update(top5.item(), num)

                batch_time.update(time.time() - end)
                end = time.time()

                if batch_idx % self.config.logging.print_freq == 0:
                    self._info(
                        'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                        .format(batch_idx,
                                len(val_loader),
                                batch_time=batch_time))

        total_num = torch.tensor([loss_meter.count]).cuda()
        loss_sum = torch.tensor([loss_meter.avg * loss_meter.count]).cuda()
        top1_sum = torch.tensor([top1_meter.avg * top1_meter.count]).cuda()
        top5_sum = torch.tensor([top5_meter.avg * top5_meter.count]).cuda()

        dist.all_reduce(total_num)
        dist.all_reduce(loss_sum)
        dist.all_reduce(top1_sum)
        dist.all_reduce(top5_sum)

        val_loss = loss_sum.item() / total_num.item()
        val_top1 = top1_sum.item() / total_num.item()
        val_top5 = top5_sum.item() / total_num.item()

        self._info(
            'Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.format(
                val_top1, val_top5, val_loss, loss_meter.count))

        if dist.is_master():
            if val_top1 > self.best_top1:
                self.best_top1 = val_top1

            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, self.cur_step)
                tb_logger.add_scalar('acc1_val', val_top1, self.cur_step)
                tb_logger.add_scalar('acc5_val', val_top5, self.cur_step)
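
Example #4 gets the global validation statistics right by all-reducing per-rank weighted sums and sample counts rather than per-rank averages, which would be biased when ranks see different numbers of samples. A standalone sketch of that pattern with plain torch.distributed is shown below; note that the dist module used in these examples is the repository's own wrapper, so helpers such as all_reduce returning a tensor, average_gradient and is_master() are not part of torch.distributed itself.

import torch
import torch.distributed as dist

def reduce_weighted_average(local_avg, local_count, device='cuda'):
    """All-reduce a (sum, count) pair so every rank sees the exact global average."""
    # pack value*count and count into one tensor to do a single all_reduce
    packed = torch.tensor([local_avg * local_count, float(local_count)], device=device)
    dist.all_reduce(packed, op=dist.ReduceOp.SUM)
    return (packed[0] / packed[1]).item()

# e.g. after the validation loop on each rank:
# val_top1 = reduce_weighted_average(top1_meter.avg, top1_meter.count)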
Example #5
    def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters,
                         criterions, end):
        top1_meter, top5_meter, loss_meter, data_time = meters
        criterion, distill_loss = criterions
        world_size = dist.get_world_size()
        max_width = self.config.training.sandwich.max_width

        lr_scheduler.step(self.cur_step)
        self.cur_step += 1
        data_time.update(time.time() - end)

        self.model.zero_grad()

        max_pred = None
        for idx in range(self.config.training.sandwich.num_sample):
            # sandwich rule
            top1_m, top5_m, loss_m = self._set_width(idx, top1_meter,
                                                     top5_meter, loss_meter)

            out = self.model(x)
            if self.config.training.distillation.enable:
                if idx == 0:
                    max_pred = out.detach()
                    loss = criterion(out, y)
                else:
                    loss = self.config.training.distillation.loss_weight * \
                           distill_loss(out, max_pred)
                    if self.config.training.distillation.hard_label:
                        loss += criterion(out, y)
            else:
                loss = criterion(out, y)
            loss /= world_size

            top1, top5 = accuracy(out, y, top_k=(1, 5))
            reduced_loss = dist.all_reduce(loss.clone())
            reduced_top1 = dist.all_reduce(top1.clone(), div=True)
            reduced_top5 = dist.all_reduce(top5.clone(), div=True)

            loss_m.update(reduced_loss.item())
            top1_m.update(reduced_top1.item())
            top5_m.update(reduced_top5.item())

            loss.backward()

        dist.average_gradient(self.model.parameters())
        optimizer.step()
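
In Example #5 the widest sub-network's detached output (max_pred) is used as a soft target for the narrower widths (in-place distillation under the sandwich rule). The distill_loss criterion is defined elsewhere; a common choice, sketched here purely as an assumption, is a KL divergence between temperature-softened logits.

import torch.nn.functional as F

def soft_target_kd_loss(student_logits, teacher_logits, temperature=1.0):
    """KL divergence between softened student and teacher distributions."""
    log_p_student = F.log_softmax(student_logits / temperature, dim=1)
    p_teacher = F.softmax(teacher_logits / temperature, dim=1)
    # 'batchmean' averages over the batch, matching the usual KD formulation;
    # the T^2 factor keeps gradient magnitudes comparable across temperatures
    return F.kl_div(log_p_student, p_teacher, reduction='batchmean') * (temperature ** 2)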
Example #6
    def _train_one_batch(self, x, y, optimizer, lr_scheduler, meters,
                         criterions, end):
        lr_scheduler, arch_lr_scheduler = lr_scheduler
        optimizer, arch_optimizer = optimizer
        top1_meter, top5_meter, loss_meter, arch_loss_meter, \
            floss_meter, eflops_meter, arch_top1_meter, data_time = meters
        criterion, _ = criterions

        self.model.module.set_alpha_training(False)
        super(DMCPRunner, self)._train_one_batch(
            x, y, optimizer, lr_scheduler,
            [top1_meter, top5_meter, loss_meter, data_time], criterions, end)

        arch_lr_scheduler.step(self.cur_step)
        world_size = dist.get_world_size()

        # train architecture params
        if self.cur_step >= self.config.arch.start_train \
                and self.cur_step % self.config.arch.train_freq == 0:
            self._set_width(0, top1_meter, top5_meter, loss_meter)
            self.model.module.set_alpha_training(True)

            self.model.zero_grad()
            arch_out = self.model(x)
            arch_loss = criterion(arch_out, y)
            arch_loss /= world_size
            floss, eflops = flop_loss(self.config, self.model)
            floss /= world_size

            arch_top1 = accuracy(arch_out, y, top_k=(1, ))[0]
            reduced_arch_loss = dist.all_reduce(arch_loss.clone())
            reduced_floss = dist.all_reduce(floss.clone())
            reduced_eflops = dist.all_reduce(eflops.clone(), div=True)
            reduced_arch_top1 = dist.all_reduce(arch_top1.clone(), div=True)

            arch_loss_meter.update(reduced_arch_loss.item())
            floss_meter.update(reduced_floss.item())
            eflops_meter.update(reduced_eflops.item())
            arch_top1_meter.update(reduced_arch_top1.item())

            floss.backward()
            arch_loss.backward()
            dist.average_gradient(self.model.module.arch_parameters())
            arch_optimizer.step()
Example #7
    def guided_train(summary_writer, log_per_epoch=100, print_freq=20):

        batch_time = AverageMeter()
        data_time = AverageMeter()

        low_prec_losses = AverageMeter()
        low_prec_top1 = AverageMeter()
        low_prec_top5 = AverageMeter()
        distance_meter = AverageMeter()

        # switch to training mode (the full-precision teacher stays in eval)
        low_prec_model.train()
        full_prec_model.eval()

        end = time.time()

        # used to control how often results are written to tensorboard
        interval = len(train_loader) // log_per_epoch
        summary_point = [
            interval * split for split in torch.arange(log_per_epoch)
        ]

        for i, (input, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)

            # target must be moved onto the GPU
            # If ``non_blocking=True`` and the source is in pinned memory,
            # the copy will be asynchronous with respect to the host
            target = target.cuda(args.gpu, non_blocking=True)

            full_prec_feature_map1.clear()
            low_prec_feature_map1.clear()
            full_prec_feature_map2.clear()
            low_prec_feature_map2.clear()

            # compute low_pre_output
            low_pre_output = low_prec_model(input)
            full_pre_output = full_prec_model(input)
            """Guided Key Point start"""

            # keep distance and the feature maps on the same GPU
            distance = torch.tensor([0.0]).cuda(args.gpu, non_blocking=True)
            num_layer3_features = 1
            for dim in full_prec_feature_map1[0].size():
                num_layer3_features *= dim

            num_layer4_features = 1
            for dim in full_prec_feature_map2[0].size():
                num_layer4_features *= dim

            for cudaid in full_prec_feature_map1:
                # manually move every feature map onto the same GPU
                full_prec_feature_map1[cudaid] = full_prec_feature_map1[
                    cudaid].cuda(args.gpu, non_blocking=True)
                low_prec_feature_map1[cudaid] = low_prec_feature_map1[
                    cudaid].cuda(args.gpu, non_blocking=True)
                full_prec_feature_map2[cudaid] = full_prec_feature_map2[
                    cudaid].cuda(args.gpu, non_blocking=True)
                low_prec_feature_map2[cudaid] = low_prec_feature_map2[
                    cudaid].cuda(args.gpu, non_blocking=True)

            for cudaid in low_prec_feature_map1:
                """
                RuntimeError: arguments are located on different GPUs
                The fix is to manually move every feature map onto the same GPU.
                """
                layer3 = (
                    quantize_activations_gemm(low_prec_feature_map1[cudaid]) -
                    quantize_activations_gemm(full_prec_feature_map1[cudaid])
                ).norm(p=args.norm) / num_layer3_features
                layer4 = (
                    quantize_activations_gemm(low_prec_feature_map2[cudaid]) -
                    quantize_activations_gemm(full_prec_feature_map2[cudaid])
                ).norm(p=args.norm) / num_layer4_features
                distance += (layer3 + layer4) / len(low_prec_feature_map1)

            distance *= args.balance
            """Guided Key Point end"""

            low_prec_loss = criterion(low_pre_output, target)
            low_prec_prec1, low_prec_prec5 = accuracy(low_pre_output,
                                                      target,
                                                      topk=(1, 5))

            low_prec_losses.update(low_prec_loss.item(), input.size(0))
            low_prec_top1.update(low_prec_prec1[0], input.size(0))
            low_prec_top5.update(low_prec_prec5[0], input.size(0))
            distance_meter.update(distance[0], 1)

            # compute gradient and do SGD step
            low_prec_optimizer.zero_grad()
            low_prec_loss.backward()
            low_prec_optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:

                print(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {low_prec_loss.val:.4f} ({low_prec_loss.avg:.4f})\t'
                    'Prec@1 {low_prec_top1.val:.3f} ({low_prec_top1.avg:.3f})\t'
                    'Prec@5 {low_prec_top5.val:.3f} ({low_prec_top5.avg:.3f}) \t'
                    'distance {distance.val:.3f} ({distance.avg:.3f})'.format(
                        epoch,
                        i,
                        len(train_loader),
                        batch_time=batch_time,
                        data_time=data_time,
                        low_prec_loss=low_prec_losses,
                        low_prec_top1=low_prec_top1,
                        low_prec_top5=low_prec_top5,
                        distance=distance_meter))

            if summary_writer is not None and (i in summary_point):
                step = i / interval + (epoch - 1) * log_per_epoch
                summary_writer.add_scalar("distance", distance_meter.avg, step)
                summary_writer.add_scalar("loss/low_prec_loss", low_prec_loss,
                                          step)
                summary_writer.add_scalar("train_low_prec/top-1",
                                          low_prec_top1.avg, step)
                summary_writer.add_scalar("train_low_prec/top-5",
                                          low_prec_top5.avg, step)
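
The full_prec_feature_map1/low_prec_feature_map1 dicts in Example #7 are filled outside the snippet; since they are keyed by CUDA device ids, they are most likely populated by forward hooks on a chosen layer of each DataParallel replica. The sketch below is a guess at that wiring; the layer choice and names are hypothetical.

# Hypothetical hook wiring for Example #7 (not shown in the original source).
full_prec_feature_map1 = {}

def save_full_prec_layer3(module, inputs, output):
    # key by device index so replicas on different GPUs don't overwrite each other
    full_prec_feature_map1[output.device.index] = output

# e.g. with a torchvision ResNet wrapped in nn.DataParallel:
# full_prec_model.module.layer3.register_forward_hook(save_full_prec_layer3)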
Example #8
def validate(model, val_loader, criterion, gpu=0, epoch=0, summary_writer=None, name_prefix=None, print_freq=20):

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    loss_name = "val/loss"
    prec1_name = "val/top-1"
    prec5_name = "val/top-5"

    if name_prefix is not None:
        name_prefix = ''.join((name_prefix, '-'))
        loss_name = ''.join((name_prefix, loss_name))
        prec1_name = ''.join((name_prefix, prec1_name))
        prec5_name = ''.join((name_prefix, prec5_name))

    # switch to eval mode
    model.eval()

    # if not full_precision:
    #     qw = QuantizeWeightOrActivation()  # step 1: create the quantizer
    #     model.apply(qw.quantize_tanh)  # step 2: quantize the weights, saving the full-precision weights and quantized gradients

    with torch.no_grad():
        start = time.time()
        for i, (data, target) in enumerate(val_loader):
            if gpu is not None:
                data = data.cuda(gpu, non_blocking=True)

            # with batch_size 128, target has shape torch.Size([128])
            target = target.cuda(gpu, non_blocking=True)
            output = model(data)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), data.size(0))
            top1.update(prec1[0], data.size(0))
            top5.update(prec5[0], data.size(0))

            # measure elapsed time
            batch_time.update(time.time() - start)
            start = time.time()

            if i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time,
                       loss=losses, top1=top1, top5=top5))

        if summary_writer is not None:
            summary_writer.add_scalar(loss_name, losses.avg, epoch)
            summary_writer.add_scalar(prec1_name, top1.avg, epoch)
            summary_writer.add_scalar(prec5_name, top5.avg, epoch)

        print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    # if not full_precision:
    #     model.apply(qw.restore)  # step 3: restore the full-precision weights

    return top1.avg
Example #9
def train(model, train_loader, criterion, optimizer, gpu, epoch=0,
          summary_writer=None, log_per_epoch=100, print_freq=30):

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    # if not full_precision:
    #     qw = QuantizeWeightOrActivation()   # step 1: create the quantizer
    end = time.time()

    # used to control how often results are written to tensorboard
    interval = len(train_loader) // log_per_epoch
    summary_point = [interval * split for split in torch.arange(log_per_epoch)]

    for i, (data, target) in enumerate(train_loader):
        data_time.update(time.time() - end)  # measure data loading time

        if gpu is not None:
            data = data.cuda(gpu, non_blocking=True)
        target = target.cuda(gpu, non_blocking=True)

        # if not full_precision:
        #     model.apply(qw.quantize_tanh)  # step 2: quantize the weights, saving the full-precision weights and quantized gradients

        output = model(data)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(prec1[0], data.size(0))
        top5.update(prec5[0], data.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        # if not full_precision:
        #     model.apply(qw.restore)  # step 3: after the backward pass has computed the model gradients, restore the full-precision weights
        #     model.apply(qw.update_grad)  # step 4: multiply the backpropagated gradients by the previously stored quantization gradients

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # console output
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5))

        if summary_writer and (i in summary_point):
            step = i//interval + epoch * log_per_epoch
            summary_writer.add_scalar("loss/train_loss", loss, step)
            summary_writer.add_scalar("train/top-1", top1.avg, step)
            summary_writer.add_scalar("train/top-5", top5.avg, step)
Example #10
    def validate(self,
                 val_loader,
                 train_loader=None,
                 val_width=None,
                 tb_logger=None):
        assert train_loader is not None
        assert val_width is not None

        batch_time = AverageMeter(0)
        loss_meter = [AverageMeter(0) for _ in range(len(val_width))]
        top1_meter = [AverageMeter(0) for _ in range(len(val_width))]
        top5_meter = [AverageMeter(0) for _ in range(len(val_width))]
        val_loss, val_top1, val_top5 = [], [], []

        # switch to evaluate mode
        self.model.eval()

        criterion = nn.CrossEntropyLoss()
        end = time.time()

        with torch.no_grad():
            for idx, width in enumerate(val_width):
                top1_m, top5_m, loss_m = self._set_width(idx,
                                                         top1_meter,
                                                         top5_meter,
                                                         loss_meter,
                                                         width=width)

                self._info('-' * 80)
                self._info('Evaluating [{}/{}]@{}'.format(
                    idx + 1, len(val_width), width))

                self.calibrate(train_loader)
                for j, (x, y) in enumerate(val_loader):
                    x, y = x.cuda(), y.cuda()
                    num = x.size(0)

                    out = self.model(x)
                    loss = criterion(out, y)
                    top1, top5 = accuracy(out.data, y, top_k=(1, 5))

                    loss_m.update(loss.item(), num)
                    top1_m.update(top1.item(), num)
                    top5_m.update(top5.item(), num)

                    # measure elapsed time
                    batch_time.update(time.time() - end)
                    end = time.time()

                    if j % self.config.logging.print_freq == 0:
                        self._info(
                            'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                            .format(j, len(val_loader), batch_time=batch_time))

                total_num = torch.tensor([loss_m.count]).cuda()
                loss_sum = torch.tensor([loss_m.avg * loss_m.count]).cuda()
                top1_sum = torch.tensor([top1_m.avg * top1_m.count]).cuda()
                top5_sum = torch.tensor([top5_m.avg * top5_m.count]).cuda()

                dist.all_reduce(total_num)
                dist.all_reduce(loss_sum)
                dist.all_reduce(top1_sum)
                dist.all_reduce(top5_sum)

                val_loss.append(loss_sum.item() / total_num.item())
                val_top1.append(top1_sum.item() / total_num.item())
                val_top5.append(top5_sum.item() / total_num.item())

                self._info(
                    'Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.
                    format(val_top1[-1], val_top5[-1], val_loss[-1],
                           loss_m.count))

            if dist.is_master() and tb_logger is not None:
                for i in range(len(val_loss)):
                    tb_logger.add_scalar('loss_val@{}'.format(val_width[i]),
                                         val_loss[i], self.cur_step)
                    tb_logger.add_scalar('acc1_val@{}'.format(val_width[i]),
                                         val_top1[i], self.cur_step)
                    tb_logger.add_scalar('acc5_val@{}'.format(val_width[i]),
                                         val_top5[i], self.cur_step)