Example #1
0
def main():
    """Evaluate a pretrained DenseNet-121 on the full validation set."""
    # Build the pretrained network and wrap it for multi-GPU inference.
    net = models.densenet121(pretrained=True)
    net = torch.nn.DataParallel(net)
    net.cuda()
    print(net)

    # Loss and data: cross-entropy on CUDA, validation batches of 200.
    loss_fn = torch.nn.CrossEntropyLoss().cuda()
    loader = load_val_data(data_dir="", batch_size=200)

    # Inference mode (disables dropout / freezes batch-norm stats).
    net.eval()
    validate(net, loader, loss_fn)
Example #2
0
def main():
    """Evaluate two ResNet-34 variants on the CIFAR-10 validation set.

    Loads the paired models returned by ``load_resnet`` and runs each
    through ``validate`` with a CUDA cross-entropy criterion.
    """
    # High print precision so small numeric differences between the two
    # model variants are visible in any printed tensors.
    torch.set_printoptions(precision=16)

    data_set = "cifar10"
    criterion = torch.nn.CrossEntropyLoss().cuda()
    # batch_size=50, num_workers=8 — presumably; confirm against
    # cifar_val_data's signature.
    val_loader = cifar_val_data(data_set, 50, 8)

    model, model2 = load_resnet(data_set, "resnet34")

    # Switch to inference mode before measuring accuracy.
    model.eval()
    validate(model, val_loader, criterion)
    model2.eval()
    validate(model2, val_loader, criterion)
Example #3
0
    bn1.weight
    bn1.bias
    bn1.running_mean
    bn1.running_var
    layer1.0.conv1.weight
    layer1.0.bn1.weight
    layer1.0.bn1.bias
    layer1.0.bn1.running_mean
    layer1.0.bn1.running_var
"""

# print("bn1.weight: \n", len(state_dict["bn1.weight"]), state_dict["bn1.weight"])
# print("bn1.bias: \n", len(state_dict["bn1.bias"]), state_dict["bn1.bias"])
# print("bn1.running_mean: \n", state_dict["bn1.running_mean"])
# print("bn1.running_val: \n", state_dict["bn1.running_var"])

val_loader = load_val_data(data)

# Path to the serialized merged-model weights to evaluate.
evaluate = merge_model_name
if os.path.isfile(evaluate):
    print("Loading evaluate model '{}'".format(evaluate))
    # checkpoint here is a raw state_dict (loaded directly, no wrapper keys).
    checkpoint = torch.load(evaluate)
    merge_model.load_state_dict(checkpoint)
    print("Loaded evaluate model '{}'".format(evaluate))
else:
    # NOTE: validation still proceeds below with merge_model's current
    # (unloaded) weights when the file is missing.
    print("No evaluate model found at '{}'".format(evaluate))

merge_model.cuda()
merge_model.eval()
criterion = torch.nn.CrossEntropyLoss().cuda()
validate(merge_model, val_loader, criterion)
Example #4
0
def main():
    """Build, optionally quantize, train and evaluate a network.

    Driven entirely by the module-level ``args`` namespace. ``args.mode``
    selects the regime:
      0 - full-precision training from scratch
      1 - quantize weights only (init from ImageNet pretrained weights)
      2 - quantize activations on top of a weight-quantized checkpoint
      3 - quantize weights and activations simultaneously
      4 - guided quantization (delegates to ``quantize_guided.guided``)

    Side effects: updates the module-level ``best_prec1``, writes
    TensorBoard summaries and checkpoints under ``args.save_dir``.
    """
    # NOTE(review): best_prec1 is only assigned here when resuming or after
    # the first epoch; it must be initialized at module level.
    global best_prec1
    print("\n"
          "=> arch         {: <20}\n"
          "=> init_lr      {: <20}\n"
          "=> lr-step      {: <20}\n"
          "=> momentum     {: <20}\n"
          "=> weight-decay {: <20}\n"
          "=> batch-size   {: <20}\n"
          "=> balance      {: <20}\n"
          "=> save-dir     {: <20}\n".format(
           args.arch, args.lr, args.lr_step, args.momentum, args.weight_decay,
           args.batch_size, args.balance, args.save_dir))

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably!, You may see unexpected behavior'
                      ' when restarting from checkpoints.')

    # As the warning below says: pinning a single GPU id disables
    # multi-GPU data parallelism entirely.
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU, This will completely disable data parallelism.')

    # Multi-machine (cluster) training, as opposed to multi-GPU on one box.
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Build/initialize the model according to the training mode.
    if args.mode == 0:
        print("=> training mode {}: full precision training from scratch\n".format(args.mode))
        model = models.__dict__[args.arch]()

    elif args.mode == 1:
        print("=> training mode {}: quantize weight only\n".format(args.mode))
        print("=> loading imageNet pre-trained model {}".format(args.arch))
        model = net_quantize_weight.__dict__[args.arch]()
        # Overlay the pretrained parameters onto the quantized-weight model.
        model_dict = model.state_dict()
        init_model = models.__dict__[args.arch](pretrained=True)
        model_dict.update(init_model.state_dict())
        model.load_state_dict(model_dict)
        print("=> loaded imageNet pre-trained model {}".format(args.arch))

    elif args.mode == 2:
        print("=> training mode {}: quantize activation using quantized weight\n".format(args.mode))
        model = net_quantize_activation.__dict__[args.arch]()
        if os.path.isfile(args.weight_quantized):
            print("=> loading weight quantized model '{}'".format(args.weight_quantized))
            model_dict = model.state_dict()
            quantized_model = torch.load(args.weight_quantized)
            init_dict = {}
            for k, v in quantized_model['state_dict'].items():
                if k in model.state_dict():
                    if k.find("conv") != -1 or k.find("fc") != -1:
                        # Quantize conv/fc weights on the way in; k[7:]
                        # strips the "module." prefix added by DataParallel.
                        init_dict[k[7:]] = quantize_weights_bias_gemm(v)
                    else:
                        init_dict[k[7:]] = v

            # NOTE(review): the `k in model.state_dict()` check above compares
            # the full (still "module."-prefixed) checkpoint key against the
            # unwrapped model's keys — confirm the checkpoint/model key
            # layouts actually intersect here.
            model_dict.update(init_dict)
            model.load_state_dict(model_dict)
            print("=> loaded weight_quantized '{}'".format(args.weight_quantized))
        else:
            warnings.warn("=> no weight quantized model found at '{}'".format(args.weight_quantized))
            return

    elif args.mode == 3:
        print("=> training mode {}: quantize weight and activation simultaneously\n".format(args.mode))
        print("=> loading imageNet pre-trained model '{}'".format(args.arch))
        # Initialize the jointly-quantized network from the pretrained model.
        model = net_quantize_activation.__dict__[args.arch]()
        # Overlay only the pretrained parameters whose names exist in the
        # quantized model (extra quantization buffers keep their defaults).
        model_dict = model.state_dict()
        init_model = models.__dict__[args.arch](pretrained=True)
        init_dict = {k: v for k, v in init_model.state_dict().items() if k in model_dict}
        model_dict.update(init_dict)
        model.load_state_dict(model_dict)

    elif args.mode == 4:
        print("=> Training mode {}: guided quantize weight and activation "
              "from pre-trained imageNet model {}\n ".format(args.mode, args.arch))

        # Guided mode runs its own full training loop and returns.
        quantize_guided.guided(args)
        return
    else:
        raise Exception("invalid mode, valid mode is 0~4!!")

    if args.gpu is not None:  # run on one explicitly chosen GPU
        model = model.cuda(args.gpu)
    elif args.distributed:  # cluster training (multiple machines)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:  # single machine (one or more GPUs)
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            # Multi-GPU on a single machine: pass the GPU ids to use.
            """
               list(model.state_dict().keys())[0]
               model 在使用 torch.nn.DataParallel 之前每层的名字, 如 conv1.weight
               model 在使用 torch.nn.DataParallel 之后每层的名字, 如 module.conv1.weight
               如果训练使用并行化, 而验证使用指定GPU的话就会出现问题, 所以需要在指定GPU代码中,添加解决冲突的代码
            """
            # (The string above notes that DataParallel prefixes every
            # parameter name with "module.", which must be reconciled when
            # loading checkpoints on a single specified GPU.)
            model = torch.nn.DataParallel(model, args.device_ids).cuda()

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_step)

    # optionally resume from a checkpoint
    if args.resume:
        print("\n=> resume training from checkpoint")
        checkpoint_filename = os.path.join(args.save_dir, "checkpoint.pth.tar")

        if os.path.isfile(checkpoint_filename):
            print("=> loading checkpoint '{}'".format(checkpoint_filename))
            checkpoint = torch.load(checkpoint_filename)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(checkpoint_filename, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(checkpoint_filename))

    # Let cuDNN benchmark/select the fastest conv algorithms for the
    # (fixed) input sizes.
    cudnn.benchmark = True

    val_loader = load_val_data(args.data, args.batch_size, args.workers)

    # Evaluation-only path: load the given weights, validate once, exit.
    if args.evaluate:
        if os.path.isfile(args.evaluate):
            print("Loading evaluate model '{}'".format(args.evaluate))
            checkpoint = torch.load(args.evaluate)
            if "state_dict" in checkpoint.keys():
                model.load_state_dict(checkpoint['state_dict'])
                print("epoch: {} ".format(checkpoint['epoch']))
            else:
                # Raw state_dict: prepend "module." so keys match the
                # DataParallel-wrapped model.
                # NOTE(review): keys that already start with "module" are
                # filtered OUT (dropped), not kept as-is — confirm intended.
                checkpoint = {''.join(("module.", k)): v for k, v in checkpoint.items() if not k.startswith("module")}
                model.load_state_dict(checkpoint)
            print("Loaded evaluate model '{}'".format(args.evaluate))
        else:
            print("No evaluate mode found at '{}'".format(args.evaluate))
            return
        validate(model, val_loader, criterion, args.gpu)
        return

    train_loader, train_sampler = load_train_data(args.data, args.batch_size, args.workers, args.distributed)

    summary_writer = SummaryWriter(args.save_dir)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle differently each epoch across workers.
            train_sampler.set_epoch(epoch)
        # NOTE(review): scheduler.step() before training follows the
        # pre-1.1 PyTorch ordering; PyTorch >= 1.1 expects it after
        # optimizer.step() — confirm the installed version's semantics.
        lr_scheduler.step()

        # train for one epoch
        train(model, train_loader, criterion, optimizer, args.gpu, epoch, summary_writer)

        # evaluate on validation set
        prec1 = validate(model, val_loader, criterion, args.gpu, epoch, summary_writer)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch+1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save_dir)

    summary_writer.close()
Example #5
0
def guided(args):
    """Guided quantization training (training mode 4).

    Trains a low-precision (quantized weight + activation) copy of
    ``args.arch`` while a frozen full-precision pretrained model "guides"
    it: forward hooks on both models' ``layer4`` capture input/output
    feature maps per GPU, and a normalized distance between the quantized
    feature maps, scaled by ``args.balance``, is added to the task loss.

    Side effects: writes TensorBoard summaries and checkpoints under
    ``args.save_dir``.
    """
    best_low_prec1 = 0
    # Per-GPU feature-map caches keyed by CUDA device index; the forward
    # hooks below fill them on every forward pass.
    full_prec_feature_map1 = defaultdict(torch.Tensor)
    full_prec_feature_map2 = defaultdict(torch.Tensor)
    low_prec_feature_map1 = defaultdict(torch.Tensor)
    low_prec_feature_map2 = defaultdict(torch.Tensor)

    def full_prec_hook(module, input, output):
        """Cache layer4 input/output of the full-precision model, per GPU."""
        # Must clone input[0].data, NOT input[0]: cloning the Variable
        # itself keeps it attached to the autograd graph and fails with:
        # RuntimeError: Trying to backward through the graph a second time,
        # but the buffers have already been freed. Specify retain_graph=True
        # when calling backward the first time
        # NOTE(review): repr(output.device) looks like
        # "device(type='cuda', index=0)", so [-2] is the index digit — this
        # breaks on CPU tensors or CUDA ids > 9; confirm single-digit ids.
        cudaid = int(repr(output.device)[-2])
        full_prec_feature_map1[cudaid] = input[0].data.clone()
        full_prec_feature_map2[cudaid] = output.data.clone()

    def low_prec_hook(module, input, output):
        """Cache layer4 input/output of the low-precision model, per GPU."""
        cudaid = int(repr(output.device)[-2])
        low_prec_feature_map1[cudaid] = input[0].data.clone()
        low_prec_feature_map2[cudaid] = output.data.clone()

    def gpu_config(model):
        """Place *model* on GPU(s) per args; mirrors main()'s placement."""
        if args.gpu is not None:  # run on one explicitly chosen GPU
            model = model.cuda(args.gpu)
        elif args.distributed:  # cluster training (multiple machines)
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)

        else:  # single machine (one or more GPUs)
            if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
                model.features = torch.nn.DataParallel(model.features)
                model.cuda()
            else:
                # Multi-GPU on one machine: pass the GPU ids to use.
                model = torch.nn.DataParallel(model, args.device_ids).cuda()
        return model

    def guided_train(summary_writer, log_per_epoch=100, print_freq=20):
        """Train the low-precision model for one epoch with guidance.

        Reads ``epoch``, ``train_loader``, ``criterion``,
        ``low_prec_optimizer`` and both models from the enclosing scope;
        ``epoch`` is the loop variable of the caller, read at call time.
        """

        batch_time = AverageMeter()
        data_time = AverageMeter()

        low_prec_losses = AverageMeter()
        low_prec_top1 = AverageMeter()
        low_prec_top5 = AverageMeter()
        distance_meter = AverageMeter()

        # Low-precision model trains; the full-precision guide stays frozen
        # in eval mode.
        low_prec_model.train()
        full_prec_model.eval()

        end = time.time()

        # Controls how often scalars are flushed to TensorBoard.
        interval = len(train_loader) // log_per_epoch
        summary_point = [
            interval * split for split in torch.arange(log_per_epoch)
        ]

        for i, (input, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)

            # target must be moved to CUDA; with non_blocking=True and a
            # pinned-memory source the copy is asynchronous w.r.t. the host.
            target = target.cuda(args.gpu, non_blocking=True)

            # Drop the previous batch's cached feature maps before the
            # hooks repopulate them.
            full_prec_feature_map1.clear()
            low_prec_feature_map1.clear()
            full_prec_feature_map2.clear()
            low_prec_feature_map2.clear()

            # compute low_pre_output (both forwards also fire the hooks)
            low_pre_output = low_prec_model(input)
            full_pre_output = full_prec_model(input)
            """Guided Key Point start"""

            # Keep `distance` on the same GPU the feature maps are
            # gathered onto.
            distance = torch.tensor([0.0]).cuda(args.gpu, non_blocking=True)
            # Total element counts, used to normalize the per-layer norms.
            num_layer3_features = 1
            for dim in full_prec_feature_map1[0].size():
                num_layer3_features *= dim

            num_layer4_features = 1
            for dim in full_prec_feature_map2[0].size():
                num_layer4_features *= dim

            for cudaid in full_prec_feature_map1:
                # Manually move every cached feature map onto the same GPU.
                full_prec_feature_map1[cudaid] = full_prec_feature_map1[
                    cudaid].cuda(args.gpu, non_blocking=True)
                low_prec_feature_map1[cudaid] = low_prec_feature_map1[
                    cudaid].cuda(args.gpu, non_blocking=True)
                full_prec_feature_map2[cudaid] = full_prec_feature_map2[
                    cudaid].cuda(args.gpu, non_blocking=True)
                low_prec_feature_map2[cudaid] = low_prec_feature_map2[
                    cudaid].cuda(args.gpu, non_blocking=True)

            for cudaid in low_prec_feature_map1:
                """
                RuntimeError: arguments are located on different GPUs
                解决方法在于手动将feature map都搬到同一个 GPU 上
                """
                # (The string above records the error fixed by the manual
                # device moves in the previous loop.)
                layer3 = (
                    quantize_activations_gemm(low_prec_feature_map1[cudaid]) -
                    quantize_activations_gemm(full_prec_feature_map1[cudaid])
                ).norm(p=args.norm) / num_layer3_features
                layer4 = (
                    quantize_activations_gemm(low_prec_feature_map2[cudaid]) -
                    quantize_activations_gemm(full_prec_feature_map2[cudaid])
                ).norm(p=args.norm) / num_layer4_features
                distance += (layer3 + layer4) / len(low_prec_feature_map1)

            distance *= args.balance
            """Guided Key Point end"""

            # NOTE(review): `distance` is logged but never added to
            # low_prec_loss before backward() — confirm whether the guided
            # term is meant to contribute to the gradient.
            low_prec_loss = criterion(low_pre_output, target)
            low_prec_prec1, low_prec_prec5 = accuracy(low_pre_output,
                                                      target,
                                                      topk=(1, 5))

            low_prec_losses.update(low_prec_loss.item(), input.size(0))
            low_prec_top1.update(low_prec_prec1[0], input.size(0))
            low_prec_top5.update(low_prec_prec5[0], input.size(0))
            distance_meter.update(distance[0], 1)

            # compute gradient and do SGD step
            low_prec_optimizer.zero_grad()
            low_prec_loss.backward()
            low_prec_optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:

                print(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {low_prec_loss.val:.4f} ({low_prec_loss.avg:.4f})\t'
                    'Prec@1 {low_prec_top1.val:.3f} ({low_prec_top1.avg:.3f})\t'
                    'Prec@5 {low_prec_top5.val:.3f} ({low_prec_top5.avg:.3f}) \t'
                    'distance {distance.val:.3f} ({distance.avg:.3f})'.format(
                        epoch,
                        i,
                        len(train_loader),
                        batch_time=batch_time,
                        data_time=data_time,
                        low_prec_loss=low_prec_losses,
                        low_prec_top1=low_prec_top1,
                        low_prec_top5=low_prec_top5,
                        distance=distance_meter))

            if summary_writer is not None and (i in summary_point):
                step = i / interval + (epoch - 1) * log_per_epoch
                summary_writer.add_scalar("distance", distance_meter.avg, step)
                # NOTE(review): this logs the raw last-batch loss tensor;
                # low_prec_losses.avg may have been intended — confirm.
                summary_writer.add_scalar("loss/low_prec_loss", low_prec_loss,
                                          step)
                summary_writer.add_scalar("train_low_prec/top-1",
                                          low_prec_top1.avg, step)
                summary_writer.add_scalar("train_low_prec/top-5",
                                          low_prec_top5.avg, step)

    # Initialize the low-precision model from the pretrained full-precision
    # one so weights and activations can be quantized together.
    print("=> using imageNet pre-trained model '{}'".format(args.arch))
    # Fetch pretrained parameters and overlay them onto the quantized model.
    full_prec_model = models.__dict__[args.arch](pretrained=True)
    low_prec_model = net_quantize_activation.__dict__[args.arch]()

    model_dict = low_prec_model.state_dict()
    imagenet_dict = full_prec_model.state_dict()
    model_dict.update(imagenet_dict)
    low_prec_model.load_state_dict(model_dict)

    # Hook layer4 of both models so each forward pass caches the guided
    # feature maps.
    low_prec_layer4 = low_prec_model._modules.get("layer4")
    full_prec_layer4 = full_prec_model._modules.get("layer4")

    hook_low_prec = low_prec_layer4.register_forward_hook(low_prec_hook)
    hook_full_prec = full_prec_layer4.register_forward_hook(full_prec_hook)

    low_prec_model = gpu_config(low_prec_model)
    full_prec_model = gpu_config(full_prec_model)

    # Loss function and optimizer (only the low-precision model trains).
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    low_prec_optimizer = torch.optim.SGD(low_prec_model.parameters(),
                                         args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

    low_prec_scheduler = torch.optim.lr_scheduler.StepLR(
        low_prec_optimizer, step_size=args.lr_step, gamma=0.1)

    cudnn.benchmark = True

    val_loader = load_val_data(args.data, args.batch_size, args.workers)
    train_loader, train_sampler = load_train_data(args.data, args.batch_size,
                                                  args.workers,
                                                  args.distributed)

    # TensorBoard log writer.
    writer = SummaryWriter(args.save_dir)

    # NOTE(review): the inclusive upper bound (epochs + 1) runs one more
    # epoch than main()'s `range(start_epoch, epochs)` — confirm intended.
    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # NOTE(review): scheduler.step() before training follows the
        # pre-1.1 PyTorch ordering; newer PyTorch expects it after
        # optimizer.step().
        low_prec_scheduler.step()

        # train for one epoch
        guided_train(writer)

        # evaluate on validation set
        low_prec1 = validate(low_prec_model,
                             val_loader,
                             criterion,
                             args.gpu,
                             epoch,
                             writer,
                             name_prefix='low_prec')

        # remember best prec@1 and save low_prec_checkpoint
        is_best_low = low_prec1 > best_low_prec1

        best_low_prec1 = max(low_prec1, best_low_prec1)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': low_prec_model.state_dict(),
                'best_prec1': best_low_prec1,
                'optimizer': low_prec_optimizer.state_dict(),
            },
            is_best_low,
            args.save_dir,
            name_prefix="low_prec")

    # Close the log writer.
    writer.close()

    # Remove the forward hooks.

    hook_full_prec.remove()
    hook_low_prec.remove()