def quantization_aware_training_example(train_loader, test_loader, device):
    model = NaiveModel()

    configure_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu2']
    }]

    # fine-tune the model with QAT (quantization-aware training)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, configure_list, optimizer)
    quantizer.compress()
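    # compress() wraps the configured modules in place so that quantization is
    # simulated (fake-quantized) during the forward pass, while the underlying
    # weights stay in full precision until export.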

    model.to(device)
    for epoch in range(1):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)
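    # calibration_config records the per-layer quantization settings and the
    # ranges tracked during training; the TensorRT engine built below consumes
    # it to generate the quantized inference engine.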

    test(model, device, test_loader)

    print("calibration_config: ", calibration_config)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model,
                                  input_shape,
                                  config=calibration_config,
                                  batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)
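
A minimal driver for the function above might look like the sketch below; the MNIST loaders mirror the other examples on this page, and NaiveModel, train, test, and test_trt are assumed to be defined in the same source file.

if __name__ == '__main__':
    # a minimal sketch, assuming torch/torchvision are installed and the
    # helpers above are importable from this file
    import torch
    from torchvision import datasets, transforms

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)
    quantization_aware_training_example(train_loader, test_loader, device)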
Example #2
def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    trans = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    train_loader = torch.utils.data.DataLoader(datasets.MNIST('data',
                                                              train=True,
                                                              download=True,
                                                              transform=trans),
                                               batch_size=64,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('data',
                                                             train=False,
                                                             transform=trans),
                                              batch_size=1000,
                                              shuffle=True)

    model = Mnist()
    '''You can switch to DoReFaQuantizer to apply DoReFa quantization instead:
    DoReFaQuantizer(configure_list).compress(model)
    '''
    configure_list = [
        {
            'quant_types': ['weight'],
            'quant_bits': {
                'weight': 8,
            },  # a plain `int` also works here because all `quant_types` share
                # the same bit width; see the `ReLU6` entry below and the
                # equivalent spelling after this list
            'op_types': ['Conv2d', 'Linear']
        },
        {
            'quant_types': ['output'],
            'quant_bits': 8,
            'quant_start_step': 1000,
            'op_types': ['ReLU6']
        }
    ]
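    # As noted above, an entry whose quant types all share one bit width may
    # give `quant_bits` as a plain int, so the first entry is equivalent to:
    #   {'quant_types': ['weight'], 'quant_bits': 8, 'op_types': ['Conv2d', 'Linear']}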
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, configure_list, optimizer)
    quantizer.compress()

    model.to(device)
    for epoch in range(40):
        print('# Epoch {} #'.format(epoch))
        train(model, quantizer, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    onnx_path = "mnist_model.onnx"
    input_shape = (1, 1, 28, 28)
    device = torch.device("cuda")

    calibration_config = quantizer.export_model(model_path, calibration_path,
                                                onnx_path, input_shape, device)
    print("Generated calibration config is: ", calibration_config)
Example #3
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64,
    )
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        'data', train=False, transform=transform),
                                              batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir,
                                                 'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer,
                  epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # sample output: FLOPs 125.49 M, #Params 0.85 M, Accuracy 93.29, Time Cost 1.1012
    print(
        f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )

    # Step2. Model Pruning
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]
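    # for L1FilterPruner, 'sparsity' is the fraction of filters pruned from
    # each matched Conv2d layer, ranked by the L1 norm of the filter weights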

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
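    # speedup_model() uses the masks exported above to physically remove the
    # pruned filters and shrink dependent layers, so the model printed below
    # is genuinely smaller rather than merely masked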
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(
        f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}'
    )

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    best_acc = 0
    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, 'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # sample output: FLOPs 28.48 M, #Params 0.18 M, Accuracy 89.03, Time Cost 1.03
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    # export to a fresh path so the pruned checkpoint saved above is kept
    model_path = os.path.join(args.experiment_data_dir, 'quantized.pth')
    calibration_path = os.path.join(args.experiment_data_dir,
                                    'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model,
                                  input_shape,
                                  config=calibration_config,
                                  batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)
Example #4
# NOTE: TorchModel, trainer, evaluator, and device are assumed to be defined
# earlier in the tutorial this snippet is taken from.
import torch
import torch.nn.functional as F
from torch.optim import SGD

config_list = [{
    'quant_types': ['input', 'weight'],
    'quant_bits': {
        'input': 8,
        'weight': 8
    },
    'op_names': ['fc1', 'fc2']
}]

model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28, 28).to(device)

from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()

# %%
# fine-tune the model with QAT (quantization-aware training)
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)

# %%
# export the model and get the calibration config
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
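
As in the other examples on this page, the exported calibration_config can then be handed to TensorRT speedup. A minimal sketch, assuming the ModelSpeedupTensorRT import path below and the same 1x28x28 MNIST input:

# a minimal sketch, assuming this import path and a 28x28 single-channel input
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT

batch_size = 32
input_shape = (batch_size, 1, 28, 28)
engine = ModelSpeedupTensorRT(model,
                              input_shape,
                              config=calibration_config,
                              batchsize=batch_size)
engine.compress()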
Example #5
    def test_qat_quantization_speedup(self):
        model = BackboneModel()

        configure_list = [{
            'quant_types': ['input', 'weight'],
            'quant_bits': {
                'input': 8,
                'weight': 8
            },
            'op_names': ['conv1']
        }, {
            'quant_types': ['output'],
            'quant_bits': {
                'output': 8
            },
            'op_names': ['relu1']
        }, {
            'quant_types': ['input', 'weight'],
            'quant_bits': {
                'input': 8,
                'weight': 8
            },
            'op_names': ['conv2']
        }, {
            'quant_types': ['output'],
            'quant_bits': {
                'output': 8
            },
            'op_names': ['relu2']
        }]

        # fine-tune the model with QAT (quantization-aware training)
        dummy_input = torch.randn(1, 1, 28, 28)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
        quantizer = QAT_Quantizer(model, configure_list, optimizer,
                                  dummy_input)
        quantizer.compress()

        model.to(self.device)
        for epoch in range(1):
            print('# Epoch {} #'.format(epoch))
            self._train(model, optimizer)
            self._test(model)

        model_path = "mnist_model.pth"
        calibration_path = "mnist_calibration.pth"
        calibration_config = quantizer.export_model(model_path,
                                                    calibration_path)

        self._test(model)

        print("calibration_config: ", calibration_config)

        batch_size = 32
        input_shape = (batch_size, 1, 28, 28)

        engine = ModelSpeedupTensorRT(model,
                                      input_shape,
                                      config=calibration_config,
                                      batchsize=batch_size)
        engine.compress()

        self._test_trt(engine)

        os.remove(model_path)
        os.remove(calibration_path)
Example #6
def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    trans = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    train_loader = torch.utils.data.DataLoader(datasets.MNIST('data',
                                                              train=True,
                                                              download=True,
                                                              transform=trans),
                                               batch_size=64,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('data',
                                                             train=False,
                                                             transform=trans),
                                              batch_size=1000,
                                              shuffle=True)

    # Two things should be kept in mind when setting this configure_list:
    # 1. When deploying the model on a backend, some layers will be fused into one layer. For example, consecutive
    # conv + bn + relu layers will be fused into one big layer. If we want to execute the big layer in quantization
    # mode, we should tell the backend the quantization information for the input, output, and weight tensors of
    # the big layer, which correspond to conv's input, conv's weight, and relu's output.
    # 2. The same tensor should be quantized only once. For example, if a tensor is the output of layer A and the input
    # of layer B, you should configure either {'quant_types': ['output'], 'op_names': ['a']} or
    # {'quant_types': ['input'], 'op_names': ['b']} in the configure_list, but not both.

    configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {
            'weight': 8,
            'input': 8
        },
        'op_names': ['conv1', 'conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8,
        },
        'op_names': ['relu1', 'relu2']
    }, {
        'quant_types': ['output', 'weight', 'input'],
        'quant_bits': {
            'output': 8,
            'weight': 8,
            'input': 8
        },
        'op_names': ['fc1', 'fc2'],
    }]

    # You can also set the quantization dtype and scheme layer-wise through the configure_list, e.g.:
    # configure_list = [{
    #         'quant_types': ['weight', 'input'],
    #         'quant_bits': {'weight': 8, 'input': 8},
    #         'op_names': ['conv1', 'conv2'],
    #         'quant_dtype': 'int',
    #         'quant_scheme': 'per_channel_symmetric'
    #       }]
    # For now, quant_dtype's options are 'int' and 'uint', and quant_scheme's options are per_tensor_affine,
    # per_tensor_symmetric, per_channel_affine, and per_channel_symmetric.
    set_quant_scheme_dtype('weight', 'per_channel_symmetric', 'int')
    set_quant_scheme_dtype('output', 'per_tensor_symmetric', 'int')
    set_quant_scheme_dtype('input', 'per_tensor_symmetric', 'int')

    model = NaiveModel().to(device)
    dummy_input = torch.randn(1, 1, 28, 28).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    # To enable batch normalization folding in the training process, you should
    # pass dummy_input to the QAT_Quantizer.
    quantizer = QAT_Quantizer(model,
                              configure_list,
                              optimizer,
                              dummy_input=dummy_input)
    quantizer.compress()
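    # with dummy_input supplied, consecutive conv + bn pairs are folded during
    # training, so the simulated quantization matches the fused layer that a
    # deployment backend would actually execute (see the notes above)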

    model.to(device)
    for epoch in range(40):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    onnx_path = "mnist_model.onnx"
    input_shape = (1, 1, 28, 28)
    device = torch.device("cuda")

    calibration_config = quantizer.export_model(model_path, calibration_path,
                                                onnx_path, input_shape, device)
    print("Generated calibration config is: ", calibration_config)