def test_manual_mode(self):
    torch.cuda.empty_cache()
    net = mnist_model.Net()
    model = net.to(torch.device('cuda'))

    # Add a quantization wrapper to the first convolution layer
    for module_name, module_ref in model.named_children():
        if module_name == 'conv1':
            quantized_module = QcPostTrainingWrapper(module_ref, weight_bw=8, activation_bw=8,
                                                     round_mode='nearest',
                                                     quant_scheme=QuantScheme.post_training_tf)
            setattr(model, module_name, quantized_module)

    sim = QuantizationSimModel(model, dummy_input=torch.rand(1, 1, 28, 28).cuda())

    # Quantize the untrained MNIST model
    sim.compute_encodings(self.forward_pass, forward_pass_callback_args=5)

    # Run some inferences
    mnist_torch_model.evaluate(model=sim.model, iterations=100, use_cuda=True)

    # Train the model again
    mnist_model.train(model=sim.model, epochs=1, num_batches=3,
                      batch_callback=check_if_layer_weights_are_updating, use_cuda=True)
def test_collect_inp_out_data_quantsim_model_gpu(self):
    """ Test collecting input and output data from a module of a quantsim model on GPU """
    device_list = [torch.device('cuda:0')]

    for device in device_list:
        model = TinyModel().to(device=device)
        model_input = torch.randn(1, 3, 32, 32).to(device=device)
        sim = QuantizationSimModel(model, dummy_input=torch.rand(1, 3, 32, 32).to(device=device))

        module_data = utils.ModuleData(model, model.fc)
        inp, out = module_data.collect_inp_out_data(model_input, collect_input=False, collect_output=True)
        fc_out = sim.model(model_input)
        self.assertFalse(np.array_equal(utils.to_numpy(out), utils.to_numpy(fc_out)))

        module_data = utils.ModuleData(model, model.conv1)
        inp, out = module_data.collect_inp_out_data(model_input, collect_input=True, collect_output=False)
        self.assertTrue(np.array_equal(utils.to_numpy(inp), utils.to_numpy(model_input)))
def main():
    args = arguments()
    seed(args)

    model = DeepLab(backbone='mobilenet', output_stride=16, num_classes=21, sync_bn=False)
    model.eval()

    from aimet_torch import batch_norm_fold
    from aimet_torch import utils
    args.input_shape = (1, 3, 513, 513)
    batch_norm_fold.fold_all_batch_norms(model, args.input_shape)
    utils.replace_modules_of_type1_with_type2(model, torch.nn.ReLU6, torch.nn.ReLU)

    if args.checkpoint_path:
        model.load_state_dict(torch.load(args.checkpoint_path))
    else:
        raise ValueError('checkpoint path {} must be specified'.format(args.checkpoint_path))

    data_loader_kwargs = {'worker_init_fn': work_init, 'num_workers': 0}
    train_loader, val_loader, test_loader, num_class = make_data_loader(args, **data_loader_kwargs)
    eval_func_quant = model_eval(args, val_loader)
    eval_func = model_eval(args, val_loader)

    from aimet_common.defs import QuantScheme
    from aimet_torch.quantsim import QuantizationSimModel
    if hasattr(args, 'quant_scheme'):
        if args.quant_scheme == 'range_learning_tf':
            quant_scheme = QuantScheme.training_range_learning_with_tf_init
        elif args.quant_scheme == 'range_learning_tfe':
            quant_scheme = QuantScheme.training_range_learning_with_tf_enhanced_init
        elif args.quant_scheme == 'tf':
            quant_scheme = QuantScheme.post_training_tf
        elif args.quant_scheme == 'tf_enhanced':
            quant_scheme = QuantScheme.post_training_tf_enhanced
        else:
            raise ValueError("Got unrecognized quant_scheme: " + args.quant_scheme)
        kwargs = {
            'quant_scheme': quant_scheme,
            'default_param_bw': args.default_param_bw,
            'default_output_bw': args.default_output_bw,
            'config_file': args.config_file
        }
    print(kwargs)
    sim = QuantizationSimModel(model.cpu(), input_shapes=args.input_shape, **kwargs)
    sim.compute_encodings(eval_func_quant, (1024, True))
    post_quant_top1 = eval_func(sim.model.cuda(), (99999999, True))
    print("Post Quant mIoU :", post_quant_top1)
def test_memory_leak_during_quantization_train(self):
    # First get baseline numbers
    base_pre_model_load_mark = torch.cuda.memory_allocated()
    model = models.vgg16(pretrained=True)
    model = model.to(torch.device('cuda'))
    base_model_loaded_mark = torch.cuda.memory_allocated()

    _ = model_train(model=model, epochs=2)
    base_model_train_mark = torch.cuda.memory_allocated()
    base_model_train_delta = base_model_train_mark - base_model_loaded_mark

    print("Usage Report ------")
    print("Model pre-load = {}".format(base_pre_model_load_mark))
    print("Model load = {}".format(base_model_loaded_mark))
    print("Model train delta = {}".format(base_model_train_delta))

    del model
    baseline_leaked_mem = torch.cuda.memory_allocated() - base_pre_model_load_mark
    print("Leaked during train = {}".format(baseline_leaked_mem))

    model = models.vgg16(pretrained=True)
    model = model.to(torch.device('cuda'))
    base_model_loaded_mark = torch.cuda.memory_allocated()

    # Now use AIMET
    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               default_param_bw=8, default_output_bw=4,
                               dummy_input=torch.rand(1, 3, 224, 224).cuda())
    sim.compute_encodings(model_eval, forward_pass_callback_args=1)
    print(sim.model)

    aimet_model_quantize_mark = torch.cuda.memory_allocated()
    aimet_model_quantize_delta = aimet_model_quantize_mark - base_model_loaded_mark

    _ = model_train(model=sim.model, epochs=2, callback=check_if_layer_weights_are_updating)

    aimet_model_train_mark = torch.cuda.memory_allocated()
    aimet_model_train_delta = aimet_model_train_mark - aimet_model_quantize_mark
    leaked_memory = aimet_model_train_delta - base_model_train_delta + baseline_leaked_mem

    print("")
    print("Usage Report ------")
    print("Model load = {}".format(base_model_loaded_mark))
    print("AIMET quantize delta = {}".format(aimet_model_quantize_delta))
    print("AIMET train delta = {}".format(aimet_model_train_delta))
    print("Leaked memory = {}".format(leaked_memory))

    # During training, memory is held for a longer duration by PyTorch, so this assert
    # occasionally fails, including when the test is run individually. The tolerance is
    # bumped up to cover the case where all tests are run together.
    self.assertLessEqual(leaked_memory, 2000000)
def main():
    args = arguments()
    seed(args)

    if args.model_path:
        model = torch.load(args.model_path)
    else:
        raise ValueError('Model path {} must be specified'.format(args.model_path))

    model.eval()
    input_shape = (1, 3, 224, 224)
    image_size = input_shape[-1]

    eval_func_quant = model_eval(args.images_dir + '/val/', image_size,
                                 batch_size=args.batch_size, num_workers=0, quant=True)
    eval_func = model_eval(args.images_dir + '/val/', image_size,
                           batch_size=args.batch_size, num_workers=16)

    from aimet_common.defs import QuantScheme
    from aimet_torch.quantsim import QuantizationSimModel
    if hasattr(args, 'quant_scheme'):
        if args.quant_scheme == 'range_learning_tf':
            quant_scheme = QuantScheme.training_range_learning_with_tf_init
        elif args.quant_scheme == 'range_learning_tfe':
            quant_scheme = QuantScheme.training_range_learning_with_tf_enhanced_init
        elif args.quant_scheme == 'tf':
            quant_scheme = QuantScheme.post_training_tf
        elif args.quant_scheme == 'tf_enhanced':
            quant_scheme = QuantScheme.post_training_tf_enhanced
        else:
            raise ValueError("Got unrecognized quant_scheme: " + args.quant_scheme)
        kwargs = {
            'quant_scheme': quant_scheme,
            'default_param_bw': args.default_param_bw,
            'default_output_bw': args.default_output_bw,
            'config_file': args.config_file
        }
    print(kwargs)
    sim = QuantizationSimModel(model.cpu(), input_shapes=input_shape, **kwargs)
    sim.compute_encodings(eval_func_quant, (32, True))
    post_quant_top1 = eval_func(sim.model.cuda(), (0, True))
    print("Post Quant Top1 :", post_quant_top1)
def test_parse_config_file_model_outputs(self):
    """ Test that model output quantization parameters are set correctly when using a json config file """
    model = SingleResidual()
    model.eval()

    quantsim_config = {
        "defaults": {
            "ops": {},
            "params": {}
        },
        "params": {},
        "op_type": {},
        "supergroups": [],
        "model_input": {},
        "model_output": {
            "is_output_quantized": "True"
        }
    }
    with open('./data/quantsim_config.json', 'w') as f:
        json.dump(quantsim_config, f)

    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               config_file='./data/quantsim_config.json',
                               dummy_input=torch.rand(1, 3, 32, 32))
    for name, module in sim.model.named_modules():
        if isinstance(module, QcQuantizeWrapper):
            if name == 'fc':
                # fc produces the model output, so its output quantizer should be enabled
                assert module.output_quantizers[0].enabled
            else:
                assert not module.output_quantizers[0].enabled
            assert not module.input_quantizer.enabled

    if os.path.exists('./data/quantsim_config.json'):
        os.remove('./data/quantsim_config.json')
def get_simulations(model, args):
    from aimet_common.defs import QuantScheme
    from aimet_torch.quantsim import QuantizationSimModel
    if hasattr(args, 'quant_scheme'):
        if args.quant_scheme == 'range_learning_tf':
            quant_scheme = QuantScheme.training_range_learning_with_tf_init
        elif args.quant_scheme == 'range_learning_tfe':
            quant_scheme = QuantScheme.training_range_learning_with_tf_enhanced_init
        elif args.quant_scheme == 'tf':
            quant_scheme = QuantScheme.post_training_tf
        elif args.quant_scheme == 'tf_enhanced':
            quant_scheme = QuantScheme.post_training_tf_enhanced
        else:
            raise ValueError("Got unrecognized quant_scheme: " + args.quant_scheme)
        kwargs = {
            'quant_scheme': quant_scheme,
            'default_param_bw': args.default_param_bw,
            'default_output_bw': args.default_output_bw,
            'config_file': args.config_file
        }
    print(kwargs)
    sim = QuantizationSimModel(model.cpu(), input_shapes=args.input_shape, **kwargs)
    return sim
def test_memory_leak_during_quantization_eval(self):
    # First get baseline numbers
    base_pre_model_load_mark = torch.cuda.memory_allocated()
    model = models.vgg16(pretrained=True)
    model = model.to(torch.device('cuda'))
    base_model_loaded_mark = torch.cuda.memory_allocated()

    _ = model_eval(model=model, early_stopping_iterations=10)
    base_model_eval_mark = torch.cuda.memory_allocated()
    base_model_eval_delta = base_model_eval_mark - base_model_loaded_mark

    print("Usage Report ------")
    print("Model pre-load = {}".format(base_pre_model_load_mark))
    print("Model load = {}".format(base_model_loaded_mark))
    print("Model eval delta = {}".format(base_model_eval_delta))

    del model
    print("Leaked during eval = {}".format(torch.cuda.memory_allocated() - base_pre_model_load_mark))

    model = models.vgg16(pretrained=True)
    model = model.to(torch.device('cuda'))
    base_model_loaded_mark = torch.cuda.memory_allocated()

    # Now use AIMET
    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               default_param_bw=8, default_output_bw=4,
                               dummy_input=torch.rand(1, 3, 224, 224).cuda())
    sim.compute_encodings(model_eval, forward_pass_callback_args=1)

    aimet_model_quantize_mark = torch.cuda.memory_allocated()
    aimet_model_quantize_delta = aimet_model_quantize_mark - base_model_loaded_mark

    for i in range(1):
        _ = model_eval(model=sim.model, early_stopping_iterations=10)
    aimet_model_eval_mark = torch.cuda.memory_allocated()
    aimet_model_eval_delta = aimet_model_eval_mark - aimet_model_quantize_mark

    print("")
    print("Usage Report ------")
    print("Model load = {}".format(base_model_loaded_mark))
    print("AIMET quantize delta = {}".format(aimet_model_quantize_delta))
    print("AIMET eval delta = {}".format(aimet_model_eval_delta))

    self.assertEqual(0, aimet_model_eval_delta)
def test_parse_config_file_supergroups(self):
    """ Test that supergroup quantization parameters are set correctly when using a json config file """
    model = TinyModel()
    model.eval()

    quantsim_config = {
        "defaults": {
            "ops": {
                "is_output_quantized": "True",
                "is_symmetric": "False"
            },
            "params": {
                "is_quantized": "False",
                "is_symmetric": "False"
            }
        },
        "params": {},
        "op_type": {},
        "supergroups": [
            {
                "op_list": ["Conv", "BatchNormalization"]
            },
            {
                "op_list": ["Relu", "MaxPool"]
            },
            {
                "op_list": ["Conv", "Relu", "AveragePool"]
            }
        ],
        "model_input": {},
        "model_output": {}
    }
    with open('./data/quantsim_config.json', 'w') as f:
        json.dump(quantsim_config, f)

    # Use in_place=True here for easy access to modules through model instance variables
    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               config_file='./data/quantsim_config.json', in_place=True,
                               dummy_input=torch.rand(1, 3, 32, 32))
    for _, module in sim.model.named_modules():
        if isinstance(module, QcQuantizeWrapper):
            # Check configs for starts of supergroups
            if module in [model.conv1, model.relu1, model.conv2, model.conv3]:
                assert not module.output_quantizers[0].enabled
            # Check configs for middle ops in supergroups
            elif module == model.relu3:
                assert not module.input_quantizer.enabled
                assert not module.output_quantizers[0].enabled
            # Check configs for ends of supergroups
            elif module in [model.bn1, model.maxpool, model.bn2, model.avgpool]:
                assert not module.input_quantizer.enabled
                assert module.output_quantizers[0].enabled
            else:
                assert not module.input_quantizer.enabled
                assert module.output_quantizers[0].enabled

    if os.path.exists('./data/quantsim_config.json'):
        os.remove('./data/quantsim_config.json')
def test_quantize_resnet18(self):
    torch.cuda.empty_cache()

    # Train the model using tiny imagenet data
    model = models.resnet18(pretrained=False)
    _ = model_train(model, epochs=2)
    model = model.to(torch.device('cuda'))

    # layers_to_ignore = [model.conv1]
    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf,
                               default_param_bw=8, default_output_bw=8,
                               dummy_input=torch.rand(1, 3, 224, 224).cuda())
    print(sim.model)

    # If 'iterations' is set to None, iterate over all the validation data
    sim.compute_encodings(model_eval, forward_pass_callback_args=400)
    quantized_model_accuracy = model_eval(model=sim.model, early_stopping_iterations=None)

    print("Quantized model accuracy=", quantized_model_accuracy)
    self.assertGreaterEqual(quantized_model_accuracy, 0.5)
def test_quantsim_export(self):
    torch.manual_seed(10)
    model = Model2(Add())
    dummy_input = torch.randn(5, 10, 10, 20)
    sim = QuantizationSimModel(model, dummy_input)

    encodings = libpymo.TfEncoding()
    encodings.bw = 8
    encodings.max = 5
    encodings.min = -5
    encodings.delta = 1
    encodings.offset = 0.2

    sim.model.op1.output_quantizer.encoding = encodings
    sim.model.conv1.output_quantizer.encoding = encodings
    sim.model.conv1.param_quantizers['weight'].encoding = encodings

    sim.export(path='./data', filename_prefix='quant_model', dummy_input=dummy_input)
    with open('./data/quant_model.encodings') as f:
        data = json.load(f)

    self.assertTrue(isinstance(data['activation_encodings']['3'], list))
    self.assertTrue(isinstance(data['activation_encodings']['4'], list))
def test_retraining_on_quantized_model_first_step(self):
    torch.cuda.empty_cache()
    model = mnist_model.Net().to(torch.device('cuda'))

    sim = QuantizationSimModel(model, default_output_bw=4, default_param_bw=4,
                               dummy_input=torch.rand(1, 1, 28, 28).cuda())

    # Quantize the untrained MNIST model
    sim.compute_encodings(self.forward_pass, forward_pass_callback_args=5)

    # Train the model for one epoch
    mnist_model.train(model=sim.model, epochs=1, num_batches=3,
                      batch_callback=check_if_layer_weights_are_updating, use_cuda=True)

    # Checkpoint the model
    save_checkpoint(sim, os.path.join(path, 'checkpoint.pt'))
def test_with_finetuning(self):
    torch.cuda.empty_cache()
    model = mnist_model.Net().to(torch.device('cuda'))
    mnist_torch_model.evaluate(model=model, iterations=None, use_cuda=True)

    sim = QuantizationSimModel(model, dummy_input=torch.rand(1, 1, 28, 28).cuda())

    # Quantize the untrained MNIST model
    sim.compute_encodings(self.forward_pass, forward_pass_callback_args=5)

    # Run some inferences
    mnist_torch_model.evaluate(model=sim.model, iterations=None, use_cuda=True)

    # Train the model again
    mnist_model.train(sim.model, epochs=1, num_batches=3,
                      batch_callback=check_if_layer_weights_are_updating, use_cuda=True)
def test_parse_config_file_defaults(self):
    """ Test that default quantization parameters are set correctly when using a json config file """
    model = SingleResidual()
    model.eval()

    quantsim_config = {
        "defaults": {
            "ops": {
                "is_output_quantized": "True",
                "is_symmetric": "False"
            },
            "params": {
                "is_quantized": "False",
                "is_symmetric": "True"
            }
        },
        "params": {},
        "op_type": {},
        "supergroups": [],
        "model_input": {},
        "model_output": {}
    }
    with open('./data/quantsim_config.json', 'w') as f:
        json.dump(quantsim_config, f)

    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               config_file='./data/quantsim_config.json',
                               dummy_input=torch.rand(1, 3, 32, 32), in_place=True)
    for name, module in sim.model.named_modules():
        if isinstance(module, QcQuantizeWrapper):
            # Output of the add op is input quantized
            if name == 'relu3':
                assert module.input_quantizer.enabled
            else:
                assert not module.input_quantizer.enabled
            assert module.output_quantizers[0].enabled
            assert not module.input_quantizer.use_symmetric_encodings
            assert not module.output_quantizers[0].use_symmetric_encodings
            if module.param_quantizers:
                for _, param_quantizer in module.param_quantizers.items():
                    assert not param_quantizer.enabled
                    assert param_quantizer.use_symmetric_encodings

    if os.path.exists('./data/quantsim_config.json'):
        os.remove('./data/quantsim_config.json')
def test_supergroups_with_elementwise_add(self):
    """ Test that supergroup quantization parameters are set correctly when using a json config file """
    model = SingleResidual()
    model.eval()

    quantsim_config = {
        "defaults": {
            "ops": {
                "is_output_quantized": "True"
            },
            "params": {}
        },
        "params": {},
        "op_type": {},
        "supergroups": [
            {
                "op_list": ["Add", "Relu"]
            }
        ],
        "model_input": {},
        "model_output": {}
    }
    with open('./data/quantsim_config.json', 'w') as f:
        json.dump(quantsim_config, f)

    # Use in_place=True here for easy access to modules through model instance variables
    sim = QuantizationSimModel(model, quant_scheme=QuantScheme.post_training_tf_enhanced,
                               config_file='./data/quantsim_config.json', in_place=True,
                               dummy_input=torch.rand(1, 3, 32, 32))
    for _, module in sim.model.named_modules():
        if isinstance(module, QcQuantizeWrapper):
            # Check configs for starts of supergroups
            if module == model.relu3:
                # If add were not part of the supergroup, relu's input quantizer would be enabled
                assert not module.input_quantizer.enabled

    if os.path.exists('./data/quantsim_config.json'):
        os.remove('./data/quantsim_config.json')
def export_and_generate_encodings(model, params):
    os.makedirs(params.log_path)
    enc_ds = create_encoder_dataset(params, return_type='dataset')

    def evaluator_enc(model, iterations):
        for query_id in tqdm(range(enc_ds.get_item_count())):
            query_ids = [query_id]
            enc_ds.load_query_samples(query_ids)
            img, label = enc_ds.get_samples(query_ids)
            with torch.no_grad():
                _ = model(img)
            enc_ds.unload_query_samples(query_ids)

    quantizer = QuantizationSimModel(model=model,
                                     input_shapes=params.input_shape_tuple,
                                     quant_scheme=params.quant_scheme,
                                     rounding_mode=params.rounding_mode,
                                     default_output_bw=params.default_bitwidth,
                                     default_param_bw=params.default_bitwidth,
                                     in_place=False,
                                     config_file=params.config_file)
    quantizer_modifications(quantizer)

    quantizer.compute_encodings(forward_pass_callback=evaluator_enc, forward_pass_callback_args=1)
    quantizer.export(path=params.log_path, filename_prefix=params.filename_prefix,
                     input_shape=params.input_shape_tuple)

    input_file = os.path.join(params.log_path, '%s.encodings' % str(params.filename_prefix))
    remap_bitwidth_to_32(input_file)

    with open(os.path.join(params.log_path, params.my_filename), 'wb') as f:
        pickle.dump(params, f)

    return quantizer
def quantize_model(trainer_function):
    model = mnist_torch_model.Net().to(torch.device('cuda'))

    sim = QuantizationSimModel(model, default_output_bw=8, default_param_bw=8,
                               dummy_input=torch.rand(1, 1, 28, 28),
                               config_file='../../../TrainingExtensions/common/src/python/aimet_common/'
                                           'quantsim_config/default_config.json')

    # Quantize the untrained MNIST model
    sim.compute_encodings(forward_pass_callback=evaluate_model, forward_pass_callback_args=5)

    # Fine-tune the model's parameters using training
    trainer_function(model=sim.model, epochs=1, num_batches=100, use_cuda=True)

    # Export the model
    sim.export(path='./', filename_prefix='quantized_mnist', dummy_input=torch.rand(1, 1, 28, 28))
def test_and_compare_quantizer_no_fine_tuning_CPU_and_GPU(self):
    torch.manual_seed(1)
    torch.backends.cudnn.deterministic = True

    dummy_input = torch.rand(1, 1, 28, 28)
    dummy_input_cuda = dummy_input.cuda()

    start_time = time.time()

    # Create model on CPU
    model_cpu = mnist_model.Net().to('cpu')
    model_gpu = copy.deepcopy(model_cpu).to('cuda')
    cpu_sim_model = QuantizationSimModel(model_cpu, quant_scheme='tf', in_place=True,
                                         dummy_input=dummy_input)
    # Quantize
    cpu_sim_model.compute_encodings(forward_pass, None)

    print("Encodings for cpu model calculated")
    print("Took {} secs".format(time.time() - start_time))
    start_time = time.time()

    # Create model on GPU
    gpu_sim_model = QuantizationSimModel(model_gpu, quant_scheme='tf', in_place=True,
                                         dummy_input=dummy_input_cuda)
    # Quantize
    gpu_sim_model.compute_encodings(forward_pass, None)

    print("Encodings for gpu model calculated")
    print("Took {} secs".format(time.time() - start_time))

    # Compare only the min and max of the encodings.
    # assertAlmostEqual tests that first and second are approximately (or not approximately)
    # equal by computing the difference, rounding to the given number of decimal places
    # (default 7), and comparing to zero. Note that it rounds the values to the given number
    # of decimal places (i.e. like the round() function), not to significant digits.
    # fc1 is excluded since it is part of a Matmul->Relu supergroup.
    # assertEqual cannot be used for fc2, so assertAlmostEqual is used instead.
    self.assertAlmostEqual(model_gpu.conv1.output_quantizers[0].encoding.min,
                           model_cpu.conv1.output_quantizers[0].encoding.min, delta=0.001)
    self.assertAlmostEqual(model_gpu.conv1.output_quantizers[0].encoding.max,
                           model_cpu.conv1.output_quantizers[0].encoding.max, delta=0.001)
    self.assertAlmostEqual(model_gpu.conv2.output_quantizers[0].encoding.min,
                           model_cpu.conv2.output_quantizers[0].encoding.min, delta=0.001)
    self.assertAlmostEqual(model_gpu.conv2.output_quantizers[0].encoding.max,
                           model_cpu.conv2.output_quantizers[0].encoding.max, delta=0.001)
    self.assertAlmostEqual(model_gpu.fc2.output_quantizers[0].encoding.min,
                           model_cpu.fc2.output_quantizers[0].encoding.min, delta=0.001)
    self.assertAlmostEqual(model_gpu.fc2.output_quantizers[0].encoding.max,
                           model_cpu.fc2.output_quantizers[0].encoding.max, delta=0.001)

    gpu_sim_model.export("./data/", "quantizer_no_fine_tuning__GPU", dummy_input)
    cpu_sim_model.export("./data/", "quantizer_no_fine_tuning__CPU", dummy_input)

    self.assertEqual(torch.device('cuda:0'), next(model_gpu.parameters()).device)
    self.assertEqual(torch.device('cpu'), next(model_cpu.parameters()).device)
def quantize_model(model, bitwidth=8, layerwise_bitwidth=None, retrain=True, ref_model=None,
                   flags=None, adaround=False, lr=0.00000001):
    res = check_metrics(dataloader, model, image_resolution)
    print(res)

    input_shape = coord_dataset.mgrid.shape
    dummy_in = ((torch.rand(input_shape).unsqueeze(0) * 2) - 1).cuda()
    aimet_dataloader = DataLoader(AimetDataset(coord_dataset), shuffle=True, batch_size=1,
                                  pin_memory=True, num_workers=0)

    # Create QuantSim using adarounded_model
    sim = QuantizationSimModel(model, default_param_bw=bitwidth, default_output_bw=31,
                               dummy_input=dummy_in)

    modules_to_exclude = (Sine, ImageDownsampling, PosEncodingNeRF,
                          FourierFeatureEncodingPositional, FourierFeatureEncodingGaussian)
    excl_layers = []
    for mod in sim.model.modules():
        if isinstance(mod, QcPostTrainingWrapper) and isinstance(mod._module_to_wrap, modules_to_exclude):
            excl_layers.append(mod)
    sim.exclude_layers_from_quantization(excl_layers)

    i = 0
    for name, mod in sim.model.named_modules():
        if isinstance(mod, QcPostTrainingWrapper):
            mod.output_quantizer.enabled = False
            mod.input_quantizer.enabled = False
            weight_quantizer = mod.param_quantizers['weight']
            bias_quantizer = mod.param_quantizers['bias']
            weight_quantizer.use_symmetric_encodings = True
            bias_quantizer.use_symmetric_encodings = True
            if torch.count_nonzero(mod._module_to_wrap.bias.data):
                mod.param_quantizers['bias'].enabled = True
            if layerwise_bitwidth:
                mod.param_quantizers['bias'].bitwidth = layerwise_bitwidth[i]
                mod.param_quantizers['weight'].bitwidth = layerwise_bitwidth[i]
                i += 1

    res = check_metrics(dataloader, sim.model, image_resolution)
    print(res)

    if adaround:
        params = AdaroundParameters(data_loader=aimet_dataloader, num_batches=1,
                                    default_num_iterations=500, default_reg_param=0.001,
                                    default_beta_range=(20, 2))
        # adarounded_model_1 = Adaround.apply_adaround(model=model, dummy_input=dummy_in, params=params,
        #                                              path='', filename_prefix='adaround',
        #                                              default_param_bw=bitwidth, ignore_quant_ops_list=excl_layers)
        # Compute only param encodings
        Adaround._compute_param_encodings(sim)
        # Get the module - activation function pair using ConnectedGraph
        module_act_func_pair = connectedgraph_utils.get_module_act_func_pair(model, dummy_in)
        Adaround._adaround_model(model, sim, module_act_func_pair, params, dummy_in)
        # res = check_metrics(dataloader, sim.model, image_resolution)
        # print('1st stage ada round ', res)
        # Update every module (AdaroundSupportedModules) weight with Adarounded weight (Soft rounding)
        Adaround._update_modules_with_adarounded_weights(sim)
        path = ''

    # from aimet_torch.cross_layer_equalization import equalize_model
    # equalize_model(model, input_shape)
    # params = QuantParams(weight_bw=4, act_bw=4, round_mode="nearest", quant_scheme='tf_enhanced')
    # # Perform Bias Correction
    # bias_correction.correct_bias(model.to(device="cuda"), params, num_quant_samples=1,
    #                              data_loader=aimet_dataloader, num_bias_correct_samples=1)
    # torch.save(sim.model,
    #            os.path.join(
    #                os.path.join(exp_folder,
    #                             image_name + '/checkpoints/model_aimet_quantized.pth')))

    quantized_model = sim.model
    # res = check_metrics(dataloader, sim.model, image_resolution)
    # print('After Adaround ', res)

    # if retrain:
    #     loss_fn = partial(loss_functions.image_mse, None)
    #     # quantized_model = retrain_model(sim.model, dataloader, 200, loss_fn, 0.0000005,
    #     #                                 flags['l1_reg'] if flags is not None else 0)
    #     quantized_model = retrain_model(sim.model, dataloader, 300, loss_fn, lr,
    #                                     flags['l1_reg'] if flags is not None else 0)
    #     # Fine-tune the model's parameter using training
    #     # torch.save(quantized_model,
    #     #            os.path.join(
    #     #                os.path.join(exp_folder,
    #     #                             image_name + '/checkpoints/model_aimet_quantized_retrained.pth')))
    #     res = check_metrics(dataloader, quantized_model, image_resolution)
    #     print('After retraining ', res)

    # state_dict = {}
    # quantized_dict = {}
    # for name, module in sim.model.named_modules():
    #     if isinstance(module, QcPostTrainingWrapper) and isinstance(module._module_to_wrap, torch.nn.Linear):
    #         weight_quantizer = module.param_quantizers['weight']
    #         bias_quantizer = module.param_quantizers['bias']
    #         weight_quantizer.enabled = True
    #         bias_quantizer.enabled = True
    #         weight_quantizer.use_soft_rounding = False
    #         bias_quantizer.use_soft_rounding = False
    #         wrapped_linear = module._module_to_wrap
    #         weight = wrapped_linear.weight
    #         bias = wrapped_linear.bias
    #         if not (torch.all(weight < weight_quantizer.encoding.max) and
    #                 torch.all(weight > weight_quantizer.encoding.min)):
    #             print("not within bounds")
    #         weight_dequant = weight_quantizer.quantize_dequantize(weight,
    #                                                               weight_quantizer.round_mode).cpu().detach()
    #         state_dict[name + '.weight'] = weight_dequant
    #         # assert(len(torch.unique(state_dict[name + '.weight'])) <= 2**bitwidth)
    #         bias_dequant = bias_quantizer.quantize_dequantize(bias,
    #                                                           bias_quantizer.round_mode).cpu().detach()
    #         state_dict[name + '.bias'] = bias_dequant
    #         # assert(len(torch.unique(state_dict[name + '.bias'])) <= 2**bitwidth)
    #         quantized_weight = weight_dequant / weight_quantizer.encoding.delta
    #         quantized_bias = bias_dequant / bias_quantizer.encoding.delta
    #         weights_csc = scipy.sparse.csc_matrix(quantized_weight + weight_quantizer.encoding.offset)
    #         quantized_dict[name] = {'weight': {'data': quantized_weight, 'encoding': weight_quantizer.encoding},
    #                                 'bias': {'data': quantized_bias, 'encoding': bias_quantizer.encoding}}
    # res = check_metrics(dataloader, quantized_model, image_resolution)
    # print('After hard rounding ', res)

    if adaround:
        filename_prefix = 'adaround'
        # Export quantization encodings to JSON-formatted file
        Adaround._export_encodings_to_json(path, filename_prefix, sim)
        # res = check_metrics(dataloader, sim.model, image_resolution)
        SaveUtils.remove_quantization_wrappers(sim.model)
        adarounded_model = sim.model
        # print('After Adaround ', res)

        sim = QuantizationSimModel(adarounded_model, default_param_bw=bitwidth, default_output_bw=31,
                                   dummy_input=dummy_in)
        for mod in sim.model.modules():
            if isinstance(mod, QcPostTrainingWrapper) and isinstance(mod._module_to_wrap, modules_to_exclude):
                excl_layers.append(mod)
        sim.exclude_layers_from_quantization(excl_layers)

        i = 0
        for name, mod in sim.model.named_modules():
            if isinstance(mod, QcPostTrainingWrapper):
                mod.output_quantizer.enabled = False
                mod.input_quantizer.enabled = False
                weight_quantizer = mod.param_quantizers['weight']
                bias_quantizer = mod.param_quantizers['bias']
                weight_quantizer.use_symmetric_encodings = True
                bias_quantizer.use_symmetric_encodings = True
                if torch.count_nonzero(mod._module_to_wrap.bias.data):
                    mod.param_quantizers['bias'].enabled = True
                if layerwise_bitwidth:
                    mod.param_quantizers['bias'].bitwidth = layerwise_bitwidth[i]
                    mod.param_quantizers['weight'].bitwidth = layerwise_bitwidth[i]
                    i += 1
        sim.set_and_freeze_param_encodings(encoding_path='adaround.encodings')

    # Quantize the untrained MNIST model
    # sim.compute_encodings(forward_pass_callback=evaluate_model, forward_pass_callback_args=5)
    res = check_metrics(dataloader, sim.model, image_resolution)
    print(res)

    if retrain:
        loss_fn = partial(loss_functions.image_mse, None)
        # quantized_model = retrain_model(sim.model, dataloader, 200, loss_fn, 0.0000005,
        #                                 flags['l1_reg'] if flags is not None else 0)
        quantized_model = retrain_model(sim.model, dataloader, 1000, loss_fn, lr,
                                        flags['l1_reg'] if flags is not None else 0)
        # sim.compute_encodings(forward_pass_callback=evaluate_model, forward_pass_callback_args=5)
        # Fine-tune the model's parameter using training
        # torch.save(quantized_model,
        #            os.path.join(
        #                os.path.join(exp_folder,
        #                             image_name + '/checkpoints/model_aimet_quantized_retrained.pth')))
        res = check_metrics(dataloader, quantized_model, image_resolution)
        print('After retraining ', res)

    # w = sim.model.net.net[0][0]._module_to_wrap.weight
    # q = sim.model.net.net[0][0].param_quantizers['weight']
    # wq = q.quantize(w, q.round_mode)

    # Compute the difference for each parameter
    if ref_model is not None:
        new_state_dict = sim.model.state_dict()
        lis = [[i, j, a, b] for i, a in ref_model.named_parameters()
               for j, b in sim.model.named_parameters()
               if i == j.replace('._module_to_wrap', '')]
        for module in lis:
            new_state_dict[module[1]] = module[3] - module[2]
        sim.model.load_state_dict(new_state_dict)
        # sim.compute_encodings(forward_pass_callback=evaluate_model, forward_pass_callback_args=1)

    quantized_dict = {}
    state_dict = {}
    for name, module in sim.model.named_modules():
        if isinstance(module, QcPostTrainingWrapper) and isinstance(module._module_to_wrap, torch.nn.Linear):
            weight_quantizer = module.param_quantizers['weight']
            bias_quantizer = module.param_quantizers['bias']
            weight_quantizer.enabled = True
            bias_quantizer.enabled = True
            wrapped_linear = module._module_to_wrap
            weight = wrapped_linear.weight
            bias = wrapped_linear.bias
            if not (torch.all(weight < weight_quantizer.encoding.max) and
                    torch.all(weight > weight_quantizer.encoding.min)):
                print("not within bounds")
            state_dict[name + '.weight'] = weight_quantizer.quantize_dequantize(
                weight, weight_quantizer.round_mode).cpu().detach()
            # assert(len(torch.unique(state_dict[name + '.weight'])) <= 2**bitwidth)
            state_dict[name + '.bias'] = bias_quantizer.quantize_dequantize(
                bias, bias_quantizer.round_mode).cpu().detach()
            # assert(len(torch.unique(state_dict[name + '.bias'])) <= 2**bitwidth)
            quantized_weight = weight_quantizer.quantize(
                weight, weight_quantizer.round_mode).cpu().detach().numpy() + weight_quantizer.encoding.offset
            quantized_bias = bias_quantizer.quantize(
                bias, bias_quantizer.round_mode).cpu().detach().numpy() + bias_quantizer.encoding.offset
            weights_csc = scipy.sparse.csc_matrix(quantized_weight + weight_quantizer.encoding.offset)
            quantized_dict[name] = {'weight': {'data': quantized_weight, 'encoding': weight_quantizer.encoding},
                                    'bias': {'data': quantized_bias, 'encoding': bias_quantizer.encoding}}

    weights_np = []
    for l in quantized_dict.values():
        w = l['weight']['data']
        b = l['bias']['data']
        Q = l['weight']['encoding'].bw
        if Q < 9:
            tpe = 'int8'
        elif Q < 17:
            tpe = 'int16'
        else:
            tpe = 'int32'
        w = w.astype(tpe).flatten()
        weights_np.append(w)
        if l['bias']['encoding']:
            Q = l['bias']['encoding'].bw
            if Q < 9:
                tpe = 'int8'
            elif Q < 17:
                tpe = 'int16'
            else:
                tpe = 'int32'
            b = b.astype(tpe).flatten()
            weights_np.append(b)
    weights_np = np.concatenate(weights_np)
    comp = zlib.compress(weights_np, level=9)
    print(len(comp))

    # sim.export(path=os.path.join(
    #     os.path.join(exp_folder, image_name, 'checkpoints')),
    #     filename_prefix='model_aimet_quantized_retrained',
    #     dummy_input=dummy_in, set_onnx_layer_names=False)
    print(res)
    return quantized_model, res, len(comp), state_dict
def main():
    args = arguments()
    seed(args)

    if args.checkpoint:
        model = torch.load(args.checkpoint)
    else:
        model = load_model()
    model.eval()

    input_shape = (1, 3, 224, 224)
    args.input_shape = input_shape
    image_size = input_shape[-1]

    data_loader_kwargs = {'worker_init_fn': work_init, 'num_workers': args.num_workers}
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    val_transforms = transforms.Compose([
        transforms.Resize(image_size + 24),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize
    ])
    val_data = datasets.ImageFolder(args.images_dir + '/val/', val_transforms)
    val_dataloader = DataLoader(val_data, args.batch_size, shuffle=False, pin_memory=True,
                                **data_loader_kwargs)

    eval_func_quant = model_eval(val_dataloader, image_size, batch_size=args.batch_size, quant=True)
    eval_func = model_eval(val_dataloader, image_size, batch_size=args.batch_size)

    if 'BNfold' in args.quant_tricks:
        print("BN fold")
        model, conv_bn_pairs = run_pytorch_bn_fold(args, model)
    if 'CLE' in args.quant_tricks:
        print("CLE")
        model = run_pytorch_cross_layer_equalization(args, model)

    if hasattr(args, 'quant_scheme'):
        if args.quant_scheme == 'range_learning_tf':
            quant_scheme = QuantScheme.training_range_learning_with_tf_init
        elif args.quant_scheme == 'range_learning_tfe':
            quant_scheme = QuantScheme.training_range_learning_with_tf_enhanced_init
        elif args.quant_scheme == 'tf':
            quant_scheme = QuantScheme.post_training_tf
        elif args.quant_scheme == 'tf_enhanced':
            quant_scheme = QuantScheme.post_training_tf_enhanced
        else:
            raise ValueError("Got unrecognized quant_scheme: " + args.quant_scheme)
        kwargs = {
            'quant_scheme': quant_scheme,
            'default_param_bw': args.default_param_bw,
            'default_output_bw': args.default_output_bw,
            'config_file': args.config_file
        }
    print(kwargs)
    sim = QuantizationSimModel(model.cpu(), input_shapes=input_shape, **kwargs)

    # Manually configure the supergroups; AIMET currently does not support [Conv, ReLU6] as a supergroup
    from aimet_torch.qc_quantize_op import QcPostTrainingWrapper
    for quant_wrapper in sim.model.modules():
        if isinstance(quant_wrapper, QcPostTrainingWrapper):
            if isinstance(quant_wrapper._module_to_wrap, torch.nn.Conv2d):
                quant_wrapper.output_quantizer.enabled = False

    sim.model.blocks[0][0].conv_pw.output_quantizer.enabled = True
    sim.model.blocks[1][0].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[1][1].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[2][0].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[2][1].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[3][0].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[3][1].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[3][2].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[4][0].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[4][1].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[4][2].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[5][0].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[5][1].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[5][2].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[5][3].conv_pwl.output_quantizer.enabled = True
    sim.model.blocks[6][0].conv_pwl.output_quantizer.enabled = True

    sim.compute_encodings(eval_func_quant, (32, True))
    print(sim)
    post_quant_top1 = eval_func(sim.model.cuda(), (0, True))
    print("Post Quant Top1 :", post_quant_top1)