def test_s_prep_before_fusion(self):
    (
        mod,
        sparsifier,
        sparse_config,
    ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
    sparsifier.prepare(mod, config=sparse_config)
    tq.fuse_modules(mod, [["5", "6"]], inplace=True)
    mod[5].qconfig = tq.get_default_qconfig("fbgemm")
    tq.prepare(mod, inplace=True)

    # check that correct modules had parametrizations added and
    # that none were lost during prepare or fusion
    self.assertTrue(hasattr(mod[0], "parametrizations"))
    self.assertTrue(hasattr(mod[5][0], "parametrizations"))

    # check that correct observers were inserted and that matching
    # occurred successfully
    self.assertTrue(hasattr(mod[5], "activation_post_process"))
    _squash_mask_calibrate_and_convert(
        mod, sparsifier, torch.randn(1, 4, 4, 4)
    )

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
    self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
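# The tests in this section rely on a couple of small helpers that are not shown
# here. The sketches below are assumptions inferred from how the helpers are
# called, not the authoritative implementations.

def _calculate_sparsity(tensor):
    # Fraction of elements in the tensor that are exactly zero.
    return ((tensor == 0).sum() / tensor.numel()).item()


def _squash_mask_calibrate_and_convert(model, sparsifier, input):
    # Apply the sparsifier, fold the masks into the weights, run one
    # calibration pass, and convert the model to its quantized form in place.
    sparsifier.step()
    sparsifier.squash_mask()
    model(input)
    tq.convert(model, inplace=True)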
def _get_model_and_sparsifier_and_sparse_config(qconfig=None):
    model = nn.Sequential(
        nn.Linear(4, 4),  # 0
        nn.ReLU(),
        nn.Linear(4, 4),  # 2
        nn.ReLU(),
        tq.QuantStub(),
        nn.Linear(4, 4),  # 5
        nn.ReLU(),
        tq.DeQuantStub(),
    )
    if qconfig is None:
        model[4].qconfig = tq.get_default_qconfig("fbgemm")
        model[5].qconfig = tq.get_default_qconfig("fbgemm")
    else:
        model[4].qconfig = qconfig
        model[5].qconfig = qconfig

    sparsifier = sparsity.WeightNormSparsifier(**sparse_defaults)

    sparse_config = [
        {
            "module": model[5],
            "sparsity_level": 0.7,
            "sparse_block_shape": (1, 4),
            "zeros_per_block": 4,
        },
        model[0],
    ]
    return model, sparsifier, sparse_config
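# `sparse_defaults` above is assumed to be a module-level dict of default
# WeightNormSparsifier kwargs; a plausible definition (not shown in this
# section) would be:
sparse_defaults = {
    "sparsity_level": 0.8,
    "sparse_block_shape": (1, 4),
    "zeros_per_block": 4,
}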
def test_convert_without_squash_mask(self):
    (
        mod,
        sparsifier,
        sparse_config,
    ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
    sparsifier.prepare(mod, config=sparse_config)
    tq.prepare(mod, inplace=True)

    # check that correct modules had parametrizations added and
    # that none were lost during prepare
    self.assertTrue(hasattr(mod[0], "parametrizations"))
    self.assertTrue(hasattr(mod[5], "parametrizations"))

    # check that correct observers were inserted and that matching
    # occurred successfully
    self.assertTrue(hasattr(mod[5], "activation_post_process"))
    sparsifier.step()
    sparsity_level = _calculate_sparsity(mod[5].weight)
    mod(torch.randn(1, 4, 4, 4))
    tq.convert(mod, inplace=True)

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear))
    self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))

    # check that the module was actually sparsified
    cur_sparsity = _calculate_sparsity(mod[5]._weight_bias()[0])
    self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
    self.assertGreaterAlmostEqual(sparsity_level, sparse_config[0]["sparsity_level"])
    self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
def graph_mode_quantize(self, inputs, data_loader, calibration_num_batches=64):
    """Quantize the model during export with graph mode quantization for the linformer encoder."""
    if (
        isinstance(self.right_encoder, RoBERTaEncoder)
        and self.right_encoder.use_linformer_encoder
        and isinstance(self.left_encoder, RoBERTaEncoder)
        and self.left_encoder.use_linformer_encoder
    ):
        trace = self.trace(inputs)
        qconfig = get_default_qconfig("fbgemm")
        qconfig_dict = {"": qconfig}
        prepare_m = prepare_jit(trace, qconfig_dict, inplace=False)
        prepare_m.eval()
        with torch.no_grad():
            for i, (_, batch) in enumerate(data_loader):
                print("Running calibration with batch {}".format(i))
                input_data = self.onnx_trace_input(batch)
                prepare_m(*input_data)
                if i == calibration_num_batches - 1:
                    break
        trace = convert_jit(prepare_m, inplace=True)
    else:
        super().quantize()
        trace = self.trace(inputs)
    return trace
def test_sparse_qlinear_serdes(self):
    # Note: At the moment, for sparse kernels
    # fbgemm supports only static quantized sparse linear
    # qnnpack supports only dynamically quantized sparse linear
    # Hence we have two different tests.
    # fbgemm tests static flow, qnnpack tests dynamic.
    # Should be unified later on and tests should be fixed
    # appropriately.
    model_class = SparseQuantizedModel
    fqn_to_check = "linear"
    if qengine_is_fbgemm():
        sparse_mapping = tq.get_default_static_sparse_quant_module_mappings()
        ref_mapping = tq.get_default_static_quant_module_mappings()
        qconfig_dict = {nn.Linear: tq.get_default_qconfig("fbgemm")}
    elif qengine_is_qnnpack():
        sparse_mapping = tq.get_default_dynamic_sparse_quant_module_mappings()
        ref_mapping = tq.get_default_dynamic_quant_module_mappings()
        qconfig_dict = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
    else:
        return

    _sparse_layer_test_helper(
        model_class=model_class,
        sparse_mapping=sparse_mapping,
        ref_mapping=ref_mapping,
        qconfig_dict=qconfig_dict,
        fqn_to_check=fqn_to_check,
        test_class=self,
        test_scripting=True,
    )
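# `SparseQuantizedModel` is referenced but not defined in this section. Since the
# tests check the fqn "linear" and construct it with (input_channels, output_channels),
# a minimal sketch consistent with that usage (an assumption, not the actual class) is:
class SparseQuantizedModel(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        # single Linear submodule named "linear", matching fqn_to_check = "linear"
        self.linear = nn.Linear(in_channels, out_channels)

    def forward(self, x):
        return self.linear(x)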
def graph_mode_quantize(
    self,
    inputs,
    data_loader,
    calibration_num_batches=64,
    qconfig_dict=None,
    force_quantize=False,
):
    """Quantize the model during export with graph mode quantization."""
    if force_quantize:
        trace = self.trace(inputs)
        if not qconfig_dict:
            qconfig_dict = {"": get_default_qconfig("fbgemm")}
        prepare_m = prepare_jit(trace, qconfig_dict, inplace=False)
        prepare_m.eval()
        with torch.no_grad():
            for i, (_, batch) in enumerate(data_loader):
                print("Running calibration with batch {}".format(i))
                input_data = self.onnx_trace_input(batch)
                prepare_m(*input_data)
                if i == calibration_num_batches - 1:
                    break
        trace = convert_jit(prepare_m, inplace=True)
    else:
        super().quantize()
        trace = self.trace(inputs)
    return trace
def test_fusion_before_s_prep(self):
    (
        mod,
        sparsifier,
        _,
    ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
    tq.fuse_modules(mod, [["5", "6"]], inplace=True)

    # the config returned by the helper is broken by fusion, but sparse prepare
    # still works if you use the correct post-fusion fqns
    sparse_config = [
        {
            "tensor_fqn": "5.0.weight",
            "sparsity_level": 0.7,
            "sparse_block_shape": (1, 4),
            "zeros_per_block": 4,
        },
        {"tensor_fqn": "0.weight"},
    ]

    sparsifier.prepare(mod, config=sparse_config)
    mod[5].qconfig = tq.get_default_qconfig("fbgemm")
    tq.prepare(mod, inplace=True)

    # check that correct modules had parametrizations added and
    # that none were lost during prepare
    self.assertTrue(hasattr(mod[0], "parametrizations"))
    self.assertTrue(hasattr(mod[5][0], "parametrizations"))

    # check that correct observers were inserted and that matching
    # occurred successfully
    self.assertTrue(hasattr(mod[5], "activation_post_process"))
    sparsifier.step()
    sparsity_level = _calculate_sparsity(mod[5][0].weight)
    mod(torch.randn(1, 4, 4, 4))
    tq.convert(mod, inplace=True)

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
    self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))

    # check that the module was actually sparsified
    cur_sparsity = _calculate_sparsity(mod[5]._weight_bias()[0])
    self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
    self.assertGreaterAlmostEqual(sparsity_level, sparse_config[0]["sparsity_level"])
    self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
def test_q_prep_fx_before_s_prep(self):
    r"""
    This test checks that the ordering prepare_fx -> sparse prepare -> convert_fx
    composes cleanly without issue and that the final result is sparsified without
    having to call squash mask between sparse prepare and convert_fx. This also
    tests the automatic fusion that occurs during prepare_fx.
    """
    (
        mod,
        sparsifier,
        _,
    ) = _get_model_and_sparsifier_and_sparse_config()

    example = torch.randn(1, 4, 4, 4)
    qconfig = tq.get_default_qconfig("fbgemm")
    qconfig_mapping = tq.QConfigMapping() \
        .set_module_name("4", qconfig) \
        .set_module_name("5", qconfig)

    mod = prepare_fx(mod, qconfig_mapping, (example,))

    # the config returned by the helper is broken by the automatic fusion in fx,
    # but sparse prepare still works if you use the correct post-fusion fqns
    sparse_config = [
        {
            "tensor_fqn": "5.0.weight",
            "sparsity_level": 0.7,
            "sparse_block_shape": (1, 4),
            "zeros_per_block": 4,
        },
        {"tensor_fqn": "0.0.weight"},
    ]
    sparsifier.prepare(mod, config=sparse_config)

    # check that correct modules had parametrizations added and
    # that none were lost during prepare
    self.assertTrue(hasattr(fqn_to_module(mod, "0.0"), "parametrizations"))
    self.assertTrue(hasattr(fqn_to_module(mod, "5.0"), "parametrizations"))

    # check that correct observers were inserted and that matching
    # occurred successfully
    self.assertTrue(_module_has_activation_post_process(mod, "5"))
    sparsifier.step()
    sparsity_level = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
    mod(example)
    mod = convert_fx(mod)

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(
        isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.quantized.LinearReLU)
    )
    self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))

    # check that the module was actually sparsified
    cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5")._weight_bias()[0])
    self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
    self.assertGreaterAlmostEqual(sparsity_level, sparse_config[0]["sparsity_level"])
    self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
def test_s_prep_q_prep_fx_ref(self):
    r"""
    This checks that the ordering sparse prepare -> prepare_fx -> convert_to_reference_fx
    composes cleanly without issue and that the final result is sparsified without
    having to call squash mask before convert_to_reference_fx.
    """
    (
        mod,
        sparsifier,
        sparse_config,
    ) = _get_model_and_sparsifier_and_sparse_config()
    sparsifier.prepare(mod, config=sparse_config)

    example = torch.randn(1, 4, 4, 4)
    qconfig = tq.get_default_qconfig("fbgemm")
    qconfig_mapping = tq.QConfigMapping() \
        .set_module_name("4", qconfig) \
        .set_module_name("5", qconfig)
    mod = prepare_fx(mod, qconfig_mapping, (example,))

    # check that correct modules had parametrizations added and
    # that none were lost during prepare
    self.assertTrue(hasattr(fqn_to_module(mod, "0.0"), "parametrizations"))
    self.assertTrue(hasattr(fqn_to_module(mod, "5.0"), "parametrizations"))

    # check that correct observers were inserted and that matching
    # occurred successfully
    self.assertTrue(_module_has_activation_post_process(mod, "5"))
    sparsifier.step()
    sparsity_level = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
    mod(example)
    mod = convert_to_reference_fx(mod)

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.LinearReLU))
    self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
    self.assertTrue(
        isinstance(fqn_to_module(mod, "5.0"), torch.nn.quantized._reference.Linear)
    )

    # check that the module was actually sparsified
    cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
    self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
    self.assertGreaterAlmostEqual(sparsity_level, sparse_config[0]["sparsity_level"])
    self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
def test_q_prep_before_s_prep(self):
    (
        mod,
        sparsifier,
        sparse_config,
    ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
    tq.prepare(mod, inplace=True)
    sparsifier.prepare(mod, config=sparse_config)

    # check that correct modules had parametrizations added
    self.assertTrue(hasattr(mod[0], "parametrizations"))
    self.assertTrue(hasattr(mod[5], "parametrizations"))

    # check that correct observers were inserted
    self.assertTrue(hasattr(mod[5], "activation_post_process"))
    _squash_mask_calibrate_and_convert(
        mod, sparsifier, torch.randn(1, 4, 4, 4)
    )

    # check that the final module is the expected quantized module and that the model runs
    self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear))
    self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
def test_post_training_static_quantization(self, root_dir):
    """Validate post-training static quantization."""
    seed_everything(100)
    model = TestModule()
    num_epochs = 4
    static_quantization = PostTrainingQuantization(
        qconfig_dicts={"": {"": get_default_qconfig()}}
    )
    trainer = Trainer(
        default_root_dir=os.path.join(root_dir, "quantized"),
        enable_checkpointing=False,
        callbacks=[static_quantization],
        max_epochs=num_epochs,
        logger=False,
    )
    # This will both train the model and quantize it.
    trainer.fit(model)

    self.assertIsNotNone(static_quantization.quantized)
    # The default qconfig requires calibration.
    self.assertTrue(static_quantization.should_calibrate)

    test_in = torch.randn(12, 32)
    with mode(model, training=False) as m:
        base_out = m(test_in)
    with mode(static_quantization.quantized, training=False) as q:
        test_out = q(test_in)

    # While quantized/original won't be exact, they should be close.
    self.assertLess(
        ((((test_out - base_out) ** 2).sum(axis=1)) ** (1 / 2)).mean(),
        0.015,
        "RMSE should be less than 0.015 between quantized and original.",
    )
def test_sparse_qlinear_serdes(self):
    batch_size = 12
    input_channels = 4
    output_channels = 7
    model = self.SparseQuantizedModel(input_channels, output_channels)

    # For sparse kernels both the activation and weight ZP = 0
    X_scale = 0.2
    X_zp = 0
    W_scale = 1e-2
    W_zp = 0

    with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
        float_bias = torch.randn(output_channels, dtype=torch.float32)

        X_q = torch.quantize_per_tensor(
            X_fp32, scale=X_scale, zero_point=X_zp, dtype=torch.quint8
        )
        X_fp32 = X_q.dequantize()

        W_fp32 = torch.randn(output_channels, input_channels, dtype=torch.float32)
        mask = torch.randint(0, 2, W_fp32.shape)
        W_fp32 *= mask
        W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

        model.linear.weight = nn.Parameter(W_q.dequantize())
        model.linear.sparse_params = {'sparse_block_shape': (1, 4)}
        model.eval()

        # Note: At the moment, for sparse kernels
        # fbgemm supports only static quantized sparse linear
        # qnnpack supports only dynamically quantized sparse linear
        # Hence we have two different tests.
        # fbgemm tests static flow, qnnpack tests dynamic.
        # Should be unified later on and tests should be fixed
        # appropriately.
        if qengine_is_fbgemm():
            model.qconfig = tq.get_default_qconfig('fbgemm')
            qmodel = copy.deepcopy(model)
            sqmodel = copy.deepcopy(model)

            tq.prepare(qmodel, inplace=True)
            tq.prepare(sqmodel, inplace=True)

            with torch.no_grad():
                qmodel(X_fp32)
                sqmodel(X_fp32)

            # Make sure the quantization parameters are computed the same way
            qparams = qmodel.linear.qconfig.weight().calculate_qparams()
            sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
            self.assertEqual(qparams, sqparams)

            # Make sure mapping of sparse kernels does not affect the non-sparse
            sparse_mapping = tq.get_default_static_quant_module_mappings()
            sparse_mapping[nn.Linear] = ao_nn_sq.Linear
            tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
            tq.convert(qmodel, inplace=True)

            assert isinstance(sqmodel.linear, ao_nn_sq.Linear), "Convert failed"
            assert isinstance(qmodel.linear, nn.quantized.Linear), "Mapping failed"

            scripted_sqmodel = torch.jit.script(sqmodel)
            scripted_sqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sqmodel, buffer)
            buffer.seek(0)
            sqmodel = torch.jit.load(buffer)

            # Make sure numerics are right
            Y_ref = qmodel(X_q)
            Y_hat = sqmodel(X_q)
            self.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())

        elif qengine_is_qnnpack():
            qconfig = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
            dqmodel = copy.deepcopy(model)
            sdqmodel = copy.deepcopy(model)

            tq.propagate_qconfig_(dqmodel, qconfig)
            tq.propagate_qconfig_(sdqmodel, qconfig)

            # Make sure the quantization parameters are computed the same way
            qparams = dqmodel.linear.qconfig.weight().calculate_qparams()
            sqparams = sdqmodel.linear.qconfig.weight().calculate_qparams()
            self.assertEqual(qparams, sqparams)

            # Make sure mapping of sparse kernels does not affect the non-sparse
            sparse_mapping = copy.deepcopy(
                tq.get_default_dynamic_quant_module_mappings()
            )
            sparse_mapping[nn.Linear] = ao_nn_sq.dynamic.Linear
            with LinearBlockSparsePattern(1, 4):
                tq.convert(sdqmodel, inplace=True, mapping=sparse_mapping)
            tq.convert(
                dqmodel,
                mapping=tq.get_default_dynamic_quant_module_mappings(),
                inplace=True,
            )

            assert isinstance(sdqmodel.linear, ao_nn_sq.dynamic.Linear), "Convert failed"
            assert isinstance(dqmodel.linear, nn.quantized.dynamic.Linear), "Mapping failed"

            scripted_sdqmodel = torch.jit.script(sdqmodel)
            scripted_sdqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sdqmodel, buffer)
            buffer.seek(0)
            sdqmodel = torch.jit.load(buffer)

            # Make sure numerics are right
            Y_ref = dqmodel(X_fp32)
            Y_hat = sdqmodel(X_fp32)
            self.assertEqual(Y_ref, Y_hat)