def test_sparse_qlinear_serdes(self):
    """Run the sparse-layer helper with scripting/save/load enabled.

    Note: At the moment, for sparse kernels
    fbgemm supports only static quantized sparse linear and
    qnnpack supports only dynamically quantized sparse linear.
    Hence the two engines take different paths here: fbgemm exercises
    the static flow, qnnpack the dynamic one. This should be unified
    later on and the tests fixed appropriately. Under any other engine
    the test returns without checking anything.
    """
    if qengine_is_fbgemm():
        sparse_mapping, ref_mapping, linear_qconfig = (
            tq.get_default_static_sparse_quant_module_mappings(),
            tq.get_default_static_quant_module_mappings(),
            tq.get_default_qconfig("fbgemm"),
        )
    elif qengine_is_qnnpack():
        sparse_mapping, ref_mapping, linear_qconfig = (
            tq.get_default_dynamic_sparse_quant_module_mappings(),
            tq.get_default_dynamic_quant_module_mappings(),
            tq.qconfig.default_dynamic_qconfig,
        )
    else:
        # Neither engine with a sparse kernel is active: nothing to test.
        return

    _sparse_layer_test_helper(
        model_class=SparseQuantizedModel,
        sparse_mapping=sparse_mapping,
        ref_mapping=ref_mapping,
        qconfig_dict={nn.Linear: linear_qconfig},
        fqn_to_check="linear",
        test_class=self,
        test_scripting=True,
    )
def test_qlinear_packed_params_qnnpack(self):
    """Re-run the packed-params test under the qnnpack engine.

    The RNG is seeded first, then the quantized engine is switched to
    qnnpack and the packed-params test is invoked with non-zero zero
    points allowed.
    """
    torch.manual_seed(0)
    # The allocator override is evaluated only after the engine override
    # has been entered, exactly as with two nested ``with`` statements.
    with override_quantized_engine('qnnpack'), \
            override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        self.test_qlinear_packed_params(allow_non_zero_zero_points=True)
def test_qlinear_packed_params(self, allow_non_zero_zero_points=False):
    """Check packing, unpacking and reloading of sparse quantized ``Linear``.

    Three quantized weights are packed in turn (a small per-tensor one, a
    small per-channel one and a wide per-tensor one). For each, the test
    inspects the raw ``__getstate__`` contents, the unpacked weight/bias,
    and a ``torch.save`` / ``torch.load`` round trip.

    Args:
        allow_non_zero_zero_points: when true, row ``i`` is quantized with
            zero point ``i + 1`` instead of 0 and the expected serialized
            weights are shifted to match.
    """
    # copied from https://pytorch.org/docs/stable/sparse.html#csr-tensor-operations,
    # so row/col block indices match that example, but with blocks and
    # scaled rows
    weight_fp32 = torch.Tensor([
        [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0],
        [6, 6, 6, 6, 12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ])

    row_block_size, col_block_size = 1, 4
    dtype = torch.qint8
    scales = [2.0, 6.0, 12.0]

    num_rows = weight_fp32.shape[0]
    if allow_non_zero_zero_points:
        zero_points = [row + 1 for row in range(num_rows)]
        tensor_shift = 1
        # One shift per stored block of the per-channel case.
        channel_block_shifts = [1, 2, 2]
    else:
        zero_points = [0] * num_rows
        tensor_shift = 0
        channel_block_shifts = [0, 0, 0]

    # 4000 is tile width for Fbgemm
    wide_weight_fp32 = torch.zeros((3, 4008))
    wide_weight_fp32[0, 0] = 4
    wide_weight_fp32[0, 4004] = 6
    wide_weight_fp32[1, 0] = 8

    # Each case is: (quantized weight, is per-tensor quantized,
    # expected row block indices, expected col block indices,
    # expected weight values before the +128 serialization offset).
    per_tensor_small = (
        torch.quantize_per_tensor(
            weight_fp32, scales[0], zero_points[0], dtype),
        True,
        [0, 1, 3, 3],
        [2, 0, 1],
        [
            value + tensor_shift
            for value in [1, 1, 1, 1, 3, 3, 3, 3, 6, 6, 6, 6]
        ],
    )

    per_channel_small = (
        torch.quantize_per_channel(
            weight_fp32,
            torch.Tensor(scales),
            torch.Tensor(zero_points).to(torch.int),
            0,  # axis = 0
            dtype,
        ),
        False,
        [0, 1, 3, 3],
        [2, 0, 1],
        [
            value + channel_block_shifts[position // 4]
            for position, value in enumerate(
                [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2])
        ],
    )

    per_tensor_large = (
        torch.quantize_per_tensor(
            wide_weight_fp32, scales[0], zero_points[0], dtype),
        True,
        [0, 2, 3, 3],
        [0, 1001, 0],
        [
            value + tensor_shift
            for value in [2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0]
        ],
    )

    for case in (per_tensor_small, per_channel_small, per_tensor_large):
        (weight, is_per_tensor_quantized, expected_row_block_indices,
         expected_col_block_indices, expected_weights) = case
        n_out, n_in = weight.shape[0], weight.shape[1]

        lin = Linear(
            out_features=n_out,
            in_features=n_in,
            row_block_size=row_block_size,
            col_block_size=col_block_size,
            bias=True,
            dtype=dtype,
        )
        bias = torch.ones(size=(n_out, ))
        lin.set_weight_bias(weight, bias, row_block_size, col_block_size)

        serialized = lin._packed_params._packed_params.__getstate__()
        (
            _,  # version
            packed_bias,
            packed_out_block_size,
            packed_in_block_size,
            packed_scales,
            packed_zero_points,
            packed_scheme,
            packed_row_block_indices,
            packed_col_block_indices,
            packed_weights,
            packed_output_channels,
            packed_input_channels,
        ) = serialized[0]

        if is_per_tensor_quantized:
            expected_scales = [scales[0]]
            expected_zero_points = [zero_points[0]]
        else:
            expected_scales = scales
            expected_zero_points = zero_points

        # Test Serialization
        self.assertEqual(packed_bias, bias)
        self.assertEqual(packed_out_block_size, row_block_size)
        self.assertEqual(packed_in_block_size, col_block_size)
        self.assertEqual(packed_scales, expected_scales)
        self.assertEqual(packed_zero_points, expected_zero_points)
        self.assertEqual(packed_scheme, is_per_tensor_quantized)
        self.assertEqual(
            packed_row_block_indices, expected_row_block_indices)
        self.assertEqual(
            packed_col_block_indices, expected_col_block_indices)
        # weights are serialized as +128
        self.assertEqual(
            packed_weights.tolist(),
            [value + 128 for value in expected_weights])
        self.assertEqual(packed_output_channels, n_out)
        self.assertEqual(packed_input_channels, n_in)

        # Test Unpacking
        (unpacked_weight, unpacked_bias, unpacked_out_block_size,
         unpacked_in_block_size) = lin._weight_bias()
        self.assertEqual(
            torch.dequantize(unpacked_weight), torch.dequantize(weight))
        self.assertEqual(unpacked_bias, bias)
        self.assertEqual(unpacked_out_block_size, row_block_size)
        self.assertEqual(unpacked_in_block_size, col_block_size)

        # Test Deserialization
        with tempfile.TemporaryFile() as file_buff:
            torch.save(lin, file_buff)
            file_buff.seek(0)
            lin2 = torch.load(file_buff)
            self.assertEqual(lin._weight_bias(), lin2._weight_bias())
            # Serialize -> Deserialize -> Serialize should match Serialize
            self.assertEqual(
                serialized,
                lin2._packed_params._packed_params.__getstate__())

            # Test that op output is preserved by serialize -> deserialize
            if qengine_is_qnnpack():
                sample = torch.rand(size=(1, n_in))
                out_before = lin(sample)
                out_after = lin2(sample)
                self.assertEqual(out_before, out_after)
def test_sparse_qlinear(self):
    """Compare sparse quantized linear ops with their dense counterparts.

    For every (channelwise, dynamic) combination supported by the active
    engine, the same quantized weight is prepacked with the dense and the
    sparse prepack ops, and the outputs of the matching linear ops are
    required to agree to 4 decimals.
    """
    batch_size, input_channels, output_channels = 12, 16, 4
    row_block_size, col_block_size = 1, 4
    decimals = 4

    # X86 implementation of sparse ops in qnnpack only support
    # block pattern 1x4.
    # arm kernels have support for both 1x4 and 8x1.
    # This distinction is only because x86 implementations exist
    # only to enable testing of integration path.
    # We do plan to add 8x1 as well so that testing does not have to
    # special case like this. At the moment it is deprioritized due
    # to other higher priority works.
    if qengine_is_qnnpack() and (row_block_size != 1
                                 or col_block_size != 4):
        return
    # ONEDNN does not support this yet
    if qengine_is_onednn():
        return

    quantized_ops = torch.ops.quantized
    sparse_ops = torch.ops.sparse
    dense_prepack = quantized_ops.linear_prepack
    dense_qlinear = quantized_ops.linear
    dense_qlinear_dynamic = quantized_ops.linear_dynamic
    sparse_prepack = sparse_ops.qlinear_prepack
    sparse_qlinear = sparse_ops.qlinear
    sparse_qlinear_dynamic = sparse_ops.qlinear_dynamic

    X_scale, X_zp = 0.2, 2
    # Output quantization parameters for the static path.
    Y_scale, Y_zp = 1.1234, 5

    # The order of these draws is kept fixed so the random inputs are
    # the same for a given seed.
    X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
    float_bias = torch.randn(output_channels, dtype=torch.float32)
    W_scales = torch.rand(output_channels, dtype=torch.float32)
    W_zps = torch.zeros(output_channels, dtype=torch.int32)
    W_fp32 = torch.randn(output_channels, input_channels,
                         dtype=torch.float32)

    with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        X_q = torch.quantize_per_tensor(
            X_fp32, scale=X_scale, zero_point=X_zp, dtype=torch.quint8)

        for use_channelwise, dynamic_mode in product((True, False),
                                                     repeat=2):
            if dynamic_mode and qengine_is_fbgemm():
                logging.info(
                    "dynamic sparse qlinear is only available in qnnpack")
                continue
            if not dynamic_mode and qengine_is_qnnpack():
                logging.info(
                    "static sparse qlinear is only available in fbgemm")
                continue

            if use_channelwise:
                W_q = torch.quantize_per_channel(
                    W_fp32,
                    scales=W_scales,
                    zero_points=W_zps,
                    axis=0,
                    dtype=torch.qint8,
                )
            else:
                W_q = torch.quantize_per_tensor(
                    W_fp32,
                    scale=W_scales[0],
                    zero_point=W_zps[0],
                    dtype=torch.qint8,
                )

            packed_dense = dense_prepack(W_q, float_bias)
            packed_sparse = sparse_prepack(
                W_q, float_bias, row_block_size, col_block_size)

            if dynamic_mode:
                # Dynamic ops take the float input directly.
                actual = sparse_qlinear_dynamic(
                    X_fp32, packed_sparse).numpy()
                expected = dense_qlinear_dynamic(
                    X_fp32, packed_dense).numpy()
            else:
                # Static ops take the quantized input; compare the raw
                # integer representation of the outputs.
                actual = sparse_qlinear(
                    X_q, packed_sparse, Y_scale, Y_zp).int_repr().numpy()
                expected = dense_qlinear(
                    X_q, packed_dense, Y_scale, Y_zp).int_repr().numpy()

            np.testing.assert_array_almost_equal(
                expected, actual, decimal=decimals)
def _sparse_layer_test_helper(
    model_class,
    sparse_mapping,
    ref_mapping,
    qconfig_dict,
    fqn_to_check,
    test_class,
    test_scripting,
):
    """Convert one model with a sparse and a dense mapping and compare them.

    Two deep copies of the same float model are prepared, calibrated and
    converted: one with ``sparse_mapping`` and one with ``ref_mapping``.
    The helper then checks that the named submodule was converted to the
    class each mapping prescribes, that the sparse module reports a 1x4
    block shape, and that both models produce equal outputs.

    Args:
        model_class: callable invoked as
            ``model_class(input_channels, output_channels)``; the result
            must expose a ``linear`` submodule.
        sparse_mapping: module mapping passed to ``tq.convert`` for the
            sparse copy.
        ref_mapping: module mapping passed to ``tq.convert`` for the
            reference copy.
        qconfig_dict: qconfig dictionary propagated to both copies.
        fqn_to_check: fully qualified name of the submodule whose
            converted class is verified.
        test_class: test case object providing ``assertEqual`` and
            ``assertIsInstance``.
        test_scripting: when true, the sparse model is scripted, saved to
            an in-memory buffer and reloaded before outputs are compared.
    """
    # SET UP TEST PARAMETERS, INPUTS AND WEIGHTS
    # ------------------------------------------
    batch_size = 12
    input_channels = 4
    output_channels = 7
    model = model_class(input_channels, output_channels)

    # The weight is quantized with zero point 0; the activation is
    # quantized with zero point 2.
    X_scale = 0.2
    X_zp = 2
    W_scale = 1e-2
    W_zp = 0

    X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
    # Not used below; kept so the random draws that follow are unchanged.
    float_bias = torch.randn(output_channels, dtype=torch.float32)

    # generate a weight which we'll insert into the model
    W_fp32 = torch.randn(output_channels, input_channels,
                         dtype=torch.float32)
    mask = torch.randint(0, 2, W_fp32.shape)
    W_fp32 *= mask

    with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        X_q = torch.quantize_per_tensor(
            X_fp32, scale=X_scale, zero_point=X_zp, dtype=torch.quint8)
        X_fp32 = X_q.dequantize()

        W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

        # PREPARE MODELS FOR QUANTIZATION
        # -------------------------------
        model.linear.weight = nn.Parameter(W_q.dequantize())
        model.eval()

        # Add `sparse_params` to the model. The test for correct
        # sparse_param addition is in the sparsifier tests
        model.linear.sparse_params = {"sparse_block_shape": (1, 4)}

        # generate model versions
        qmodel = copy.deepcopy(model)
        sqmodel = copy.deepcopy(model)

        # apply qconfigs to both versions
        tq.propagate_qconfig_(qmodel, qconfig_dict)
        tq.propagate_qconfig_(sqmodel, qconfig_dict)

        tq.prepare(qmodel, inplace=True)
        tq.prepare(sqmodel, inplace=True)

        # calibrate
        with torch.no_grad():
            qmodel(X_fp32)
            sqmodel(X_fp32)

        # ACTUAL TESTING BEGINS HERE
        # --------------------------

        # Make sure the quantization parameters are computed the same way
        qparams = qmodel.linear.qconfig.weight().calculate_qparams()
        sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
        test_class.assertEqual(qparams, sqparams)

        sqmodule_to_check = fqn_to_module(sqmodel, fqn_to_check)
        sqmodule_start_class = sqmodule_to_check.__class__
        sqmodule_expected_converted_class = sparse_mapping[
            sqmodule_start_class]

        qmodule_to_check = fqn_to_module(qmodel, fqn_to_check)
        qmodule_start_class = qmodule_to_check.__class__
        qmodule_expected_converted_class = ref_mapping[qmodule_start_class]

        # need to determine whether dynamic quantization is being performed
        # since input dtype will be different at the end
        is_dynamic = isinstance(qmodule_to_check.activation_post_process,
                                tq.PlaceholderObserver)

        tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
        tq.convert(qmodel, inplace=True, mapping=ref_mapping)

        # this code is a duplicate of above since the references do not
        # update to the post-convert modules
        sqmodule_to_check = fqn_to_module(sqmodel, fqn_to_check)
        qmodule_to_check = fqn_to_module(qmodel, fqn_to_check)

        # check that the modules were converted as expected; test-case
        # assertions are used instead of bare ``assert`` so the checks
        # still run under ``python -O``
        test_class.assertIsInstance(
            sqmodule_to_check, sqmodule_expected_converted_class,
            "Convert failed")
        test_class.assertIsInstance(
            qmodule_to_check, qmodule_expected_converted_class,
            "Mapping failed")

        row_block_size, col_block_size = (
            sqmodel.linear._packed_params._weight_bias()[2:])
        test_class.assertEqual(row_block_size, 1)
        test_class.assertEqual(col_block_size, 4)

        # only run during serialization/deserialization tests
        # makes sure script/save/load doesn't malform the sqmodel
        if test_scripting:
            scripted_sqmodel = torch.jit.script(sqmodel)
            scripted_sqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sqmodel, buffer)
            buffer.seek(0)
            sqmodel = torch.jit.load(buffer)

        # use correct input dtype
        if is_dynamic:
            Y_ref = qmodel(X_fp32)
            Y_hat = sqmodel(X_fp32)
            test_class.assertEqual(Y_ref, Y_hat)
        else:
            Y_ref = qmodel(X_q)
            Y_hat = sqmodel(X_q)
            test_class.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())
def test_sparse_qlinear_serdes(self):
    """Check that a sparse quantized linear survives script/save/load.

    A float model with a randomly masked weight is converted twice: once
    with the default mapping (reference) and once with a mapping whose
    ``nn.Linear`` entry points at the sparse quantized module. The sparse
    model is scripted, saved to an in-memory buffer and reloaded, and its
    output is compared with the reference output.
    """
    batch_size = 12
    input_channels = 4
    output_channels = 7
    model = self.SparseQuantizedModel(input_channels, output_channels)

    # For sparse kernels both the activation and weight ZP = 0
    X_scale = 0.2
    X_zp = 0
    W_scale = 1e-2
    W_zp = 0

    with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        X_fp32 = torch.randn(batch_size, input_channels,
                             dtype=torch.float32)
        # Not used below; kept so the random draws that follow are
        # unchanged.
        float_bias = torch.randn(output_channels, dtype=torch.float32)

        X_q = torch.quantize_per_tensor(
            X_fp32, scale=X_scale, zero_point=X_zp, dtype=torch.quint8)
        X_fp32 = X_q.dequantize()

        W_fp32 = torch.randn(output_channels, input_channels,
                             dtype=torch.float32)
        mask = torch.randint(0, 2, W_fp32.shape)
        W_fp32 *= mask
        W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

        model.linear.weight = nn.Parameter(W_q.dequantize())
        model.linear.sparse_params = {'sparse_block_shape': (1, 4)}
        model.eval()

        # Note: At the moment, for sparse kernels
        # fbgemm supports only static quantized sparse linear
        # qnnpack supports only dynamically quantized sparse linear
        # Hence we have two different tests.
        # fbgemm tests static flow, qnnpack tests dynamic.
        # Should be unified later on and tests should be fixed
        # appropriately.
        if qengine_is_fbgemm():
            model.qconfig = tq.get_default_qconfig('fbgemm')
            qmodel = copy.deepcopy(model)
            sqmodel = copy.deepcopy(model)

            tq.prepare(qmodel, inplace=True)
            tq.prepare(sqmodel, inplace=True)

            with torch.no_grad():
                qmodel(X_fp32)
                sqmodel(X_fp32)

            # Make sure the quantization parameters are computed the same
            # way
            qparams = qmodel.linear.qconfig.weight().calculate_qparams()
            sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
            self.assertEqual(qparams, sqparams)

            # Make sure mapping of sparse kernels does not affect the
            # non-sparse: mutate a private deep copy, as the dynamic
            # branch below does.
            sparse_mapping = copy.deepcopy(
                tq.get_default_static_quant_module_mappings())
            sparse_mapping[nn.Linear] = ao_nn_sq.Linear
            tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
            tq.convert(qmodel, inplace=True)

            # Test-case assertions are used instead of bare ``assert`` so
            # the checks still run under ``python -O``.
            self.assertIsInstance(
                sqmodel.linear, ao_nn_sq.Linear, "Convert failed")
            self.assertIsInstance(
                qmodel.linear, nn.quantized.Linear, "Mapping failed")

            scripted_sqmodel = torch.jit.script(sqmodel)
            scripted_sqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sqmodel, buffer)
            buffer.seek(0)
            sqmodel = torch.jit.load(buffer)

            # Make sure numerics are right
            Y_ref = qmodel(X_q)
            Y_hat = sqmodel(X_q)
            self.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())

        elif qengine_is_qnnpack():
            qconfig = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
            dqmodel = copy.deepcopy(model)
            sdqmodel = copy.deepcopy(model)

            tq.propagate_qconfig_(dqmodel, qconfig)
            tq.propagate_qconfig_(sdqmodel, qconfig)

            # Make sure the quantization parameters are computed the same
            # way
            qparams = dqmodel.linear.qconfig.weight().calculate_qparams()
            sqparams = sdqmodel.linear.qconfig.weight().calculate_qparams()
            self.assertEqual(qparams, sqparams)

            # Make sure mapping of sparse kernels does not affect the
            # non-sparse
            sparse_mapping = copy.deepcopy(
                tq.get_default_dynamic_quant_module_mappings())
            sparse_mapping[nn.Linear] = ao_nn_sq.dynamic.Linear
            with LinearBlockSparsePattern(1, 4):
                tq.convert(sdqmodel, inplace=True, mapping=sparse_mapping)
            tq.convert(
                dqmodel,
                mapping=tq.get_default_dynamic_quant_module_mappings(),
                inplace=True)

            self.assertIsInstance(
                sdqmodel.linear, ao_nn_sq.dynamic.Linear, "Convert failed")
            self.assertIsInstance(
                dqmodel.linear, nn.quantized.dynamic.Linear,
                "Mapping failed")

            scripted_sdqmodel = torch.jit.script(sdqmodel)
            scripted_sdqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sdqmodel, buffer)
            buffer.seek(0)
            sdqmodel = torch.jit.load(buffer)

            # Make sure numerics are right
            Y_ref = dqmodel(X_fp32)
            Y_hat = sdqmodel(X_fp32)
            self.assertEqual(Y_ref, Y_hat)