def dynamic_quant_test(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options=None,
):
    """Run dynamic quantization on a float model and validate the output.

    The quantized model is written to
    ``gemm_fp32.quant_dynamic_<act><weight>.onnx`` where each tag is ``u8``
    or ``s8``. ``activation_type`` only selects the expected tensor type and
    the file-name tag here; it is not forwarded to ``quantize_dynamic``.

    Args:
        model_fp32_path: Path of the float model to quantize.
        data_reader: Reader that is rewound before the correctness check.
        activation_type: ``QuantType`` expected for activations.
        weight_type: ``QuantType`` passed to ``quantize_dynamic``.
        extra_options: Optional dict forwarded to ``quantize_dynamic``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = "u8" if activation_is_unsigned else "s8"
    weight_type_str = "u8" if weight_type == QuantType.QUInt8 else "s8"
    model_int8_path = "gemm_fp32.quant_dynamic_{}{}.onnx".format(activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Exactly two MatMulInteger nodes are expected in the quantized graph.
    check_op_type_count(self, model_int8_path, MatMulInteger=2)

    # Presumably: input #2 of every MatMulInteger must carry the activation
    # tensor type (semantics defined by check_qtype_by_node_type).
    check_qtype_by_node_type(
        self,
        model_int8_path,
        {"MatMulInteger": [["i", 2, activation_proto_qtype]]},
    )

    data_reader.rewind()
    # The comparison runs on a fresh random input, not on the reader's data.
    random_feed = {"input": np.random.rand(5, 10).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def test_quantize_batch_size_1(self):
    """Dynamically quantize the model built for batch size 1 and verify it."""
    batch, hidden_size, sequence_length = 1, 4, 4
    fp32_model = "test_embed_layer_norm_unit_test_batch1.onnx"
    uint8_model = "test_embed_layer_norm_unit_test_batch1_uint8.onnx"

    self.construct_model(batch, hidden_size, sequence_length, fp32_model)

    # Both integer inputs share the same [batch, sequence_length] shape.
    input_shapes = {
        input_name: [batch, sequence_length]
        for input_name in ("input_ids", "segment_ids")
    }
    data_reader = self.input_feeds_int32(1, input_shapes)

    quantize_dynamic(fp32_model, uint8_model)

    # Quantization should not have any DequantizeLinear nodes:
    check_op_type_count(
        self,
        uint8_model,
        DequantizeLinear=0,
        QEmbedLayerNormalization=1,
    )

    data_reader.rewind()
    first_feed = data_reader.get_next()
    check_model_correctness(self, fp32_model, uint8_model, first_feed)
def test_quantize_resize(self):
    """Statically quantize the conv+resize model in QOperator and QDQ mode."""
    np.random.seed(1)

    fp32_model = 'resize_fp32.onnx'
    qop_model = 'resize_uint8.onnx'
    qdq_model = 'resize_uint8_qdq.onnx'

    resize_attrs = {
        'coordinate_transformation_mode': 'asymmetric',
        'mode': 'nearest',
        'nearest_mode': 'floor',
    }
    self.construct_model_conv_resize(
        fp32_model,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        [1, 3, 48, 80],
        resize_attrs,
        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        [1.0, 1.0, 2.0, 2.0],
        None,
    )

    # ---- QOperator mode ----
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_model, qop_model, data_reader)

    # The node named "resize_node" must not read 'conv_output' directly;
    # a different input name indicates it consumes the quantized tensor.
    def resize_reads_quantized_input(node):
        return node.name != "resize_node" or node.input[0] != 'conv_output'

    check_op_nodes(self, qop_model, resize_reads_quantized_input)
    check_op_type_count(
        self,
        qop_model,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        Resize=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_model,
        Conv=1,
        QuantizeLinear=2,
        DequantizeLinear=3,
        Resize=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())
def test_quantize_avgpool(self):
    """Statically quantize the conv+avgpool model in QOperator and QDQ mode."""
    np.random.seed(1)

    fp32_model = 'avgpool_fp32.onnx'
    qop_model = 'avgpool_uint8.onnx'
    qdq_model = 'avgpool_uint8_qdq.onnx'

    self.construct_model_conv_avgpool(
        fp32_model,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        {'kernel_shape': [3, 3]},
        [1, 3, 22, 38],
    )

    # ---- QOperator mode ----
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_model, qop_model, data_reader)
    check_op_type_count(
        self,
        qop_model,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        QLinearAveragePool=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_model,
        Conv=1,
        QuantizeLinear=3,
        DequantizeLinear=4,
        AveragePool=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())
def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static'):
    """Quantize the conv+pad model and check node counts and accuracy.

    Args:
        pad_mode: Pad mode handed to ``construct_model_conv_pad``; ``None``
            is tagged as ``'none'`` in the file names.
        constant_value: Optional value forwarded to the model builder; when
            given, ``'_value'`` is appended to the file-name tag.
        quantize_mode: ``'static'`` passes the data reader to
            ``self.quantize_model``; any other value passes ``None``.
    """
    np.random.seed(108)

    is_static = quantize_mode == 'static'
    pad_tag = 'none' if pad_mode is None else pad_mode
    value_tag = '_value' if constant_value is not None else ''
    model_fp32_path = 'qop_pad_{}_fp32_{}{}.onnx'.format(quantize_mode, pad_tag, value_tag)
    model_i8_path = 'qop_pad_{}_i8_{}{}.onnx'.format(quantize_mode, pad_tag, value_tag)

    # The reader is created before the model so the seeded random stream is
    # consumed in the same order as before.
    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_conv_pad(
        model_fp32_path,
        [1, 8, 33, 33],
        [16, 8, 3, 3],
        [1, 16, 31, 31],
        pad_mode,
        [0, 0, 1, 2, 0, 0, 3, 4],
        constant_value=constant_value,
    )

    calibration_reader = data_reader if is_static else None
    self.quantize_model(model_fp32_path, model_i8_path, calibration_reader)
    data_reader.rewind()

    # DequantizeLinear=2 means there is one DequantizeLinear node after both
    # conv and pad, which means the pad node runs with quantized semantics.
    # In dynamic quantize mode, the pad operator is in fact not quantized
    # because its input is fp32.
    if is_static:
        expected_counts = {'DequantizeLinear': 2, 'QuantizeLinear': 1}
    else:
        expected_counts = {'DynamicQuantizeLinear': 1}
    check_op_type_count(self, model_i8_path, **expected_counts)

    check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next())
def dynamic_quant_conv(self, model_fp32_path, model_int8_path):
    """Dynamically quantize a model and expect two ConvInteger nodes.

    Args:
        model_fp32_path: Path of the float model to quantize.
        model_int8_path: Path where the quantized model is written.
    """
    quantize_dynamic(model_fp32_path, model_int8_path)
    check_op_type_count(self, model_int8_path, ConvInteger=2)

    # Accuracy is compared on one random float32 input of shape (4, 2, 8, 8).
    random_feed = {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def dynamic_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options=None):
    """Run dynamic quantization with explicit activation/weight types.

    The quantized model is written to
    ``gemm_fp32.quant_dynamic_<act><weight>.onnx`` where each tag is ``u8``
    or ``s8``.

    Args:
        model_fp32_path: Path of the float model to quantize.
        data_reader: Reader that is rewound before the correctness check.
        activation_type: ``QuantType`` forwarded to ``quantize_dynamic``.
        weight_type: ``QuantType`` forwarded to ``quantize_dynamic``.
        extra_options: Optional dict forwarded to ``quantize_dynamic``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_int8_path = 'gemm_fp32.quant_dynamic_{}{}.onnx'.format(activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Exactly two MatMulInteger nodes are expected in the quantized graph.
    check_op_type_count(self, model_int8_path, MatMulInteger=2)

    # Presumably: input #2 of every MatMulInteger must carry the activation
    # tensor type (semantics defined by check_qtype_by_node_type).
    check_qtype_by_node_type(
        self,
        model_int8_path,
        {'MatMulInteger': [['i', 2, activation_proto_qtype]]},
    )

    data_reader.rewind()
    # The comparison runs on a fresh random input, not on the reader's data.
    random_feed = {'input': np.random.rand(5, 10).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def verify(self, per_channel):
    """Quantize the conv+clip model in QDQ and QOperator format and verify.

    Args:
        per_channel: Forwarded to ``quantize_static`` as both ``per_channel``
            and ``reduce_range``; also embedded in the model file names.
    """
    np.random.seed(1)

    fp32_model = 'conv_clip_fp32.{}.onnx'.format(per_channel)
    qdq_model = 'conv_clip_quant_qdq.{}.onnx'.format(per_channel)
    qop_model = 'conv_clip_quant_qop.{}.onnx'.format(per_channel)

    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_conv_clip(fp32_model, [1, 8, 33, 33], [16, 8, 3, 3], [15376])

    # reduce_range deliberately follows per_channel, as in the original test.
    shared_options = dict(per_channel=per_channel, reduce_range=per_channel)

    # ---- QDQ format ----
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ, **shared_options)
    data_reader.rewind()
    # topo sort check
    expected_order = [
        'DequantizeLinear',
        'QuantizeLinear',
        'DequantizeLinear',
        'Conv',
        'QuantizeLinear',
        'DequantizeLinear',
        'Reshape',
        'QuantizeLinear',
        'DequantizeLinear',
    ]
    check_op_type_order(self, qdq_model, expected_order)
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())

    # ---- QOperator format ----
    data_reader.rewind()
    quantize_static(fp32_model, qop_model, data_reader, quant_format=QuantFormat.QOperator, **shared_options)
    data_reader.rewind()
    check_op_type_count(self, qop_model, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())
def verify_quantize_conv(self, has_bias, per_channel):
    """Quantize the conv model in QDQ and QOperator format and verify.

    Args:
        has_bias: Forwarded to ``construct_model_conv``; with a bias one
            additional DequantizeLinear node is expected in QDQ format.
        per_channel: Forwarded to ``quantize_static`` as both ``per_channel``
            and ``reduce_range``.
    """
    np.random.seed(1)

    name_args = (has_bias, per_channel)
    fp32_model = 'conv_fp32.{}.{}.onnx'.format(*name_args)
    qdq_model = 'conv_quant_qdq.{}.{}.onnx'.format(*name_args)
    qop_model = 'conv_quant_qop.{}.{}.onnx'.format(*name_args)

    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_conv(fp32_model, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31], has_bias)

    # reduce_range deliberately follows per_channel, as in the original test.
    shared_options = dict(per_channel=per_channel, reduce_range=per_channel)

    # ---- QDQ format ----
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ, **shared_options)
    data_reader.rewind()
    dequantize_count = 4 if has_bias else 3
    check_op_type_count(self, qdq_model, Conv=1, QuantizeLinear=2, DequantizeLinear=dequantize_count)
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())

    # ---- QOperator format ----
    data_reader.rewind()
    quantize_static(fp32_model, qop_model, data_reader, quant_format=QuantFormat.QOperator, **shared_options)
    data_reader.rewind()
    check_op_type_count(self, qop_model, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())
def dynamic_quant_conv_test(self, weight_type, extra_options=None):
    """Dynamically quantize the conv-bias model and validate the result.

    Activations are always expected as UINT8 here; only the weight type
    varies. The quantized model is written to
    ``conv_bias.quant.u8<weight>.onnx``.

    Args:
        weight_type: ``QuantType`` forwarded to ``quantize_dynamic``.
        extra_options: Optional dict forwarded to ``quantize_dynamic``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = "conv_bias.fp32.onnx"
    self.construct_model(model_fp32_path)

    activation_proto_qtype = TensorProto.UINT8
    activation_type_str = "u8"
    weight_type_str = "u8" if weight_type == QuantType.QUInt8 else "s8"
    model_int8_path = "conv_bias.quant.{}{}.onnx".format(activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Exactly two ConvInteger nodes are expected in the quantized graph.
    check_op_type_count(self, model_int8_path, ConvInteger=2)

    # Presumably: input #2 of every ConvInteger must be UINT8 (semantics
    # defined by check_qtype_by_node_type).
    check_qtype_by_node_type(
        self,
        model_int8_path,
        {"ConvInteger": [["i", 2, activation_proto_qtype]]},
    )

    random_feed = {"input": np.random.rand(4, 2, 8, 8).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def dynamic_quant_conv_test(self, activation_type, weight_type, extra_options=None):
    """Dynamically quantize the conv-bias model with explicit types.

    The quantized model is written to ``conv_bias.quant.<act><weight>.onnx``
    where each tag is ``u8`` or ``s8``.

    Args:
        activation_type: ``QuantType`` forwarded to ``quantize_dynamic``.
        weight_type: ``QuantType`` forwarded to ``quantize_dynamic``.
        extra_options: Optional dict forwarded to ``quantize_dynamic``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = 'conv_bias.fp32.onnx'
    self.construct_model(model_fp32_path)

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_int8_path = 'conv_bias.quant.{}{}.onnx'.format(activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Exactly two ConvInteger nodes are expected in the quantized graph.
    check_op_type_count(self, model_int8_path, ConvInteger=2)

    # Presumably: input #2 of every ConvInteger must carry the activation
    # tensor type (semantics defined by check_qtype_by_node_type).
    check_qtype_by_node_type(
        self,
        model_int8_path,
        {'ConvInteger': [['i', 2, activation_proto_qtype]]},
    )

    random_feed = {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def run_quantize_squeezes_of_opset(self, opset=13):
    """Quantize the conv+squeeze model for one opset in both formats.

    Args:
        opset: Opset forwarded to ``construct_model_conv_squeezes`` and
            embedded in the model file names.
    """
    np.random.seed(1)

    fp32_model = 'squeezes_opset{}_fp32.onnx'.format(opset)
    qop_model = 'squeezes_opset{}_uint8.onnx'.format(opset)
    qdq_model = 'squeezes_opset{}_uint8_qdq.onnx'.format(opset)

    self.construct_model_conv_squeezes(
        fp32_model,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        opset=opset,
    )

    # Both correctness checks use the same relaxed tolerances.
    tolerances = dict(rtol=0.01, atol=0.5)

    # ---- QOperator mode ----
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_model, qop_model, data_reader)

    # Only a single QuantizeLinear/DequantizeLinear pair is expected.
    check_op_type_count(self, qop_model, QuantizeLinear=1, DequantizeLinear=1)
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next(), **tolerances)

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(self, qdq_model, Conv=3, QuantizeLinear=8, DequantizeLinear=11)
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next(), **tolerances)
def test_quantize_batch_size_2(self):
    """Dynamically quantize the model built for batch size 2 and verify it."""
    batch, hidden_size, sequence_length = 2, 4, 4
    fp32_model = 'test_embed_layer_norm_unit_test_batch2.onnx'
    uint8_model = 'test_embed_layer_norm_unit_test_batch2_uint8.onnx'

    self.construct_model(batch, hidden_size, sequence_length, fp32_model)

    # Both integer inputs share the same [batch, sequence_length] shape.
    input_shapes = {
        input_name: [batch, sequence_length]
        for input_name in ('input_ids', 'segment_ids')
    }
    data_reader = self.input_feeds_int32(1, input_shapes)

    quantize_dynamic(fp32_model, uint8_model)

    # Quantization should not have any DequantizeLinear nodes:
    check_op_type_count(
        self,
        uint8_model,
        DequantizeLinear=0,
        QEmbedLayerNormalization=1,
    )

    data_reader.rewind()
    first_feed = data_reader.get_next()
    check_model_correctness(self, fp32_model, uint8_model, first_feed)
def test_quantize_concat(self):
    """Statically quantize the concat model in QOperator and QDQ mode."""
    np.random.seed(1)

    fp32_model = 'concat_fp32.onnx'
    qop_model = 'concat_uint8.onnx'
    qdq_model = 'concat_uint8_qdq.onnx'

    self.construct_model(fp32_model)

    # ---- QOperator mode ----
    data_reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]})
    quantize_static(fp32_model, qop_model, data_reader)
    check_op_type_count(
        self,
        qop_model,
        QLinearConv=3,
        QuantizeLinear=1,
        DequantizeLinear=1,
        QLinearConcat=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_model,
        Conv=3,
        QuantizeLinear=5,
        DequantizeLinear=8,
        Concat=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())
def test_quantize_transpose(self):
    """Statically quantize the matmul+transpose model in both formats."""
    np.random.seed(1)

    fp32_model = 'transpose_fp32.onnx'
    qop_model = 'transpose_uint8.onnx'
    qdq_model = 'transpose_uint8_qdq.onnx'

    self.construct_model_matmul_transpose(fp32_model, [3, 7], [7, 5], [5, 3])

    # ---- QOperator model ----
    data_reader = self.input_feeds(1, {'input': [3, 7]})
    quantize_static(fp32_model, qop_model, data_reader)

    # The node named "transpose_node" must not read 'matmul_output' directly;
    # a different input name indicates it consumes the quantized tensor.
    def transpose_reads_quantized_input(node):
        return node.name != "transpose_node" or node.input[0] != 'matmul_output'

    check_op_nodes(self, qop_model, transpose_reads_quantized_input)
    check_op_type_count(
        self,
        qop_model,
        QLinearMatMul=1,
        QuantizeLinear=1,
        DequantizeLinear=1,
        Transpose=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())

    # ---- QDQ model ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_model,
        MatMul=1,
        QuantizeLinear=2,
        DequantizeLinear=3,
        Transpose=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())
def static_quant_test_qdq(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options=None):
    """Statically quantize a model in QDQ format and validate the result.

    The quantized model is written to
    ``gemm_fp32.quant_dqd_<act><weight>.onnx`` where each tag is ``u8`` or
    ``s8``.

    Args:
        model_fp32_path: Path of the float model to quantize.
        data_reader: Calibration reader; rewound before quantization and
            again before the correctness check.
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_int8_path = 'gemm_fp32.quant_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    check_op_type_count(self, model_int8_path, Gemm=2, QuantizeLinear=3, DequantizeLinear=7)

    # Presumably: input #2 and output #0 of every QuantizeLinear must carry
    # the activation tensor type (semantics defined by the helper).
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ]
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def verify(self, per_channel, is_quant_type_int8):
    """Quantize the conv+relu model in QDQ and QOperator format and verify.

    Args:
        per_channel: Forwarded to ``quantize_static`` as both ``per_channel``
            and ``reduce_range``; also embedded in the model file names.
        is_quant_type_int8: When true, QInt8 is used for both activations
            and weights; otherwise QUInt8.
    """
    np.random.seed(1)

    fp32_model = "conv_relu_fp32.{}.onnx".format(per_channel)
    qdq_model = "conv_relu_quant_qdq.{}.onnx".format(per_channel)
    qop_model = "conv_relu_quant_qop.{}.onnx".format(per_channel)

    data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]})
    self.construct_model_conv_relu(fp32_model, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31])

    # One quantization type drives both activations and weights.
    quant_type = QuantType.QInt8 if is_quant_type_int8 else QuantType.QUInt8
    shared_options = dict(
        per_channel=per_channel,
        reduce_range=per_channel,
        activation_type=quant_type,
        weight_type=quant_type,
    )

    # ---- QDQ format ----
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ, **shared_options)
    data_reader.rewind()
    # topo sort check
    expected_order = [
        "DequantizeLinear",
        "QuantizeLinear",
        "DequantizeLinear",
        "Conv",
        "QuantizeLinear",
        "DequantizeLinear",
    ]
    check_op_type_order(self, qdq_model, expected_order)
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())

    # ---- QOperator format ----
    data_reader.rewind()
    quantize_static(fp32_model, qop_model, data_reader, quant_format=QuantFormat.QOperator, **shared_options)
    data_reader.rewind()
    check_op_type_count(self, qop_model, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())
def dynamic_attention_quant_test(self, model_fp32_path, model_int8_path, per_channel, reduce_range):
    """Dynamically quantize a model and expect QAttention + MatMulInteger.

    Args:
        model_fp32_path: Path of the float model to quantize.
        model_int8_path: Path where the quantized model is written.
        per_channel: Forwarded to ``quantize_dynamic``.
        reduce_range: Forwarded to ``quantize_dynamic``.
    """
    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        per_channel=per_channel,
        reduce_range=reduce_range,
    )
    check_op_type_count(self, model_int8_path, QAttention=1, MatMulInteger=1)

    # Accuracy is compared on one random float32 input of shape (1, 5, 10).
    random_feed = {'input': np.random.rand(1, 5, 10).astype(np.float32)}
    check_model_correctness(self, model_fp32_path, model_int8_path, random_feed)
def static_quant_test(self, model_fp32_path, model_int8_path):
    """Statically quantize a model with default settings and verify it.

    Args:
        model_fp32_path: Path of the float model to quantize.
        model_int8_path: Path where the quantized model is written.
    """
    calibration_reader = self.input_feeds(1, {'input': [5, 10]})
    quantize_static(model_fp32_path, model_int8_path, calibration_reader)
    calibration_reader.rewind()

    check_op_type_count(
        self,
        model_int8_path,
        QLinearMatMul=2,
        QLinearAdd=2,
        QuantizeLinear=1,
        DequantizeLinear=1,
    )
    first_feed = calibration_reader.get_next()
    check_model_correctness(self, model_fp32_path, model_int8_path, first_feed)
def quantize_gavgpool_test(self, activation_type, weight_type, extra_options=None):
    """Statically quantize the global-average-pool model (QOperator format).

    The quantized model is written to ``gavg_pool_<act><weight>.onnx`` where
    each tag is ``u8`` or ``s8``.

    Args:
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = "gavg_pool_fp32.onnx"
    # The reader is created before the model so the seeded random stream is
    # consumed in the same order as before.
    data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]})
    self.construct_model_gavgpool(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1])

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = "u8" if activation_is_unsigned else "s8"
    weight_type_str = "u8" if weight_type == QuantType.QUInt8 else "s8"
    model_q8_path = "gavg_pool_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # One float GlobalAveragePool is expected to remain next to the
    # quantized QLinearGlobalAveragePool.
    check_op_type_count(
        self,
        model_q8_path,
        QLinearConv=1,
        GlobalAveragePool=1,
        QLinearGlobalAveragePool=1,
        QuantizeLinear=1,
        DequantizeLinear=1,
    )

    # Presumably: each ["i"/"o", index, type] entry pins the tensor type of
    # that input/output (semantics defined by check_qtype_by_node_type).
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ],
        "QLinearGlobalAveragePool": [
            ["i", 2, activation_proto_qtype],
            ["i", 4, activation_proto_qtype],
        ],
    }
    check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
def static_quant_test_qdq(self, model_fp32_path, model_int8_path):
    """Statically quantize a model in QDQ format and verify it.

    Args:
        model_fp32_path: Path of the float model to quantize.
        model_int8_path: Path where the quantized model is written.
    """
    calibration_reader = self.input_feeds(1, {'input': [5, 10]})
    quantize_static(
        model_fp32_path,
        model_int8_path,
        calibration_reader,
        quant_format=QuantFormat.QDQ,
    )
    calibration_reader.rewind()

    check_op_type_count(
        self,
        model_int8_path,
        MatMul=2,
        Add=2,
        QuantizeLinear=4,
        DequantizeLinear=8,
    )
    first_feed = calibration_reader.get_next()
    check_model_correctness(self, model_fp32_path, model_int8_path, first_feed)
def test_quantize_maxpool(self):
    """Statically quantize the conv+maxpool model in QOperator and QDQ mode."""
    np.random.seed(1)

    fp32_model = 'maxpool_fp32.onnx'
    qop_model = 'maxpool_uint8.onnx'
    qdq_model = 'maxpool_uint8_qdq.onnx'

    self.construct_model_conv_maxpool(
        fp32_model,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        {'kernel_shape': [3, 3]},
        [1, 3, 22, 38],
    )

    # ---- QOperator mode ----
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_model, qop_model, data_reader)

    # The node named "maxpool_node" must not read 'conv_output' directly;
    # a different input name indicates it consumes the quantized tensor.
    def maxpool_reads_quantized_input(node):
        return node.name != "maxpool_node" or node.input[0] != 'conv_output'

    check_op_nodes(self, qop_model, maxpool_reads_quantized_input)
    check_op_type_count(
        self,
        qop_model,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        MaxPool=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qop_model, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(fp32_model, qdq_model, data_reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_model,
        Conv=1,
        QuantizeLinear=2,
        DequantizeLinear=3,
        MaxPool=1,
    )
    data_reader.rewind()
    check_model_correctness(self, fp32_model, qdq_model, data_reader.get_next())
def static_quant_test(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options=None,
):
    """Statically quantize a model in QOperator format and validate it.

    The quantized model is written to ``relu_fp32.quant_<act><weight>.onnx``
    where each tag is ``u8`` or ``s8``. With QUInt8 activations no ``Relu``
    node and one Quantize/Dequantize pair are expected; otherwise one
    ``Relu`` and two pairs.

    Args:
        model_fp32_path: Path of the float model to quantize.
        data_reader: Calibration reader; rewound before quantization and
            again before the correctness check.
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = "u8" if activation_is_unsigned else "s8"
    weight_type_str = "u8" if weight_type == QuantType.QUInt8 else "s8"
    model_int8_path = "relu_fp32.quant_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Expected counts depend on whether activations are unsigned.
    qdq_count = 1 if activation_is_unsigned else 2
    relu_count = 0 if activation_is_unsigned else 1
    check_op_type_count(
        self,
        model_int8_path,
        QGemm=2,
        QuantizeLinear=qdq_count,
        DequantizeLinear=qdq_count,
        Relu=relu_count,
    )

    # Presumably: each ["i"/"o", index, type] entry pins the tensor type of
    # that input/output (semantics defined by check_qtype_by_node_type).
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ],
        "DequantizeLinear": [["i", 2, activation_proto_qtype]],
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def static_quant_test_qdq(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options=None,
):
    """Statically quantize a model in QDQ format and validate it.

    The quantized model is written to
    ``gemm_fp32.quant_dqd_<act><weight>.onnx`` where each tag is ``u8`` or
    ``s8``. With QUInt8 activations 3 QuantizeLinear, 7 DequantizeLinear and
    no ``Clip`` nodes are expected; otherwise 4, 8 and one ``Clip``.

    Args:
        model_fp32_path: Path of the float model to quantize.
        data_reader: Calibration reader; rewound before quantization and
            again before the correctness check.
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = "u8" if activation_is_unsigned else "s8"
    weight_type_str = "u8" if weight_type == QuantType.QUInt8 else "s8"
    model_int8_path = "gemm_fp32.quant_dqd_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # Expected counts depend on whether activations are unsigned.
    if activation_is_unsigned:
        clip_count, q_count, dq_count = 0, 3, 7
    else:
        clip_count, q_count, dq_count = 1, 4, 8
    check_op_type_count(
        self,
        model_int8_path,
        Gemm=2,
        QuantizeLinear=q_count,
        DequantizeLinear=dq_count,
        Clip=clip_count,
    )

    # Presumably: input #2 and output #0 of every QuantizeLinear must carry
    # the activation tensor type (semantics defined by the helper).
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ]
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def quantize_gavgpool_test(self, activation_type, weight_type, extra_options=None):
    """Statically quantize the global-average-pool model (QOperator format).

    The quantized model is written to ``gavg_pool_<act><weight>.onnx`` where
    each tag is ``u8`` or ``s8``.

    Args:
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = 'gavg_pool_fp32.onnx'
    # The reader is created before the model so the seeded random stream is
    # consumed in the same order as before.
    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_gavgpool(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1])

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_q8_path = 'gavg_pool_{}{}.onnx'.format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # One float GlobalAveragePool is expected to remain next to the
    # quantized QLinearGlobalAveragePool.
    check_op_type_count(
        self,
        model_q8_path,
        QLinearConv=1,
        GlobalAveragePool=1,
        QLinearGlobalAveragePool=1,
        QuantizeLinear=1,
        DequantizeLinear=1,
    )

    # Presumably: each ['i'/'o', index, type] entry pins the tensor type of
    # that input/output (semantics defined by check_qtype_by_node_type).
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ],
        'QLinearGlobalAveragePool': [
            ['i', 2, activation_proto_qtype],
            ['i', 4, activation_proto_qtype],
        ],
    }
    check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
def test_activation_only(self):
    """Statically quantize the clip+relu model and expect no Q/DQ nodes."""
    tmp_dir = Path(self._tmp_model_dir.name)
    float_model_path = str(tmp_dir / "float_relu_convs_model.onnx")
    qdq_model_path = str(tmp_dir / "qdq_relu_convs_model.onnx")

    self.construct_model_clip_relu(float_model_path, [1, 3, 1, 3], [1, 3, 1, 3])
    data_reader = self.input_feeds(2, {"input": [1, 3, 1, 3]})

    quantize_static(float_model_path, qdq_model_path, data_reader)

    # Clip and Relu must survive, and no Quantize/Dequantize node is added.
    check_op_type_count(
        self,
        qdq_model_path,
        Clip=1,
        Relu=1,
        QuantizeLinear=0,
        DequantizeLinear=0,
    )
def verify_should_not_trigger(self, quantize_mode='static'):
    """Quantize the pad-only model and expect no quantization nodes.

    Args:
        quantize_mode: ``'static'`` passes the data reader to
            ``self.quantize_model``; any other value passes ``None``.
    """
    np.random.seed(108)

    fp32_model = 'qop_pad_notrigger_fp32_{}.onnx'.format(quantize_mode)
    i8_model = 'qop_pad_notrigger_i8_{}.onnx'.format(quantize_mode)

    # The reader is created before the model so the seeded random stream is
    # consumed in the same order as before.
    data_reader = self.input_feeds(1, {'input': [1, 16, 31, 31]})
    self.construct_model_pad(fp32_model, 'constant', [1, 16, 31, 31], [0, 0, 1, 2, 0, 0, 3, 4])

    calibration_reader = data_reader if quantize_mode == 'static' else None
    self.quantize_model(fp32_model, i8_model, calibration_reader)
    data_reader.rewind()

    # DequantizeLinear=0: the pad node has not been quantized because its
    # input is not quantized.
    absent_ops = ('DynamicQuantizeLinear', 'QuantizeLinear', 'DequantizeLinear')
    check_op_type_count(self, i8_model, **dict.fromkeys(absent_ops, 0))

    check_model_correctness(self, fp32_model, i8_model, data_reader.get_next())
def test_quantize_reshape(self):
    """Statically quantize the model from construct_model_gavgpool and verify.

    NOTE(review): despite the test name, the model builder and file names
    used here are the global-average-pool ones -- confirm this is intended.
    """
    np.random.seed(1)

    fp32_model = 'gavg_pool_fp32.onnx'
    int8_model = 'gavg_pool_fp32.quant.onnx'

    # The reader is created before the model so the seeded random stream is
    # consumed in the same order as before.
    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_gavgpool(fp32_model, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1])

    quantize_static(fp32_model, int8_model, data_reader)
    data_reader.rewind()

    check_op_type_count(
        self,
        int8_model,
        QLinearConv=1,
        GlobalAveragePool=1,
        QLinearGlobalAveragePool=1,
        QuantizeLinear=1,
        DequantizeLinear=1,
    )
    check_model_correctness(self, fp32_model, int8_model, data_reader.get_next())
def quantize_resize_test(self, activation_type, weight_type, extra_options=None):
    """Statically quantize the conv+resize model in QOperator and QDQ mode.

    Quantized models are written to ``resize_<act><weight>.onnx`` and
    ``resize_<act><weight>_qdq.onnx`` where each tag is ``u8`` or ``s8``.

    Args:
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = 'resize_fp32.onnx'

    kwargs = {
        'coordinate_transformation_mode': 'asymmetric',
        'mode': 'nearest',
        'nearest_mode': 'floor',
    }
    self.construct_model_conv_resize(
        model_fp32_path,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        [1, 3, 48, 80],
        kwargs,
        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        [1.0, 1.0, 2.0, 2.0],
        None,
    )

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_uint8_path = 'resize_{}{}.onnx'.format(activation_type_str, weight_type_str)
    model_uint8_qdq_path = 'resize_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str)

    # ---- QOperator mode ----
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(
        model_fp32_path,
        model_uint8_path,
        data_reader,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # The node named "resize_node" must not read 'conv_output' directly;
    # a different input name indicates it consumes the quantized tensor.
    check_op_nodes(
        self,
        model_uint8_path,
        lambda node: (node.name != "resize_node" or node.input[0] != 'conv_output'),
    )
    check_op_type_count(
        self,
        model_uint8_path,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        Resize=1,
    )
    # Presumably: each ['i'/'o', index, type] entry pins the tensor type of
    # that input/output (semantics defined by check_qtype_by_node_type).
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ],
        'DequantizeLinear': [['i', 2, activation_proto_qtype]],
    }
    check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes)
    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_uint8_qdq_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )
    check_op_type_count(
        self,
        model_uint8_qdq_path,
        Conv=1,
        QuantizeLinear=3,
        DequantizeLinear=4,
        Resize=1,
    )
    # Only QuantizeLinear types are checked for the QDQ model.
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ]
    }
    check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes)
    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())
def quantize_maxpool_test(self, activation_type, weight_type, extra_options=None):
    """Statically quantize the conv+maxpool model in QOperator and QDQ mode.

    Quantized models are written to ``maxpool_<act><weight>.onnx`` and
    ``maxpool_dqd_<act><weight>.onnx`` where each tag is ``u8`` or ``s8``.

    Args:
        activation_type: ``QuantType`` forwarded to ``quantize_static``.
        weight_type: ``QuantType`` forwarded to ``quantize_static``.
        extra_options: Optional dict forwarded to ``quantize_static``;
            ``None`` is treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if extra_options is None:
        extra_options = {}

    np.random.seed(1)
    model_fp32_path = 'maxpool_fp32.onnx'
    self.construct_model_conv_maxpool(
        model_fp32_path,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        {'kernel_shape': [3, 3]},
        [1, 3, 22, 38],
    )
    data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})

    activation_is_unsigned = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_unsigned else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_unsigned else 's8'
    weight_type_str = 'u8' if weight_type == QuantType.QUInt8 else 's8'
    model_q8_path = 'maxpool_{}{}.onnx'.format(activation_type_str, weight_type_str)
    model_q8_qdq_path = 'maxpool_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str)

    # ---- QOperator mode ----
    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # The node named "maxpool_node" must not read 'conv_output' directly;
    # a different input name indicates it consumes the quantized tensor.
    check_op_nodes(
        self,
        model_q8_path,
        lambda node: (node.name != "maxpool_node" or node.input[0] != 'conv_output'),
    )
    check_op_type_count(
        self,
        model_q8_path,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        MaxPool=1,
    )
    # Presumably: each ['i'/'o', index, type] entry pins the tensor type of
    # that input/output (semantics defined by check_qtype_by_node_type).
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ],
        'DequantizeLinear': [['i', 2, activation_proto_qtype]],
    }
    check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())

    # ---- QDQ mode ----
    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_qdq_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )
    check_op_type_count(
        self,
        model_q8_qdq_path,
        Conv=1,
        QuantizeLinear=3,
        DequantizeLinear=4,
        MaxPool=1,
    )
    # The QDQ model is checked with the same type expectations, rebuilt as a
    # fresh dict exactly as the original test did.
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ],
        'DequantizeLinear': [['i', 2, activation_proto_qtype]],
    }
    check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())