def test_quantize_concat(self):
    """Quantize the concat model in QOperator and QDQ modes and validate both.

    For each mode the test checks the operator-type counts of the quantized
    model and compares its outputs with the float32 model on one feed from
    the data reader.
    """
    np.random.seed(1)
    fp32_path = 'concat_fp32.onnx'
    qop_path = 'concat_uint8.onnx'
    qdq_path = 'concat_uint8_qdq.onnx'
    self.construct_model(fp32_path)

    reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]})

    # QOperator mode: no quant_format is passed to quantize_static.
    quantize_static(fp32_path, qop_path, reader)
    check_op_type_count(
        self,
        qop_path,
        QLinearConv=3,
        QuantizeLinear=1,
        DequantizeLinear=1,
        QLinearConcat=1,
    )
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())

    # QDQ mode: the reader is rewound so calibration starts from the first feed.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_path,
        Conv=3,
        QuantizeLinear=5,
        DequantizeLinear=8,
        Concat=1,
    )
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())
def run_quantize_squeezes_of_opset(self, opset=13):
    """Quantize the conv+squeezes model built for `opset` and validate it.

    Args:
        opset: Opset version forwarded to `construct_model_conv_squeezes`
            and embedded in the generated file names.
    """
    np.random.seed(1)
    fp32_path, qop_path, qdq_path = (
        template.format(opset)
        for template in (
            'squeezes_opset{}_fp32.onnx',
            'squeezes_opset{}_uint8.onnx',
            'squeezes_opset{}_uint8_qdq.onnx',
        )
    )
    # Looser tolerances than the check_model_correctness defaults are used in both modes.
    tolerances = dict(rtol=0.01, atol=0.5)

    self.construct_model_conv_squeezes(fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], opset=opset)

    # QOperator mode (no quant_format passed).
    reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_path, qop_path, reader)
    # A single Q/DQ pair is expected: the squeezes should run as xint8
    # operators between them.
    check_op_type_count(self, qop_path, QuantizeLinear=1, DequantizeLinear=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next(), **tolerances)

    # QDQ mode.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(self, qdq_path, Conv=3, QuantizeLinear=8, DequantizeLinear=11)
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next(), **tolerances)
def perform_quantization(self, activations, weight, act_sym, wgt_sym):
    """Quantize a single-Conv model and return its quantization parameters.

    A model ``RES = Conv(ACT, WGT)`` is written to ``model.onnx``, statically
    quantized (QOperator format, int8 activations and weights) into
    ``quantized-model.onnx``, and the zero points / scales are read back.

    Args:
        activations: Sequence of calibration inputs; each is fed as ``ACT``
            and the first one's ``.shape`` defines the model input shape.
        weight: Array stored as the ``WGT`` initializer.
        act_sym: Value passed as the ``ActivationSymmetric`` extra option.
        wgt_sym: Value passed as the ``WeightSymmetric`` extra option.

    Returns:
        Tuple ``(act_zp, act_sc, wgt_zp, wgt_sc)``.
    """
    float_model_file = "model.onnx"
    quantized_model_file = "quantized-model.onnx"

    # One-layer convolution model.
    act_info = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
    # Built as in the original flow, although the graph below does not reference it.
    wgt_info = helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape)
    res_info = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None])
    weight_initializer = numpy_helper.from_array(weight, "WGT")
    conv = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"])
    graph = helper.make_graph([conv], "test", [act_info], [res_info], initializer=[weight_initializer])
    float_model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
    onnx.save(float_model, float_model_file)

    class DummyDataReader(quantization.CalibrationDataReader):
        """Feeds each calibration activation once, then returns None."""

        def __init__(self):
            self.iterator = ({"ACT": sample} for sample in activations)

        def get_next(self):
            return next(self.iterator, None)

    # Quantize the model.
    quantization.quantize_static(
        model_input=float_model_file,
        model_output=quantized_model_file,
        calibration_data_reader=DummyDataReader(),
        quant_format=quantization.QuantFormat.QOperator,
        activation_type=quantization.QuantType.QInt8,
        weight_type=quantization.QuantType.QInt8,
        op_types_to_quantize=["Conv", "MatMul"],
        extra_options={"WeightSymmetric": wgt_sym, "ActivationSymmetric": act_sym},
    )

    quantized_model = onnx.load(quantized_model_file)

    def initializer_named(name):
        # Indexing [0] keeps the IndexError raised when no initializer matches.
        return [init for init in quantized_model.graph.initializer if init.name == name][0]

    # Extract scales and zero points for activations and weights.
    act_zp = initializer_named("ACT_zero_point").int32_data[0]
    act_sc = initializer_named("ACT_scale").float_data[0]
    wgt_zp = initializer_named("WGT_zero_point").int32_data[0]
    wgt_sc = initializer_named("WGT_scale").float_data[0]

    return act_zp, act_sc, wgt_zp, wgt_sc
def main():
    """Calibrate and statically quantize the ResNet50 model with fixed local paths."""
    source_model = './resnet50_v1.onnx'
    quantized_model = './calibrated_quantized_model.onnx'

    # Calibration data is read from the ./test_images directory.
    calibration_reader = ResNet50DataReader('./test_images')
    quantize_static(source_model, quantized_model, calibration_reader)
    print('Calibrated and quantized model saved.')
def test_quantize_resize(self):
    """Quantize the conv+resize model in QOperator and QDQ modes and validate both."""
    np.random.seed(1)
    fp32_path = 'resize_fp32.onnx'
    qop_path = 'resize_uint8.onnx'
    qdq_path = 'resize_uint8_qdq.onnx'

    resize_attrs = {
        'coordinate_transformation_mode': 'asymmetric',
        'mode': 'nearest',
        'nearest_mode': 'floor',
    }
    self.construct_model_conv_resize(
        fp32_path,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        [1, 3, 48, 80],
        resize_attrs,
        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        [1.0, 1.0, 2.0, 2.0],
        None,
    )

    def resize_is_not_fed_by_float_conv(node):
        # Fails only for the node named "resize_node" when its first input is
        # still 'conv_output', i.e. when Resize was left consuming the float tensor.
        return not (node.name == "resize_node" and node.input[0] == 'conv_output')

    # QOperator mode (no quant_format passed).
    reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_path, qop_path, reader)
    check_op_nodes(self, qop_path, resize_is_not_fed_by_float_conv)
    check_op_type_count(self, qop_path, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=2, Resize=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())

    # QDQ mode.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(self, qdq_path, Conv=1, QuantizeLinear=2, DequantizeLinear=3, Resize=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())
def test_quantize_avgpool(self):
    """Quantize the conv+average-pool model in QOperator and QDQ modes and validate both."""
    np.random.seed(1)
    fp32_path = 'avgpool_fp32.onnx'
    qop_path = 'avgpool_uint8.onnx'
    qdq_path = 'avgpool_uint8_qdq.onnx'

    self.construct_model_conv_avgpool(
        fp32_path,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        {'kernel_shape': [3, 3]},
        [1, 3, 22, 38],
    )

    # QOperator mode (no quant_format passed).
    reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_path, qop_path, reader)
    check_op_type_count(
        self,
        qop_path,
        QLinearConv=1,
        QuantizeLinear=1,
        DequantizeLinear=2,
        QLinearAveragePool=1,
    )
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())

    # QDQ mode.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(
        self,
        qdq_path,
        Conv=1,
        QuantizeLinear=3,
        DequantizeLinear=4,
        AveragePool=1,
    )
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())
def test_quantize_transpose(self):
    """Quantize the matmul+transpose model in QOperator and QDQ modes and validate both."""
    np.random.seed(1)
    fp32_path = 'transpose_fp32.onnx'
    qop_path = 'transpose_uint8.onnx'
    qdq_path = 'transpose_uint8_qdq.onnx'

    self.construct_model_matmul_transpose(fp32_path, [3, 7], [7, 5], [5, 3])

    def transpose_is_not_fed_by_float_matmul(node):
        # Fails only for the node named "transpose_node" when its first input
        # is still 'matmul_output', i.e. when Transpose kept the float tensor.
        return not (node.name == "transpose_node" and node.input[0] == 'matmul_output')

    # QOperator model (no quant_format passed).
    reader = self.input_feeds(1, {'input': [3, 7]})
    quantize_static(fp32_path, qop_path, reader)
    check_op_nodes(self, qop_path, transpose_is_not_fed_by_float_matmul)
    check_op_type_count(self, qop_path, QLinearMatMul=1, QuantizeLinear=1, DequantizeLinear=1, Transpose=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())

    # QDQ model.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(self, qdq_path, MatMul=1, QuantizeLinear=2, DequantizeLinear=3, Transpose=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())
def static_quant_test_qdq(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options=None):
    """Quantize a model in QDQ format and validate node counts, Q/DQ types and outputs.

    Args:
        model_fp32_path: Path of the float32 model to quantize.
        data_reader: Calibration reader; it is rewound before quantization
            and again before the correctness check.
        activation_type: QuantType used for activations.
        weight_type: QuantType used for weights.
        extra_options: Optional dict forwarded to `quantize_static`. Defaults
            to a fresh empty dict per call (a `{}` default would be one
            object shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    activation_is_u8 = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_u8 else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_u8 else 's8'
    weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
    model_int8_path = 'gemm_fp32.quant_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    quant_nodes = {'Gemm': 2, 'QuantizeLinear': 3, 'DequantizeLinear': 7}
    check_op_type_count(self, model_int8_path, **quant_nodes)

    # Input index 2 and output index 0 of every QuantizeLinear must carry the
    # activation quantized type.
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ]
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def verify(self, per_channel):
    """Quantize the conv+clip model in QDQ and QOperator formats and validate both.

    Args:
        per_channel: Forwarded to `quantize_static` as both `per_channel`
            and `reduce_range`, and embedded in the generated file names.
    """
    np.random.seed(1)
    fp32_path = 'conv_clip_fp32.{}.onnx'.format(per_channel)
    qdq_path = 'conv_clip_quant_qdq.{}.onnx'.format(per_channel)
    qop_path = 'conv_clip_quant_qop.{}.onnx'.format(per_channel)
    # reduce_range intentionally follows per_channel in both runs.
    shared_options = dict(per_channel=per_channel, reduce_range=per_channel)

    reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_conv_clip(fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [15376])

    # QDQ format.
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ, **shared_options)
    reader.rewind()
    # Topological-order check of the op types in the QDQ model.
    expected_order = [
        'DequantizeLinear',
        'QuantizeLinear',
        'DequantizeLinear',
        'Conv',
        'QuantizeLinear',
        'DequantizeLinear',
        'Reshape',
        'QuantizeLinear',
        'DequantizeLinear',
    ]
    check_op_type_order(self, qdq_path, expected_order)
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())

    # QOperator format.
    reader.rewind()
    quantize_static(fp32_path, qop_path, reader, quant_format=QuantFormat.QOperator, **shared_options)
    reader.rewind()
    check_op_type_count(self, qop_path, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())
def quantize_model(
    self,
    model_fp32_path,
    model_i8_path,
    data_reader=None,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QUInt8,
    extra_options=None,
):
    """Quantize a model statically when a data reader is given, dynamically otherwise.

    Both paths use ``reduce_range=True``. `activation_type` is only used by
    the static path.

    Args:
        model_fp32_path: Path of the float32 input model.
        model_i8_path: Path where the quantized model is written.
        data_reader: Calibration reader; ``None`` selects dynamic quantization.
        activation_type: QuantType for activations (static path only).
        weight_type: QuantType for weights.
        extra_options: Optional dict forwarded to the quantizer. Defaults to
            a fresh empty dict per call (a `{}` default would be one object
            shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    if data_reader is not None:
        quantize_static(
            model_fp32_path,
            model_i8_path,
            data_reader,
            reduce_range=True,
            quant_format=QuantFormat.QOperator,
            activation_type=activation_type,
            weight_type=weight_type,
            extra_options=extra_options,
        )
    else:
        quantize_dynamic(
            model_fp32_path,
            model_i8_path,
            reduce_range=True,
            weight_type=weight_type,
            extra_options=extra_options,
        )
def test_create_weight_matching(self):
    """Check that create_weight_matching pairs every float weight with its dequantized twin."""
    tmp_dir = Path(self._tmp_model_dir.name)

    # Setup: create the float model.
    float_model_path = str(tmp_dir / "float_model3.onnx")
    construct_test_model1(float_model_path, activations_as_outputs=False)

    # Setup: create the QDQ model.
    data_reader = TestDataReader()
    qdq_model_path = str(tmp_dir / "qdq_model3.onnx")
    quantize_static(
        float_model_path,
        qdq_model_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        per_channel=False,
        reduce_range=False,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
    )

    # Call the function under test and verify that all weights are present
    # with matching shapes.
    matched_weights = create_weight_matching(float_model_path, qdq_model_path)
    weight_names = ["W1", "W3", "W5", "B1", "B3", "B5"]
    for weight_name in weight_names:
        pair = matched_weights[weight_name]
        float_array = pair["float"]
        dq_array = pair["dequantized"]
        self.assertEqual(float_array.shape, dq_array.shape)

    # NOTE(review): the assertion requires the metric to be > 0.1 while the
    # message says "too big"; presumably compute_weight_error returns a
    # higher-is-better value. Confirm against its definition.
    weights_error = compute_weight_error(matched_weights)
    for weight_name in weight_names:
        error_value = weights_error[weight_name]
        self.assertGreater(
            error_value,
            0.1,
            f"{weight_name} quantization error {weights_error[weight_name]} too big!",
        )
def verify_quantize_conv(self, has_bias, per_channel):
    """Quantize the conv model in QDQ and QOperator formats and validate both.

    Args:
        has_bias: Forwarded to `construct_model_conv`; one extra
            DequantizeLinear is expected in the QDQ model when true.
        per_channel: Forwarded to `quantize_static` as both `per_channel`
            and `reduce_range`.
    """
    np.random.seed(1)
    name_args = (has_bias, per_channel)
    fp32_path = 'conv_fp32.{}.{}.onnx'.format(*name_args)
    qdq_path = 'conv_quant_qdq.{}.{}.onnx'.format(*name_args)
    qop_path = 'conv_quant_qop.{}.{}.onnx'.format(*name_args)
    # reduce_range intentionally follows per_channel in both runs.
    shared_options = dict(per_channel=per_channel, reduce_range=per_channel)

    reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_conv(fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31], has_bias)

    # QDQ format.
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ, **shared_options)
    reader.rewind()
    if has_bias:
        expected_dq = 4
    else:
        expected_dq = 3
    check_op_type_count(self, qdq_path, Conv=1, QuantizeLinear=2, DequantizeLinear=expected_dq)
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())

    # QOperator format.
    reader.rewind()
    quantize_static(fp32_path, qop_path, reader, quant_format=QuantFormat.QOperator, **shared_options)
    reader.rewind()
    check_op_type_count(self, qop_path, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())
def test_create_weight_matching_per_channel(self):
    """Check weight matching on a per-channel quantized Add -> 3x MatMul model.

    Float model layout:

                 (input)
                    |
                   Add
               /    |    \
          MatMul  MatMul  MatMul
             |      |      |
         (output)(output)(output)
    """
    tmp_dir = Path(self._tmp_model_dir.name)
    float_model_path = str(tmp_dir / "float_model4.onnx")

    def value_info(tensor_name):
        return helper.make_tensor_value_info(tensor_name, TensorProto.FLOAT, [5, 5])

    def random_weight():
        return np.random.normal(0, 0.1, [5, 5]).astype(np.float32)

    graph_input = value_info("input")
    graph_outputs = [value_info("M"), value_info("N"), value_info("O")]

    # Weights are drawn in the order P, Q, R. "S" reuses the data of "R", so
    # two initializers hold identical values.
    p_data = random_weight()
    q_data = random_weight()
    r_data = random_weight()
    initializers = [
        onnx.numpy_helper.from_array(data, name=init_name)
        for init_name, data in (("P", p_data), ("Q", q_data), ("R", r_data), ("S", r_data))
    ]

    nodes = [onnx.helper.make_node("Add", ["input", "P"], ["T"], name="Add")]
    for index, (weight_name, output_name) in enumerate((("Q", "M"), ("R", "N"), ("S", "O")), start=1):
        nodes.append(
            onnx.helper.make_node("MatMul", ["T", weight_name], [output_name], name="MatMul{}".format(index))
        )

    graph = helper.make_graph(
        nodes,
        "QDQ_Test",
        [graph_input],
        graph_outputs,
        initializer=initializers,
    )
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    onnx.save(model, float_model_path)

    # Setup: create the per-channel QDQ model.
    qdq_model_path = str(tmp_dir / "qdq_model4.onnx")
    quantize_static(
        float_model_path,
        qdq_model_path,
        TestDataReader([5, 5]),
        quant_format=QuantFormat.QDQ,
        per_channel=True,
        reduce_range=False,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
    )

    # Call the function under test and verify that all weights are present
    # with matching shapes.
    matched_weights = create_weight_matching(float_model_path, qdq_model_path)
    for weight_name in ("P", "Q", "R", "S"):
        pair = matched_weights[weight_name]
        self.assertEqual(pair["float"].shape, pair["dequantized"].shape)
def deploy_onnx_quantized(dataloader: DataLoader, model: nn.Module, fuse: bool, name: str):
    """Export a model to ONNX, statically quantize it, and benchmark both files.

    Args:
        dataloader: Loader wrapped by ONNXQuantizationDataReader for calibration.
        model: Model to export; a deep copy is used so the caller's instance
            is left untouched.
        fuse: When true, ``model.fuse()`` is called on the copy and
            ``_fused`` is appended to the output file stem.
        name: Stem of the generated files, placed in the current directory.
    """
    exported = deepcopy(model).eval()
    stem = f'./{name}'
    if fuse:
        exported.fuse()
        stem += '_fused'
    float_path = stem + '_float.onnx'
    quantized_path = stem + '_quant.onnx'

    dummy_batch = torch.rand(1, 3, 224, 224)
    onnx.export(
        model=exported,
        args=(dummy_batch, ),
        f=float_path,
        input_names=['input_image'],
        output_names=['logits'],
        opset_version=12,
    )

    calibration_reader = ONNXQuantizationDataReader(quant_loader=dataloader, input_name='input_image')
    quantize_static(
        model_input=float_path,
        model_output=quantized_path,
        calibration_data_reader=calibration_reader,
    )

    # Report average time and file size (bytes / 1e6) for the float model
    # first, then the quantized one.
    for model_path in (float_path, quantized_path):
        avg_time = benchmark_onnx_model(rt.InferenceSession(model_path))
        size = Path(model_path).stat().st_size / 1e6
        print(
            f'Benchmarking {model_path}: Avg. inference@CPU: {avg_time:3.2f} ms, Size: {size:2.2f} MB'
        )
def verify(self, per_channel, is_quant_type_int8):
    """Quantize the conv+relu model in QDQ and QOperator formats and validate both.

    Args:
        per_channel: Forwarded to `quantize_static` as both `per_channel`
            and `reduce_range`, and embedded in the generated file names.
        is_quant_type_int8: Selects QInt8 (true) or QUInt8 (false) for both
            activations and weights.
    """
    np.random.seed(1)
    fp32_path = "conv_relu_fp32.{}.onnx".format(per_channel)
    qdq_path = "conv_relu_quant_qdq.{}.onnx".format(per_channel)
    qop_path = "conv_relu_quant_qop.{}.onnx".format(per_channel)

    quant_type = QuantType.QInt8 if is_quant_type_int8 else QuantType.QUInt8
    # The same options apply to both formats; only quant_format differs.
    shared_options = dict(
        per_channel=per_channel,
        reduce_range=per_channel,
        activation_type=quant_type,
        weight_type=quant_type,
    )

    reader = self.input_feeds(1, {"input": [1, 8, 33, 33]})
    self.construct_model_conv_relu(fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31])

    # QDQ format.
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ, **shared_options)
    reader.rewind()
    # Topological-order check: no Relu is expected in the quantized model.
    expected_order = [
        "DequantizeLinear",
        "QuantizeLinear",
        "DequantizeLinear",
        "Conv",
        "QuantizeLinear",
        "DequantizeLinear",
    ]
    check_op_type_order(self, qdq_path, expected_order)
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())

    # QOperator format.
    reader.rewind()
    quantize_static(fp32_path, qop_path, reader, quant_format=QuantFormat.QOperator, **shared_options)
    reader.rewind()
    check_op_type_count(self, qop_path, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=1)
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())
def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None):
    """Quantize with ``reduce_range=True``: dynamically without a reader, statically with one.

    Args:
        model_fp32_path: Path of the float32 input model.
        model_i8_path: Path where the quantized model is written.
        data_reader: Calibration reader; ``None`` selects dynamic quantization.
    """
    if data_reader is None:
        quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True)
        return
    quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True)
def test_create_activation_matching_present(self):
    """Check that activation matching pairs float and QDQ activations for every expected tensor."""
    tmp_dir = Path(self._tmp_model_dir.name)

    float_model_path = str(tmp_dir / "float_model2.onnx")
    construct_test_model1(float_model_path, activations_as_outputs=False)

    data_reader = TestDataReader()
    qdq_model_path = str(tmp_dir / "qdq_model2.onnx")
    quantize_static(
        float_model_path,
        qdq_model_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        per_channel=False,
        reduce_range=False,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
    )

    # Collect activations from the float model, then from the QDQ model,
    # rewinding the reader before each pass.
    data_reader.rewind()
    augmented_float_model_path = str(tmp_dir.joinpath("augmented_float_model2.onnx"))
    float_activations = augment_model_collect_activations(
        float_model_path, augmented_float_model_path, data_reader
    )

    data_reader.rewind()
    augmented_qdq_model_path = str(tmp_dir.joinpath("augmented_qdq_model2.onnx"))
    qdq_activations = augment_model_collect_activations(
        qdq_model_path, augmented_qdq_model_path, data_reader
    )

    compare_dict = create_activation_matching(qdq_activations, float_activations)

    # 'Conv1Out' is combined with 'Relu2Out', so it must not appear on its own.
    tensor_names = ["Relu1Out", "Relu2Out", "Conv2Out", "Conv3Out", "AddOut"]
    for tensor_name in tensor_names:
        entry = compare_dict[tensor_name]
        for key in ("float", "pre_qdq", "post_qdq"):
            self.assertTrue(entry[key])

    self.assertFalse(compare_dict.get("Conv1Out"))

    # NOTE(review): both assertions require the metric to be > 0.01 while the
    # messages say "exceeds"; presumably compute_activation_error returns
    # higher-is-better values. Confirm against its definition.
    activations_error = compute_activation_error(compare_dict)
    for tensor_name in tensor_names:
        errors = activations_error[tensor_name]
        self.assertGreater(
            errors["xmodel_err"],
            0.01,
            f"{tensor_name} cross model error {errors['xmodel_err']} exceeds threashold.",
        )
        self.assertGreater(
            errors["qdq_err"],
            0.01,
            f"{tensor_name} qdq error {errors['qdq_err']} exceeds threashold.",
        )
def static_quant_test(self, model_fp32_path, model_int8_path):
    """Statically quantize the model with default options and validate counts and outputs.

    Args:
        model_fp32_path: Path of the float32 input model.
        model_int8_path: Path where the quantized model is written.
    """
    reader = self.input_feeds(1, {'input': [5, 10]})
    quantize_static(model_fp32_path, model_int8_path, reader)
    # Rewind so the correctness check sees the first feed again.
    reader.rewind()

    check_op_type_count(
        self,
        model_int8_path,
        QLinearMatMul=2,
        QLinearAdd=2,
        QuantizeLinear=1,
        DequantizeLinear=1,
    )
    check_model_correctness(self, model_fp32_path, model_int8_path, reader.get_next())
def quantize_gavgpool_test(self, activation_type, weight_type, extra_options=None):
    """Quantize the global-average-pool model (QOperator) and validate it.

    Args:
        activation_type: QuantType used for activations.
        weight_type: QuantType used for weights.
        extra_options: Optional dict forwarded to `quantize_static`. Defaults
            to a fresh empty dict per call (a `{}` default would be one
            object shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    np.random.seed(1)
    model_fp32_path = "gavg_pool_fp32.onnx"
    data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]})
    self.construct_model_gavgpool(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1])

    activation_is_u8 = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_u8 else TensorProto.INT8
    activation_type_str = "u8" if activation_is_u8 else "s8"
    weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
    model_q8_path = "gavg_pool_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # One GlobalAveragePool is expected to stay unquantized next to one
    # QLinearGlobalAveragePool.
    quant_nodes = {
        "QLinearConv": 1,
        "GlobalAveragePool": 1,
        "QLinearGlobalAveragePool": 1,
        "QuantizeLinear": 1,
        "DequantizeLinear": 1,
    }
    check_op_type_count(self, model_q8_path, **quant_nodes)

    # (direction, index, expected type) triples checked for each node type.
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ],
        "QLinearGlobalAveragePool": [
            ["i", 2, activation_proto_qtype],
            ["i", 4, activation_proto_qtype],
        ],
    }
    check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
def static_quant_test_qdq(self, model_fp32_path, model_int8_path):
    """Statically quantize the model in QDQ format and validate counts and outputs.

    Args:
        model_fp32_path: Path of the float32 input model.
        model_int8_path: Path where the QDQ model is written.
    """
    reader = self.input_feeds(1, {'input': [5, 10]})
    quantize_static(model_fp32_path, model_int8_path, reader, quant_format=QuantFormat.QDQ)
    # Rewind so the correctness check sees the first feed again.
    reader.rewind()

    check_op_type_count(
        self,
        model_int8_path,
        MatMul=2,
        Add=2,
        QuantizeLinear=4,
        DequantizeLinear=8,
    )
    check_model_correctness(self, model_fp32_path, model_int8_path, reader.get_next())
def main():
    """Quantize the model named on the command line, then benchmark both versions."""
    args = get_args()
    source_model = args.input_model
    quantized_model = args.output_model

    calibration_reader = ResNet50DataReader(args.calibrate_dataset)
    quantize_static(source_model, quantized_model, calibration_reader)
    print('Calibrated and quantized model saved.')

    # Benchmark the float model first, then the quantized one.
    for label, model_path in (('fp32', source_model), ('int8', quantized_model)):
        print('benchmarking {} model...'.format(label))
        benchmark(model_path)
def static_quant_test(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options=None,
):
    """Quantize a model in QOperator format and validate node counts, Q/DQ types and outputs.

    Args:
        model_fp32_path: Path of the float32 model to quantize.
        data_reader: Calibration reader; it is rewound before quantization
            and again before the correctness check.
        activation_type: QuantType used for activations.
        weight_type: QuantType used for weights.
        extra_options: Optional dict forwarded to `quantize_static`. Defaults
            to a fresh empty dict per call (a `{}` default would be one
            object shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    activation_is_u8 = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_u8 else TensorProto.INT8
    activation_type_str = "u8" if activation_is_u8 else "s8"
    weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
    model_int8_path = "relu_fp32.quant_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # With uint8 activations no Relu and one Q/DQ pair are expected; with
    # int8 activations one Relu remains and two pairs are expected.
    qdq_count = 1 if activation_is_u8 else 2
    relu_count = 0 if activation_is_u8 else 1
    quant_nodes = {
        "QGemm": 2,
        "QuantizeLinear": qdq_count,
        "DequantizeLinear": qdq_count,
        "Relu": relu_count,
    }
    check_op_type_count(self, model_int8_path, **quant_nodes)

    # (direction, index, expected type) triples checked for each node type.
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ],
        "DequantizeLinear": [["i", 2, activation_proto_qtype]],
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def test_quantize_maxpool(self):
    """Quantize the conv+max-pool model in QOperator and QDQ modes and validate both."""
    np.random.seed(1)
    fp32_path = 'maxpool_fp32.onnx'
    qop_path = 'maxpool_uint8.onnx'
    qdq_path = 'maxpool_uint8_qdq.onnx'

    self.construct_model_conv_maxpool(
        fp32_path,
        [1, 2, 26, 42],
        [3, 2, 3, 3],
        [1, 3, 24, 40],
        {'kernel_shape': [3, 3]},
        [1, 3, 22, 38],
    )

    def maxpool_is_not_fed_by_float_conv(node):
        # Fails only for the node named "maxpool_node" when its first input
        # is still 'conv_output', i.e. when MaxPool kept the float tensor.
        return not (node.name == "maxpool_node" and node.input[0] == 'conv_output')

    # QOperator mode (no quant_format passed).
    reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
    quantize_static(fp32_path, qop_path, reader)
    check_op_nodes(self, qop_path, maxpool_is_not_fed_by_float_conv)
    check_op_type_count(self, qop_path, QLinearConv=1, QuantizeLinear=1, DequantizeLinear=2, MaxPool=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qop_path, reader.get_next())

    # QDQ mode.
    reader.rewind()
    quantize_static(fp32_path, qdq_path, reader, quant_format=QuantFormat.QDQ)
    check_op_type_count(self, qdq_path, Conv=1, QuantizeLinear=2, DequantizeLinear=3, MaxPool=1)
    reader.rewind()
    check_model_correctness(self, fp32_path, qdq_path, reader.get_next())
def static_quant_test_qdq(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options=None,
):
    """Quantize a model in QDQ format and validate node counts, Q/DQ types and outputs.

    Args:
        model_fp32_path: Path of the float32 model to quantize.
        data_reader: Calibration reader; it is rewound before quantization
            and again before the correctness check.
        activation_type: QuantType used for activations.
        weight_type: QuantType used for weights.
        extra_options: Optional dict forwarded to `quantize_static`. Defaults
            to a fresh empty dict per call (a `{}` default would be one
            object shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    activation_is_u8 = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_u8 else TensorProto.INT8
    activation_type_str = "u8" if activation_is_u8 else "s8"
    weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
    model_int8_path = "gemm_fp32.quant_dqd_{}{}.onnx".format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_int8_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # With int8 activations one Clip plus one extra Q and one extra DQ node
    # are expected compared with uint8 activations.
    clip_count = 0 if activation_is_u8 else 1
    q_count = 3 if activation_is_u8 else 4
    dq_count = 7 if activation_is_u8 else 8
    quant_nodes = {
        "Gemm": 2,
        "QuantizeLinear": q_count,
        "DequantizeLinear": dq_count,
        "Clip": clip_count,
    }
    check_op_type_count(self, model_int8_path, **quant_nodes)

    # Input index 2 and output index 0 of every QuantizeLinear must carry the
    # activation quantized type.
    qnode_io_qtypes = {
        "QuantizeLinear": [
            ["i", 2, activation_proto_qtype],
            ["o", 0, activation_proto_qtype],
        ]
    }
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
def quantize_gavgpool_test(self, activation_type, weight_type, extra_options=None):
    """Quantize the global-average-pool model (QOperator) and validate it.

    Args:
        activation_type: QuantType used for activations.
        weight_type: QuantType used for weights.
        extra_options: Optional dict forwarded to `quantize_static`. Defaults
            to a fresh empty dict per call (a `{}` default would be one
            object shared by every call).
    """
    extra_options = {} if extra_options is None else extra_options

    np.random.seed(1)
    model_fp32_path = 'gavg_pool_fp32.onnx'
    data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
    self.construct_model_gavgpool(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1])

    activation_is_u8 = activation_type == QuantType.QUInt8
    activation_proto_qtype = TensorProto.UINT8 if activation_is_u8 else TensorProto.INT8
    activation_type_str = 'u8' if activation_is_u8 else 's8'
    weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
    model_q8_path = 'gavg_pool_{}{}.onnx'.format(activation_type_str, weight_type_str)

    data_reader.rewind()
    quantize_static(
        model_fp32_path,
        model_q8_path,
        data_reader,
        quant_format=QuantFormat.QOperator,
        activation_type=activation_type,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    # One GlobalAveragePool is expected to stay unquantized next to one
    # QLinearGlobalAveragePool.
    quant_nodes = {
        'QLinearConv': 1,
        'GlobalAveragePool': 1,
        'QLinearGlobalAveragePool': 1,
        'QuantizeLinear': 1,
        'DequantizeLinear': 1,
    }
    check_op_type_count(self, model_q8_path, **quant_nodes)

    # (direction, index, expected type) triples checked for each node type.
    qnode_io_qtypes = {
        'QuantizeLinear': [
            ['i', 2, activation_proto_qtype],
            ['o', 0, activation_proto_qtype],
        ],
        'QLinearGlobalAveragePool': [
            ['i', 2, activation_proto_qtype],
            ['i', 4, activation_proto_qtype],
        ],
    }
    check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
def test_quantize_relu_conv(self):
    """Run QDQ static quantization on the relu/conv model.

    The test makes no explicit assertions; it passes when quantization
    completes without raising.
    """
    tmp_dir = Path(self._tmp_model_dir.name)
    float_model_path = str(tmp_dir / "float_relu_convs_model.onnx")
    construct_relu_conv_model(float_model_path)

    reader = self.input_feeds(2, {"input": [1, 3, 1, 3]})
    qdq_model_path = str(tmp_dir / "qdq_relu_convs_model.onnx")

    quantization_options = dict(
        quant_format=QuantFormat.QDQ,
        per_channel=False,
        reduce_range=False,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
    )
    quantize_static(float_model_path, qdq_model_path, reader, **quantization_options)
def test_activation_only(self):
    """Check that a Clip/Relu-only model comes out of quantization without Q/DQ nodes."""
    tmp_dir = Path(self._tmp_model_dir.name)
    float_model_path = str(tmp_dir / "float_relu_convs_model.onnx")
    self.construct_model_clip_relu(float_model_path, [1, 3, 1, 3], [1, 3, 1, 3])

    reader = self.input_feeds(2, {"input": [1, 3, 1, 3]})
    quantized_model_path = str(tmp_dir / "qdq_relu_convs_model.onnx")
    quantize_static(float_model_path, quantized_model_path, reader)

    # Both activation nodes must survive and no QuantizeLinear or
    # DequantizeLinear node may be inserted.
    check_op_type_count(
        self,
        quantized_model_path,
        Clip=1,
        Relu=1,
        QuantizeLinear=0,
        DequantizeLinear=0,
    )
def main():
    """Quantize the model named on the command line (int8 weights), then benchmark both versions."""
    args = get_args()
    source_model = args.input_model
    quantized_model = args.output_model

    calibration_reader = ResNet50DataReader(args.calibrate_dataset)
    # Format and per-channel mode come from the command line; the weight
    # type is fixed to QInt8.
    quantize_static(
        source_model,
        quantized_model,
        calibration_reader,
        quant_format=args.quant_format,
        per_channel=args.per_channel,
        weight_type=QuantType.QInt8,
    )
    print('Calibrated and quantized model saved.')

    # Benchmark the float model first, then the quantized one.
    for label, model_path in (('fp32', source_model), ('int8', quantized_model)):
        print('benchmarking {} model...'.format(label))
        benchmark(model_path)
def quantize_and_save_model(name, input, model, act_type="uint8", wt_type="uint8", per_channel=False):
    """Export `model` to ONNX, quantize it, run it once and save the input/output arrays.

    The quantized model is written to ``models/<name>.onnx``. A random input
    and the resulting first output are saved under ``data/input_<name>`` and
    ``data/output_<name>`` via ``np.save``.

    Args:
        name: Base name for the quantized model and the saved data files.
        input: Example input passed to ``torch.onnx.export`` (the parameter
            name shadows the builtin but is kept for caller compatibility).
        model: Model to export; ``model.eval()`` is called on it.
        act_type: ``"uint8"`` or ``"int8"`` activation quantization type.
        wt_type: ``"uint8"`` or ``"int8"`` weight quantization type.
        per_channel: Forwarded to ``quantize_static``.

    Raises:
        KeyError: If `act_type` or `wt_type` is not ``"uint8"`` or ``"int8"``.
    """
    float_model_path = os.path.join("models", "dummy.onnx")
    quantized_model_path = os.path.join("models", name + ".onnx")
    type_dict = {"uint8": QuantType.QUInt8, "int8": QuantType.QInt8}

    model.eval()
    torch.onnx.export(model, input, float_model_path, export_params=True, opset_version=12)

    dr = DataReader(float_model_path)
    quantize_static(
        float_model_path,
        quantized_model_path,
        dr,
        per_channel=per_channel,
        activation_type=type_dict[act_type],
        weight_type=type_dict[wt_type],
    )

    # The exported float model was written above, so its removal stays strict.
    os.remove(float_model_path)
    # These by-products are not created by this function. If the quantizer
    # did not leave them behind, a failed cleanup must not abort the run
    # before the quantized model is exercised and the data files are saved.
    for byproduct in (os.path.join("models", "dummy-opt.onnx"), "augmented_model.onnx"):
        try:
            os.remove(byproduct)
        except FileNotFoundError:
            pass

    sess = rt.InferenceSession(quantized_model_path, None)
    model_input = sess.get_inputs()[0]
    # A separate local is used so the `input` parameter is not rebound.
    sample = np.random.uniform(-1, 1, model_input.shape).astype("float32")
    output = sess.run([sess.get_outputs()[0].name], {model_input.name: sample})[0]

    print(name + " input has sizes", sample.shape)
    input_files = os.path.join("data", "input_" + name)
    np.save(input_files, sample.data)

    print(name + " output has sizes", output.shape)
    output_files = os.path.join("data", "output_" + name)
    np.save(output_files, np.ascontiguousarray(output.data))
def test_save_as_external(self):
    """Quantize with and without external data format and check outputs each time."""
    reader = InputFeedsNegOneZeroOne(10, {"input": [1, self._channel_size, 1, 3]})
    tmp_dir = Path(self._tmp_model_dir.name)

    for external in (True, False):
        quant_model_path = str(tmp_dir / f"quant.{external}.onnx")
        quantize_static(
            self._model_fp32_path,
            quant_model_path,
            reader,
            activation_type=QuantType.QUInt8,
            weight_type=QuantType.QUInt8,
            use_external_data_format=external,
        )

        reader.rewind()
        check_model_correctness(self, self._model_fp32_path, quant_model_path, reader.get_next())
        # Rewind again so the next iteration calibrates from the first feed.
        reader.rewind()