def create_int8_bias_tensor_fill(tensor, out_blob_name, x_q_param, w_q_param):
    """
    Similar to create_int8_given_tensor_fill, but for bias blobs to be stored
    as int32.
    """
    # Bias uses the product of the activation and weight scales, with zero_point 0.
    scale = x_q_param.scale * w_q_param.scale
    quantized_tensor = np.around(tensor / scale).astype(np.int32)
    op = core.CreateOperator("Int8GivenIntTensorFill", [], out_blob_name)
    op.arg.extend([
        # Flatten the values for serialization; keep the original shape argument.
        # (The original bare `quantized_tensor.reshape(-1)` was a no-op, since
        # reshape returns a new array instead of modifying in place.)
        utils.MakeArgument("values", quantized_tensor.reshape(-1)),
        utils.MakeArgument("shape", quantized_tensor.shape),
    ])
    q_param = hardcode_scale_zp.QuantizationParam(scale, 0)
    add_quantization_param_args_(op, q_param)
    return op
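
# Hedged illustrative sketch, not part of the original module: it only
# demonstrates the bias-quantization arithmetic used above (int32 values at
# scale = x_scale * w_scale, zero_point 0) with plain numpy. The bias values
# and the x_scale/w_scale numbers are hypothetical examples.
def _example_bias_quantization():
    bias = np.array([0.05, -0.12, 0.0], dtype=np.float32)
    x_scale, w_scale = 0.02, 0.005  # hypothetical activation and weight scales
    bias_scale = x_scale * w_scale  # same rule as create_int8_bias_tensor_fill
    bias_int32 = np.around(bias / bias_scale).astype(np.int32)
    # Dequantizing recovers the original values up to rounding error.
    assert np.allclose(bias_int32 * bias_scale, bias, atol=bias_scale / 2)
    return bias_int32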
def choose_quantization_params(tensor_min, tensor_max, preserve_sparsity=False):
    if tensor_min < 0 and tensor_max > 0 and preserve_sparsity:
        # Make the range symmetric around 0 so that real 0.0 maps exactly to
        # the uint8 zero_point (128) and zeros stay zero after quantization.
        symmetric_qmin = -(255 // 2 + 1)
        symmetric_qmax = 255 // 2
        max_scale = max(
            abs(tensor_min / symmetric_qmin), abs(tensor_max / symmetric_qmax)
        )
        tensor_min = max_scale * symmetric_qmin
        tensor_max = max_scale * symmetric_qmax

    q_param = hardcode_scale_zp.choose_quantization_params(tensor_min, tensor_max)

    if tensor_min < 0 and tensor_max > 0 and preserve_sparsity:
        q_param = hardcode_scale_zp.QuantizationParam(q_param.scale, 128)

    return q_param
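
# Hedged illustrative sketch, not part of the original module: it shows why the
# symmetric-range adjustment above preserves sparsity. With a symmetric range,
# real 0.0 lands exactly on the uint8 zero_point (128), and the quantized
# values still fit in [0, 255]. The tensor_min/tensor_max numbers are
# hypothetical examples.
def _example_symmetric_range():
    tensor_min, tensor_max = -0.3, 0.5
    symmetric_qmin, symmetric_qmax = -(255 // 2 + 1), 255 // 2  # -128, 127
    scale = max(abs(tensor_min / symmetric_qmin), abs(tensor_max / symmetric_qmax))
    zero_point = 128
    # Real 0.0 quantizes exactly to the zero_point, so zeros stay zero.
    assert round(0.0 / scale) + zero_point == 128
    # The observed range still maps inside the uint8 range.
    assert 0 <= round(tensor_min / scale) + zero_point
    assert round(tensor_max / scale) + zero_point <= 255
    return scale, zero_point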
def add_quantization_param_args(op, tensor, preserve_sparsity=False):
    # Guard against empty tensors, derive scale/zero_point from the observed
    # range (reusing choose_quantization_params above), and record the result
    # on the op as quantization-parameter arguments.
    tensor_min = 0 if tensor.size == 0 else tensor.min()
    tensor_max = 0 if tensor.size == 0 else tensor.max()

    q_param = choose_quantization_params(tensor_min, tensor_max, preserve_sparsity)

    add_quantization_param_args_(op, q_param)
    return q_param
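
# Hedged usage sketch, not part of the original module: how a caller might
# attach quantization parameters derived from an observed activation tensor to
# an int8 operator. The operator type and blob names here are illustrative
# examples, not prescribed by this module.
def _example_add_quantization_param_args():
    activation = np.random.uniform(-1.0, 1.0, size=(8,)).astype(np.float32)
    op = core.CreateOperator("Int8Relu", ["X_q"], ["Y_q"])
    # Derive scale/zero_point from the observed range, forcing a symmetric
    # range so that zeros stay exact, and record them on the op.
    q_param = add_quantization_param_args(op, activation, preserve_sparsity=True)
    return op, q_param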