def test_qtensor_view(self):
    scale, zero_point, dtype = 1.0, 2, torch.uint8
    for device in get_supported_device_types():
        q_int = torch.randint(0, 100, [1, 2, 3], device=device, dtype=dtype)
        q = torch._make_per_tensor_quantized_tensor(q_int, scale=scale, zero_point=zero_point)
        q2 = q.view(1, 3, 2)
        self.assertEqual(q.numel(), q2.numel())
        # testing -1
        self.assertEqual(q, q2.view(1, -1, 3))

        a_int = torch.randint(0, 100, [1, 2, 3, 4], device=device, dtype=dtype)
        a = torch._make_per_tensor_quantized_tensor(a_int, scale=scale, zero_point=zero_point)
        b = a.transpose(1, 2)  # swaps 2nd and 3rd dimension
        c = a.view(1, 3, 2, 4)  # does not change tensor layout in memory
        self.assertEqual(b.size(), c.size())
        self.assertEqual(b.q_scale(), c.q_scale())
        self.assertEqual(b.q_zero_point(), c.q_zero_point())
        self.assertNotEqual(b.stride(), c.stride())
        # size is the same but the underlying data is different
        self.assertNotEqual(b.int_repr(), c.int_repr())
        # torch.equal is not supported for the cuda backend
        if device == 'cpu':
            self.assertFalse(torch.equal(b, c))
        else:
            self.assertRaises(RuntimeError, lambda: torch.equal(b, c))

        # a case where view can't be applied to a non-contiguous Tensor
        a_int = torch.randint(0, 100, [1, 2, 3, 4], device=device, dtype=dtype)
        a = torch._make_per_tensor_quantized_tensor(a_int, scale=scale, zero_point=zero_point)
        b = a.transpose(1, 2)  # swaps 2nd and 3rd dimension
        err_str = "view size is not compatible with input tensor's size and stride*"
        with self.assertRaisesRegex(RuntimeError, err_str):
            b.view(1, 4, 2, 3)
        # view on contiguous tensor is fine
        b.contiguous().view(1, 4, 2, 3)
def test_qtensor_resize(self):
    scale, zero_point, dtype = 1.0, 2, torch.uint8
    sizes1 = [1, 2, 3, 4]
    sizes2 = [1 * 2, 3 * 4]
    sizes3 = [1, 2 * 3, 4]
    sizes4 = [1 * 2 * 3 * 4]
    sizes5 = [1, 2, 1, 3, 1, 4]

    q1_int = torch.randint(0, 100, sizes1, dtype=dtype)
    q1 = torch._make_per_tensor_quantized_tensor(q1_int, scale=scale, zero_point=zero_point)
    q2 = q1.resize(*sizes2)
    q3 = q2.resize(*sizes3)
    q4 = q3.resize(*sizes4)
    q5 = q4.resize(*sizes5)

    self.assertEqual(q1.numel(), q2.numel())
    self.assertEqual(q1.numel(), q3.numel())
    self.assertEqual(q1.numel(), q4.numel())
    self.assertEqual(q1.numel(), q5.numel())

    # Compare original and post-transpose
    a_int = torch.randint(0, 100, sizes1, dtype=dtype)
    a = torch._make_per_tensor_quantized_tensor(a_int, scale=scale, zero_point=zero_point)
    b = a.transpose(1, 2)  # swaps 2nd and 3rd dimension
    c = b.resize(*sizes1)  # Change the sizes back to the original

    self.assertEqual(a.size(), c.size())
    self.assertEqual(b.q_scale(), c.q_scale())
    self.assertEqual(b.q_zero_point(), c.q_zero_point())
    self.assertNotEqual(b.stride(), c.stride())
    # size is the same but the underlying data is different
    self.assertNotEqual(b.int_repr(), c.int_repr())
    self.assertFalse(torch.equal(b, c))

    # Throws an error if numel is wrong
    q1_int = torch.randint(0, 100, sizes1, dtype=dtype)
    q1 = torch._make_per_tensor_quantized_tensor(q1_int, scale=scale, zero_point=zero_point)
    err_str = "requested resize to*"
    with self.assertRaisesRegex(RuntimeError, err_str):
        q2 = q1.resize(*sizes1[:-1])
    # resize on both contiguous and non-contiguous tensor should be fine
    q3 = q1.resize(*sizes2)
    q4 = q1.contiguous().resize(*sizes2)
def test_qtensor_creation(self):
    scale = 0.5
    zero_point = 10
    val = 100
    numel = 10
    q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                      dtype=torch.quint8)
    self.assertEqual(scale, q.q_scale())
    self.assertEqual(zero_point, q.q_zero_point())

    # create Tensor from uint8_t Tensor, scale and zero_point
    int_tensor = torch.randint(0, 100, size=(10,), dtype=torch.uint8)
    q = torch._make_per_tensor_quantized_tensor(int_tensor, scale, zero_point)
    self.assertEqual(int_tensor, q.int_repr())
    self.assertEqual(scale, q.q_scale())
    self.assertEqual(zero_point, q.q_zero_point())

    # create via empty_like
    q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                      dtype=torch.quint8)
    q_el = torch.empty_like(q)
    self.assertEqual(q.q_scale(), q_el.q_scale())
    self.assertEqual(q.q_zero_point(), q_el.q_zero_point())
    self.assertEqual(q.dtype, q_el.dtype)

    # create via empty_like but change the dtype (currently not supported)
    with self.assertRaises(RuntimeError):
        torch.empty_like(q, dtype=torch.qint8)
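# A minimal usage sketch, not part of the test above, assuming a recent PyTorch
# build: torch.quantize_per_tensor is the public counterpart of the internal
# _empty_affine_quantized / _make_per_tensor_quantized_tensor helpers exercised
# here. It quantizes a float tensor directly and exposes the same qparams.
import torch

x = torch.randn(10)
q_demo = torch.quantize_per_tensor(x, scale=0.5, zero_point=10, dtype=torch.quint8)
assert q_demo.q_scale() == 0.5 and q_demo.q_zero_point() == 10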
def test_qtensor_copy(self):
    scale = 0.5
    zero_point = 10
    val = 100
    numel = 10
    # copy from same scale and zero_point
    q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                      dtype=torch.quint8)
    q2 = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                       dtype=torch.quint8)
    q.copy_(q2)
    self.assertEqual(q.int_repr(), q2.int_repr())
    self.assertEqual(q.q_scale(), q2.q_scale())
    self.assertEqual(q.q_zero_point(), q2.q_zero_point())

    # copying from different scale and zero_point
    scale = 3.2
    zero_point = 5
    q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                      dtype=torch.quint8)
    # check that the original scale and zero_point are set correctly
    self.assertEqual(q.q_scale(), scale)
    self.assertEqual(q.q_zero_point(), zero_point)
    q.copy_(q2)
    # check that scale and zero_point have been copied
    self.assertEqual(q, q2)

    # deep copy
    scale, zero_point, dtype = 1.0, 2, torch.uint8
    q_int = torch.randint(0, 100, [3, 5], dtype=dtype)
    scale, zero_point = 2.0, 3
    q = torch._make_per_tensor_quantized_tensor(q_int, scale=scale, zero_point=zero_point)
    qc = deepcopy(q)
    self.assertEqual(qc, q)

    # can't copy from quantized tensor to non-quantized tensor
    r = torch.empty([numel], dtype=torch.float)
    q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point,
                                      dtype=torch.quint8)
    with self.assertRaisesRegex(RuntimeError, "please use dequantize"):
        r.copy_(q)
def test_qtensor_dequantize_per_tensor(self):
    t = torch.arange(-10, 10, dtype=torch.int8)
    scale = 3
    zero_point = 2
    qt = torch._dequantize_per_tensor(t, scale, zero_point, torch.qint8)
    qt2 = torch._make_per_tensor_quantized_tensor(t, scale, zero_point)
    self.assertEqual(qt, qt2.dequantize())
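# A minimal sketch (assuming a recent PyTorch build) of the affine mapping the
# test above exercises: dequantize() recovers
# real_value = (stored_int - zero_point) * scale.
import torch

t_demo = torch.arange(-10, 10, dtype=torch.int8)
q_demo = torch._make_per_tensor_quantized_tensor(t_demo, 3.0, 2)
manual = (t_demo.to(torch.float) - 2) * 3.0
assert torch.allclose(q_demo.dequantize(), manual)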
def test_torch_qtensor_deepcopy(self):
    # cuda is not supported yet
    device = "cpu"
    q_int = torch.randint(0, 100, [3, 5], device=device, dtype=torch.uint8)
    scale, zero_point = 2.0, 3
    q = torch._make_per_tensor_quantized_tensor(q_int, scale=scale, zero_point=zero_point)
    qc = deepcopy(q)
    self.assertEqual(qc, q)
def test_qtensor_reshape(self):
    scale, zero_point, dtype = 1.0, 2, torch.uint8
    for device in get_supported_device_types():
        q_int = torch.randint(0, 100, [3, 5], dtype=dtype, device=device)
        q = torch._make_per_tensor_quantized_tensor(q_int, scale=scale, zero_point=zero_point)
        q2 = q.reshape([15])
        self.assertEqual(q.numel(), q2.numel())
        self.assertEqual(q2.size(), [15])
        # testing -1
        self.assertEqual(q, q2.reshape([3, -1]))

        a_int = torch.randint(0, 100, [1, 2, 3, 4], dtype=dtype, device=device)
        a = torch._make_per_tensor_quantized_tensor(a_int, scale=scale, zero_point=zero_point)
        b = a.transpose(1, 2)  # swaps 2nd and 3rd dimension
        c = a.reshape(1, 3, 2, 4)  # does not change tensor layout
        self.assertEqual(b.size(), c.size())
        self.assertEqual(b.q_scale(), c.q_scale())
        self.assertEqual(b.q_zero_point(), c.q_zero_point())
        self.assertNotEqual(b.stride(), c.stride())
        self.assertNotEqual(b.int_repr(), c.int_repr())
        # torch.equal is not supported for the cuda backend
        if device == 'cpu':
            self.assertFalse(torch.equal(b, c))
        else:
            self.assertRaises(RuntimeError, lambda: torch.equal(b, c))

        # we can use reshape for non-contiguous Tensor
        a_int = torch.randint(0, 100, [1, 2, 3, 4], dtype=dtype, device=device)
        a = torch._make_per_tensor_quantized_tensor(a_int, scale=scale, zero_point=zero_point)
        b = a.transpose(1, 2)  # swaps 2nd and 3rd dimension
        c = b.reshape(1, 4, 2, 3)
def test_cuda_quantization_does_not_pin_memory(self):
    # Context - https://github.com/pytorch/pytorch/issues/41115
    x = torch.randn(3)
    self.assertEqual(x.is_pinned(), False)
    q_int = torch.randint(0, 100, [1, 2, 3], device="cuda", dtype=torch.uint8)
    q = torch._make_per_tensor_quantized_tensor(q_int, scale=0.1, zero_point=0)
    x = torch.randn(3)
    self.assertEqual(x.is_pinned(), False)
def quantized_tensor_to_pytorch(tensor: torch.Tensor, scale, zp, num_bits, mode, dest_dtype,
                                per_channel=False, channel_dim=0):
    """
    Convert a tensor quantized with quantization parameters calculated by CACP to a PyTorch
    "native" quantized tensor.

    We refer to quantization parameters calculated using either of:
      * quantization.symmetric_linear_quantization_params
      * quantization.asymmetric_linear_quantization_params

    And to tensors quantized using either of:
      * quantization.linear_quantize
      * quantization.linear_quantize_clamp

    Args:
        tensor (torch.Tensor): The tensor quantized in CACP
        scale (torch.Tensor): Scale factor calculated by CACP
        zp (torch.Tensor): Zero point calculated by CACP
        num_bits (int): Number of bits used for quantization in CACP
        mode (quantization.LinearQuantMode): The quantization mode used in CACP
        dest_dtype (torch.dtype): PyTorch quantized dtype to convert to. Must be one of:
          torch.quint8, torch.qint8
        per_channel (bool): Flag indicating whether the tensor was quantized per-channel
        channel_dim (int): If per_channel is set, this indicates the dimension of the channel
          in the tensor

    Returns:
        PyTorch quantized tensor (dtype one of torch.quint8 / torch.qint8 / torch.qint32)
    """
    assert (tensor == tensor.int()).all(), 'Tensor does not appear to be quantized'
    converted_scale, converted_zp = qparams_to_pytorch(scale, zp, num_bits, mode, dest_dtype,
                                                       reduce_range=False)
    zp_diff = -converted_zp.view(zp.shape) - zp

    if dest_dtype == torch.quint8:
        temp_dtype = torch.uint8
    elif dest_dtype == torch.qint8:
        temp_dtype = torch.int8
    else:  # dest_dtype == torch.qint32
        temp_dtype = torch.int32
    tensor = (tensor - zp_diff).to(temp_dtype)

    if per_channel and scale.shape[channel_dim] > 1:
        return torch._make_per_channel_quantized_tensor(tensor, converted_scale, converted_zp,
                                                        channel_dim)
    return torch._make_per_tensor_quantized_tensor(tensor, converted_scale, converted_zp)
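# Hypothetical usage sketch, not taken from the CACP sources: `cacp_int_tensor`,
# `cacp_scale` and `cacp_zp` stand in for outputs of CACP's linear_quantize
# pipeline, and LinearQuantMode.ASYMMETRIC_UNSIGNED is assumed to be a member of
# its quantization-mode enum.
#
# q = quantized_tensor_to_pytorch(cacp_int_tensor, cacp_scale, cacp_zp,
#                                 num_bits=8,
#                                 mode=LinearQuantMode.ASYMMETRIC_UNSIGNED,
#                                 dest_dtype=torch.quint8)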
def protobuf_tensor_deserializer(worker: AbstractWorker,
                                 protobuf_tensor: TensorDataPB) -> torch.Tensor:
    """Strategy to deserialize a binary input using Protobuf"""
    size = tuple(protobuf_tensor.shape.dims)
    data = getattr(protobuf_tensor, "contents_" + protobuf_tensor.dtype)
    if protobuf_tensor.is_quantized:
        # Drop the 'q' from the beginning of the quantized dtype to get the int type
        dtype = TORCH_STR_DTYPE[protobuf_tensor.dtype[1:]]
        int_tensor = torch.tensor(data, dtype=dtype).reshape(size)
        # Automatically converts int types to quantized types
        return torch._make_per_tensor_quantized_tensor(
            int_tensor, protobuf_tensor.scale, protobuf_tensor.zero_point)
    else:
        dtype = TORCH_STR_DTYPE[protobuf_tensor.dtype]
        return torch.tensor(data, dtype=dtype).reshape(size)
def _clamp_weights(qweight, observer, scale, zp):
    if not _needs_weight_clamping(observer, qweight.dtype):
        return qweight

    observer = _get_weight_observer(observer)
    min_, max_ = observer.quant_min, observer.quant_max

    # Doing this because we can't use torch.ops.quantized.clamp() with per_channel qscheme yet.
    qw_int_max = torch.clone(qweight.int_repr()).fill_(max_)
    qw_int_min = torch.clone(qweight.int_repr()).fill_(min_)
    qw_int = torch.minimum(torch.maximum(qweight.int_repr(), qw_int_min), qw_int_max)

    if observer.qscheme in [torch.per_tensor_symmetric,
                            torch.per_tensor_affine]:
        qweight = torch._make_per_tensor_quantized_tensor(qw_int, scale.item(), zp.item())
    elif observer.qscheme in [torch.per_channel_symmetric,
                              torch.per_channel_affine,
                              torch.per_channel_affine_float_qparams]:
        qweight = torch._make_per_channel_quantized_tensor(qw_int, scale, zp,
                                                           axis=observer.ch_axis)
    else:
        raise ValueError(f"Unexpected qscheme {observer.qscheme}")
    return qweight
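# A minimal sketch of the per-tensor branch above (assuming a recent PyTorch
# build), simplified to Tensor.clamp with scalar bounds: the clamp happens on the
# integer representation, then the original qparams are reattached. The observer
# range here is hypothetical.
import torch

qw = torch.quantize_per_tensor(torch.randn(3, 3), scale=0.1, zero_point=0,
                               dtype=torch.qint8)
quant_min, quant_max = -64, 63  # hypothetical restricted observer range
clamped = torch._make_per_tensor_quantized_tensor(
    qw.int_repr().clamp(quant_min, quant_max), qw.q_scale(), qw.q_zero_point())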
def deserialize_tensor(protobuf_tensor: TorchTensor_PB) -> th.Tensor:
    """
    This method converts a Protobuf torch tensor back into a Torch tensor.

    Args:
        protobuf_tensor (bin): Protobuf message of torch tensor.

    Returns:
        tensor (th.Tensor): a torch tensor converted from Protobuf
    """
    tensor_id = get_protobuf_id(protobuf_tensor.id)
    tags = protobuf_tensor.tags
    description = protobuf_tensor.description

    contents_type = protobuf_tensor.WhichOneof("contents")
    tensor_data_pb = getattr(protobuf_tensor, contents_type)
    size = tuple(tensor_data_pb.shape.dims)
    data = getattr(tensor_data_pb, "contents_" + tensor_data_pb.dtype)

    if tensor_data_pb.is_quantized:
        # Drop the 'q' from the beginning of the quantized dtype to get the int type
        dtype = TORCH_STR_DTYPE[tensor_data_pb.dtype[1:]]
        int_tensor = th.tensor(data, dtype=dtype).reshape(size)
        # Automatically converts int types to quantized types
        tensor = th._make_per_tensor_quantized_tensor(
            int_tensor, tensor_data_pb.scale, tensor_data_pb.zero_point)
    else:
        dtype = TORCH_STR_DTYPE[tensor_data_pb.dtype]
        tensor = th.tensor(data, dtype=dtype).reshape(size)

    tensor.id = tensor_id
    tensor.tags = set(tags)
    tensor.description = description

    return tensor
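# Round-trip sketch mirroring the quantized branch above (assuming a recent
# PyTorch build): a quantized tensor travels as its int_repr() plus
# (scale, zero_point) and is rebuilt bit-exactly with
# _make_per_tensor_quantized_tensor.
import torch as th

q_src = th.quantize_per_tensor(th.randn(2, 3), scale=0.1, zero_point=10,
                               dtype=th.quint8)
rebuilt = th._make_per_tensor_quantized_tensor(
    q_src.int_repr(), q_src.q_scale(), q_src.q_zero_point())
assert th.equal(q_src, rebuilt)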
def add_qconv2d(self, node, fuse_code):
    assert node.inputsSize() == 4
    assert node.outputsSize() == 1

    (
        jit_image,
        jit_packed_weight,
        jit_scale,
        jit_zero_point,
    ) = node.inputs()

    _, out_scale = self.get_constant_value(jit_scale, "FloatType")
    _, out_zero_point = self.get_constant_value(jit_zero_point, "IntType")
    weight_ctype, packed_weight = self.get_constant_value(jit_packed_weight)
    assert weight_ctype.name() == "Conv2dPackedParamsBase"
    (
        pack_version,
        tensors,
        opt_tensors,
    ) = packed_weight.__getstate__()[0]
    assert pack_version == "2"
    packed_config, raw_weight = tensors
    raw_bias, = opt_tensors
    assert raw_bias is not None
    args = self.get_conv_pool_args_2d_from_pack(raw_weight.shape[2:4], packed_config)

    assert raw_weight.qscheme() == torch.per_tensor_affine
    if raw_weight.dtype == torch.quint8:
        unsigned_weight = raw_weight
    else:
        assert raw_weight.dtype == torch.qint8
        unsigned_weight = torch._make_per_tensor_quantized_tensor(
            (raw_weight.int_repr().int() + 128).to(torch.uint8),
            scale=raw_weight.q_scale(),
            zero_point=raw_weight.q_zero_point() + 128)
    weight_scale = unsigned_weight.q_scale()
    _, image_oper = self.get_tensor_operand_by_jitval(jit_image)
    bias_scale = image_oper.scale * weight_scale
    int_bias = torch.quantize_per_tensor(raw_bias, bias_scale, 0, torch.qint32)
    bias_id = self.add_tensor_operand_for_weight(int_bias)

    multiplier = image_oper.scale * weight_scale / out_scale
    assert multiplier > 0
    if multiplier >= 1:
        raise Exception(
            "Quantized convolution multiplier is greater than or equal to 1. "
            "This is supported by NNAPI, but not by most hardware backends. "
            "Try training a model without quantization-aware training.")

    return self.add_conv2d_common(
        node.outputsAt(0),
        out_scale,
        out_zero_point,
        jit_image,
        unsigned_weight,
        bias_id,
        args,
        False,  # transpose
        fuse_code,
    )
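# Standalone sketch of the signed-to-unsigned rebias used above (assuming a
# recent PyTorch build): shifting both the stored integers and the zero point by
# 128 keeps the represented real values (int - zero_point) * scale unchanged.
import torch

w_q8 = torch.quantize_per_tensor(torch.randn(4, 4), scale=0.05, zero_point=0,
                                 dtype=torch.qint8)
w_u8 = torch._make_per_tensor_quantized_tensor(
    (w_q8.int_repr().int() + 128).to(torch.uint8),
    scale=w_q8.q_scale(),
    zero_point=w_q8.q_zero_point() + 128)
assert torch.allclose(w_q8.dequantize(), w_u8.dequantize())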
def add_qlinear(self, node):
    assert node.inputsSize() == 4
    assert node.outputsSize() == 1
    (
        jit_input,
        jit_packed_weight,
        jit_scale,
        jit_zero_point,
    ) = node.inputs()

    input_id, input_oper = self.get_tensor_operand_by_jitval(jit_input)
    # TODO: Support automatic reshape
    assert len(input_oper.shape) == 2

    _, out_scale = self.get_constant_value(jit_scale, "FloatType")
    _, out_zero_point = self.get_constant_value(jit_zero_point, "IntType")
    weight_ctype, packed_weight = self.get_constant_value(jit_packed_weight)
    assert weight_ctype.name() == "LinearPackedParamsBase"
    raw_weight, raw_bias = packed_weight.__getstate__()[0]
    assert raw_bias is not None

    assert len(raw_weight.shape) == 2
    assert len(raw_bias.shape) == 1
    assert raw_bias.shape[0] == raw_weight.shape[0]
    assert raw_weight.shape[1] == input_oper.shape[1]

    assert raw_weight.qscheme() == torch.per_tensor_affine
    if raw_weight.dtype == torch.quint8:
        unsigned_weight = raw_weight
    else:
        assert raw_weight.dtype == torch.qint8
        unsigned_weight = torch._make_per_tensor_quantized_tensor(
            (raw_weight.int_repr().int() + 128).to(torch.uint8),
            scale=raw_weight.q_scale(),
            zero_point=raw_weight.q_zero_point() + 128)
    weight_scale = unsigned_weight.q_scale()
    bias_scale = input_oper.scale * weight_scale
    int_bias = torch.quantize_per_tensor(raw_bias, bias_scale, 0, torch.qint32)
    bias_id = self.add_tensor_operand_for_weight(int_bias)

    multiplier = input_oper.scale * weight_scale / out_scale
    assert multiplier > 0
    if multiplier >= 1:
        raise Exception(
            "Quantized linear multiplier is greater than or equal to 1. "
            "This is supported by NNAPI, but not by most hardware backends. "
            "Try training a model without quantization-aware training.")

    # TODO: Transform at load time to share weights with CPU model.
    nnapi_weight_tensor = unsigned_weight.contiguous()
    weight_id = self.add_tensor_operand_for_weight(nnapi_weight_tensor)
    weight_oper = self.operands[weight_id]

    out_shape = (input_oper.shape[0], weight_oper.shape[0])
    out_oper = input_oper._replace(
        shape=out_shape,
        scale=out_scale,
        zero_point=out_zero_point,
    )

    inputs = [None] * 4
    inputs[0] = input_id
    inputs[1] = weight_id
    inputs[2] = bias_id
    inputs[3] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)

    outputs = [None] * 1
    outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)

    self.add_operation(NNAPI_OperationCode.FULLY_CONNECTED, inputs, outputs)
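# Sketch of the bias handling shared by add_qconv2d and add_qlinear (assuming a
# recent PyTorch build): NNAPI expects an int32 bias with zero point 0 whose
# scale equals input_scale * weight_scale, which is exactly how `int_bias` is
# built above. The scales here are hypothetical.
import torch

input_scale, weight_scale = 0.02, 0.05
bias = torch.randn(8)
int_bias_demo = torch.quantize_per_tensor(bias, input_scale * weight_scale, 0,
                                          torch.qint32)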