def test_fake_quant_quant_per_channel_other_prec(self):
    kernel_size = 3

    quant_desc_input = QuantDescriptor(num_bits=4)
    quant_desc_weight = QuantDescriptor(num_bits=3, axis=(0))

    quant_conv_object = quant_conv.QuantConv3d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=False,
        quant_desc_input=quant_desc_input,
        quant_desc_weight=quant_desc_weight)
    test_input = torch.randn(16, _NUM_IN_CHANNELS, 8, 8, 8)

    test_input_quantizer = TensorQuantizer(quant_desc_input)
    weight_quantizer = TensorQuantizer(quant_desc_weight)

    quant_input = test_input_quantizer(test_input)

    weight_copy = quant_conv_object.weight.clone()
    quant_weight = weight_quantizer(weight_copy)

    out1 = F.conv3d(quant_input, quant_weight)
    out2 = quant_conv_object(test_input)
    np.testing.assert_array_equal(out1.detach().cpu().numpy(), out2.detach().cpu().numpy())
def test_fake_quant_per_channel_other_precs(self):
    """Test some precisions other than 8bit."""
    size_in = 255
    size_out = 257

    quant_desc_input = tensor_quant.QuantDescriptor(num_bits=4)
    quant_desc_weight = tensor_quant.QuantDescriptor(num_bits=3)

    quant_linear_object = quant_linear.QuantLinear(
        size_in,
        size_out,
        bias=False,
        quant_desc_input=quant_desc_input,
        quant_desc_weight=quant_desc_weight)
    weight_quantizer = TensorQuantizer(quant_desc_weight)
    test_input_quantizer = TensorQuantizer(quant_desc_input)

    test_input = torch.randn(32, size_in)

    weight_copy = quant_linear_object.weight.clone()
    quant_input = test_input_quantizer(test_input)
    quant_weight = weight_quantizer(weight_copy)

    out1 = F.linear(quant_input, quant_weight)
    out2 = quant_linear_object(test_input)
    np.testing.assert_array_equal(out1.detach().cpu().numpy(), out2.detach().cpu().numpy())
def __init__(self, config):
    super().__init__()
    if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
            config, "embedding_size"):
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (config.hidden_size, config.num_attention_heads))
    self.output_attentions = config.output_attentions

    self.num_attention_heads = config.num_attention_heads
    self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    # Quantized implementations of torch.nn.Linear modules
    self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
    self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
    self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    # Additional quantizers needed to quantize the inputs to the torch.matmul() operations in the
    # forward method. Since matmul is a simple operation and no quantized version of it exists,
    # its inputs can be quantized manually to realize a quantized mat-mul operation.
    self.matmul_q_input_quantizer = TensorQuantizer(
        quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_k_input_quantizer = TensorQuantizer(
        quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_v_input_quantizer = TensorQuantizer(
        quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_a_input_quantizer = TensorQuantizer(
        quant_nn.QuantLinear.default_quant_desc_input)
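# A minimal sketch of how the matmul input quantizers defined above would be used in the
# forward method (assuming the standard BERT self-attention flow; names such as query_layer,
# key_layer, value_layer and attention_probs are illustrative and not defined in this file):
#
#   attention_scores = torch.matmul(
#       self.matmul_q_input_quantizer(query_layer),
#       self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)))
#   ...
#   context_layer = torch.matmul(
#       self.matmul_a_input_quantizer(attention_probs),
#       self.matmul_v_input_quantizer(value_layer))
#
# Wrapping each matmul operand in its own TensorQuantizer is what makes the otherwise
# unquantized torch.matmul() behave as a fake-quantized mat-mul during calibration and QAT.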
def test_print_tensor_quantizer(self):
    test_quantizer = TensorQuantizer()
    print(test_quantizer)