def test_compare_qat_qc_quantize_quarter_range_scaled_input(self):
    """
    Compare QAT asymmetric quantization with the QC quantize implementation
    :return:
    """
    quantizer = libpymo.TensorQuantizer(libpymo.QuantizationMode.QUANTIZATION_TF,
                                        libpymo.RoundingMode.ROUND_NEAREST)
    np.random.seed(1)
    random_input = 10 * np.random.normal(size=[1, 3, 224, 224]) - 20

    # 1/4 range min, max (input range scaled by 0.5); zero is always included in the range
    x_min = min(0., 0.5 * float(random_input.min()))
    x_max = max(0., 0.5 * float(random_input.max()))

    # qc quantize
    self.set_quantizer_values(quantizer, x_min, x_max)
    # print(quantizer.encoding.min, quantizer.encoding.max, quantizer.encoding.delta, quantizer.encoding.offset)

    # aimet quantizer
    output_tensor = np.zeros((1, 3, 224, 224)).astype(np.float32)
    quantizer.quantizeDequantize(random_input, output_tensor, x_min, x_max, 8, False)

    # qat asymmetric quantizer output as float32
    x_quant = self.qat_python_asymmetric_quantizer(random_input, 8, x_max, x_min).astype(np.float32)

    # compare qc quantize output and qat asymmetric quantizer output
    self.assertTrue(np.allclose(x_quant, output_tensor))
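
# For reference, a minimal sketch of the asymmetric quantize-dequantize math the test
# above compares against (hypothetical; the actual qat_python_asymmetric_quantizer
# helper lives elsewhere in this class and may differ in its nudging details): map
# [x_min, x_max] onto the integer grid [0, 2^bw - 1], round, saturate, and map back.
def _asymmetric_qdq_sketch(x, bitwidth, x_max, x_min):
    num_steps = 2 ** bitwidth - 1
    delta = (x_max - x_min) / num_steps                           # quantization step size
    offset = np.round(x_min / delta)                              # zero-point in grid steps
    x_int = np.clip(np.round(x / delta) - offset, 0, num_steps)   # quantize + saturate
    return (x_int + offset) * delta                               # dequantize back to float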
def test_sanity(self):
    quantizer = libpymo.TensorQuantizer(libpymo.QuantizationMode.QUANTIZATION_TF_ENHANCED,
                                        libpymo.RoundingMode.ROUND_NEAREST)
    np.random.seed(10)
    random_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
    self.assertFalse(quantizer.isEncodingValid)

    quantizer.updateStats(random_input, False)
    self.assertFalse(quantizer.isEncodingValid)

    encoding = quantizer.computeEncoding(8, False, False, False)
    print(quantizer.encoding.min, quantizer.encoding.max,
          quantizer.encoding.delta, quantizer.encoding.offset)
    self.assertTrue(quantizer.isEncodingValid)
    self.assertEqual(quantizer.quantScheme, libpymo.QuantizationMode.QUANTIZATION_TF_ENHANCED)
    self.assertEqual(quantizer.roundingMode, libpymo.RoundingMode.ROUND_NEAREST)

    input_tensor = np.random.randn(1, 3, 224, 224).astype(np.float32)
    output_tensor = np.zeros((1, 3, 224, 224)).astype(np.float32)
    quantizer.quantizeDequantize(input_tensor, output_tensor, encoding.min, encoding.max, 8, False)

    # Check that the output tensor did get updated
    self.assertFalse(np.all(output_tensor == 0))

    # Check that the quantized tensor is close to the input tensor but not the same
    self.assertTrue(np.allclose(output_tensor, input_tensor, atol=0.2))
    self.assertFalse(np.allclose(output_tensor, input_tensor, atol=0.1))
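
# For reference, the encoding fields printed above are related as follows for a
# bw-bit encoding (a sketch of the usual convention; the exact nudging performed
# inside libpymo may differ slightly):
#   delta  = (max - min) / (2 ** bw - 1)   # step size
#   offset = round(min / delta)            # zero-point in integer steps
# so quantizeDequantize maps x to
#   (clip(round(x / delta) - offset, 0, 2 ** bw - 1) + offset) * delta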
def __setstate__(self, state):
    self.session = None

    # Create the cpp tensor quantizer reference
    self.quant_op_name = state.quant_op_name
    self.quantizer_type = state.quantizer_type
    self.tensor_quantizer = libpymo.TensorQuantizer(state.quant_scheme, state.rounding_mode)
    self.tensor_quantizer.isEncodingValid = state.is_encoding_valid
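
# A matching __getstate__ sketch for the method above (hypothetical; the real class
# may use its own pickleable-state container). The live TF session and the cpp
# TensorQuantizer hold unpicklable handles, so only plain-data fields are captured:
from types import SimpleNamespace

def __getstate__(self):
    state = SimpleNamespace()  # stand-in for the class's actual state object
    state.quant_op_name = self.quant_op_name
    state.quantizer_type = self.quantizer_type
    state.quant_scheme = self.tensor_quantizer.quantScheme
    state.rounding_mode = self.tensor_quantizer.roundingMode
    state.is_encoding_valid = self.tensor_quantizer.isEncodingValid
    return state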
def test_qc_quantize_recurrent_param_op(self):
    """ Test the custom recurrent param quantize op on CPU """
    zero_out_module = tf.load_op_library('libaimet_tf_ops.so')
    graph = tf.Graph()
    config = tf.compat.v1.ConfigProto(log_device_placement=False)
    sess = tf.compat.v1.Session(graph=graph, config=config)
    bitwidth = 8
    use_symm_encoding = True

    with graph.as_default():
        # placeholder for the input
        with tf.device("/device:CPU:0"):
            inp = tf.compat.v1.placeholder(tf.float32, shape=[10], name='input')
            tensor_quantizer = libpymo.TensorQuantizer(libpymo.QuantizationMode.QUANTIZATION_TF,
                                                       libpymo.RoundingMode.ROUND_NEAREST)
            tensor_quantizer_val = libpymo.PtrToInt64(tensor_quantizer)
            tensor_quant_ref = tf.Variable(initial_value=tensor_quantizer_val, trainable=False, dtype=tf.int64)
            time_step_tensor = tf.constant(1, dtype=tf.int32)

            encoding_min = tf.Variable(initial_value=-0.5, trainable=True, dtype=tf.double)
            encoding_max = tf.Variable(initial_value=0.5, trainable=True, dtype=tf.double)
            bit_width = tf.Variable(initial_value=bitwidth, trainable=False, dtype=tf.int8)
            use_symmetric_encoding = tf.Variable(initial_value=use_symm_encoding, trainable=False, dtype=tf.bool)
            mode_var = tf.Variable(initial_value=int(libpymo.TensorQuantizerOpMode.oneShotQuantizeDequantize),
                                   trainable=False, dtype=tf.int32)

            sess.run([mode_var.initializer, tensor_quant_ref.initializer, encoding_min.initializer,
                      encoding_max.initializer, bit_width.initializer, use_symmetric_encoding.initializer])

            pass_through_op_output = zero_out_module.qc_quantize_recurrent_param(
                name='quant_op', in_tensor=inp, op_mode=mode_var,
                tensor_quantizer_reference=tensor_quant_ref,
                encoding_min=encoding_min, encoding_max=encoding_max,
                bit_width=bit_width, use_symmetric_encoding=use_symmetric_encoding,
                time_steps=time_step_tensor)

    inp_tensor = sess.graph.get_tensor_by_name('input:0')
    # inp_data = np.random.rand(10).astype(np.float32)
    np.random.seed(18)
    inp_data = np.random.randint(low=-1, high=2, size=10).astype(np.float32)

    # get the output
    print(inp_data)
    out_data = sess.run(pass_through_op_output, feed_dict={inp_tensor: inp_data})
    print(out_data)

    # Compare the qc_quantize op's output with its input. Encodings set to -0.5 and
    # 0.5 should have no bearing on this quantized output: we should not observe
    # truncation if the op's encoding min/max input values are used instead of
    # cached values.
    self.assertTrue(np.allclose(out_data, inp_data, atol=1e-6))
    sess.close()
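
# Note on the pattern used above (and in the other op tests): the C++ TensorQuantizer
# is handed to the custom TF op by converting its raw pointer with libpymo.PtrToInt64
# and storing it in a non-trainable int64 variable, which the op dereferences on the
# C++ side. The Python-side tensor_quantizer object must therefore outlive any
# session runs that use the op.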
def test_qc_quantize_op_straight_through_gradient_computation(self):
    """ Test to validate the tensorflow quantize op's straight-through estimator gradient computation """
    # importing this module registers the gradient definitions for the quantize op
    from aimet_tensorflow import quantsim_straight_through_grad
    zero_out_module = tf.load_op_library('libaimet_tf_ops.so')
    graph = tf.Graph()
    config = tf.compat.v1.ConfigProto(log_device_placement=False)
    sess = tf.compat.v1.Session(graph=graph, config=config)

    with graph.as_default():
        inp = tf.compat.v1.placeholder(tf.float32, shape=[2, 2], name='input')
        tensor_quantizer = libpymo.TensorQuantizer(libpymo.QuantizationMode.QUANTIZATION_TF_ENHANCED,
                                                   libpymo.RoundingMode.ROUND_NEAREST)
        tensor_quantizer_val = libpymo.PtrToInt64(tensor_quantizer)
        tensor_quant_ref = tf.Variable(initial_value=tensor_quantizer_val, trainable=False, dtype=tf.int64)
        mode_var = tf.Variable(initial_value=int(libpymo.TensorQuantizerOpMode.oneShotQuantizeDequantize),
                               trainable=False, dtype=tf.int32)

        # fix the min, max and bitwidth to be used
        encoding_min = tf.Variable(initial_value=0.0, trainable=True, dtype=tf.double)
        encoding_max = tf.Variable(initial_value=5.0, trainable=True, dtype=tf.double)
        bit_width = tf.Variable(initial_value=8, trainable=False, dtype=tf.int8)
        use_symmetric_encoding = tf.Variable(initial_value=False, trainable=False, dtype=tf.bool)

        sess.run([mode_var.initializer, tensor_quant_ref.initializer, encoding_min.initializer,
                  encoding_max.initializer, bit_width.initializer, use_symmetric_encoding.initializer])

        # use the default gradient
        pass_through_op_output = zero_out_module.qc_quantize(name='quant_op', in_tensor=inp,
                                                             op_mode=mode_var,
                                                             tensor_quantizer_reference=tensor_quant_ref,
                                                             encoding_min=encoding_min,
                                                             encoding_max=encoding_max,
                                                             bit_width=bit_width,
                                                             use_symmetric_encoding=use_symmetric_encoding)
        # pass_through_op = graph.get_operation_by_name('quant_op')
        inp_tensor = sess.graph.get_tensor_by_name('input:0')

        # set the encodings
        tensor_quantizer.isEncodingValid = True
        mode_var.load(int(libpymo.TensorQuantizerOpMode.quantizeDequantize), sess)

        # compute the default gradient
        grads = tf.gradients(pass_through_op_output, [inp_tensor])
        dlossbydx = grads

    # send input; note the last value sent here is > 5.0, and we set the encodings
    # earlier to min = 0.0, max = 5.0, so the input has data > p (the upper clamp bound)
    inp_data = [[1.4581, 0.4829], [0.3125, 5.6150]]

    # check that the gradient returned is a gated version, in this case [[1.0, 1.0], [1.0, 0.0]]
    with graph.as_default():
        input_gradient = sess.run([dlossbydx], feed_dict={inp_tensor: inp_data})[0]

    # validate clamping above the encoding max in the gradient computation
    self.assertTrue(input_gradient[0][0][0] == 1.0)
    self.assertTrue(input_gradient[0][0][1] == 1.0)
    self.assertTrue(input_gradient[0][1][0] == 1.0)
    self.assertTrue(input_gradient[0][1][1] == 0.0)

    # pass input in the correct range
    inp_data = [[1.4581, 0.4829], [0.3125, 1.6150]]

    # check that the gradient returned is not gated, in this case [[1.0, 1.0], [1.0, 1.0]]
    with graph.as_default():
        input_gradient = sess.run([dlossbydx], feed_dict={inp_tensor: inp_data})[0]

    # validate the no-clamping case in the gradient computation
    self.assertTrue(input_gradient[0][0][0] == 1.0)
    self.assertTrue(input_gradient[0][0][1] == 1.0)
    self.assertTrue(input_gradient[0][1][0] == 1.0)
    self.assertTrue(input_gradient[0][1][1] == 1.0)

    # pass input with data < n (the lower clamp bound); the first value here is -0.5
    inp_data = [[-0.5, 0.4829], [0.3125, 1.6150]]

    # check that the gradient returned is a gated version, in this case [[0.0, 1.0], [1.0, 1.0]]
    with graph.as_default():
        input_gradient = sess.run([dlossbydx], feed_dict={inp_tensor: inp_data})[0]

    # validate clamping below the encoding min in the gradient computation
    self.assertTrue(input_gradient[0][0][0] == 0.0)
    self.assertTrue(input_gradient[0][0][1] == 1.0)
    self.assertTrue(input_gradient[0][1][0] == 1.0)
    self.assertTrue(input_gradient[0][1][1] == 1.0)
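
# The gating validated above is the classic straight-through estimator: gradients
# pass through unchanged where the input fell inside [encoding_min, encoding_max]
# and are zeroed where the op clamped. A minimal numpy sketch of that rule (assumed
# behavior, mirroring the asserts above; the real computation lives in
# quantsim_straight_through_grad):
def _ste_input_gradient_sketch(x, upstream_grad, encoding_min, encoding_max):
    x = np.asarray(x, dtype=np.float32)
    inside = (x >= encoding_min) & (x <= encoding_max)   # 1.0 where not clamped
    return np.asarray(upstream_grad, dtype=np.float32) * inside.astype(np.float32)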
def test_qc_quantize_op_cpu(self):
    """ Test the custom op on CPU """
    zero_out_module = tf.load_op_library('libaimet_tf_ops.so')
    graph = tf.Graph()
    config = tf.compat.v1.ConfigProto(log_device_placement=False)
    sess = tf.compat.v1.Session(graph=graph, config=config)
    bitwidth = 8
    use_symm_encoding = True

    with graph.as_default():
        # placeholder for the input
        with tf.device("/device:CPU:0"):
            inp = tf.compat.v1.placeholder(tf.float32, shape=[10], name='input')
            tensor_quantizer = libpymo.TensorQuantizer(libpymo.QuantizationMode.QUANTIZATION_TF_ENHANCED,
                                                       libpymo.RoundingMode.ROUND_NEAREST)
            tensor_quantizer_val = libpymo.PtrToInt64(tensor_quantizer)
            tensor_quant_ref = tf.Variable(initial_value=tensor_quantizer_val, trainable=False, dtype=tf.int64)

            encoding_min = tf.Variable(initial_value=0.0, trainable=True, dtype=tf.double)
            encoding_max = tf.Variable(initial_value=0.0, trainable=True, dtype=tf.double)
            bit_width = tf.Variable(initial_value=bitwidth, trainable=False, dtype=tf.int8)
            use_symmetric_encoding = tf.Variable(initial_value=use_symm_encoding, trainable=False, dtype=tf.bool)
            mode_var = tf.Variable(initial_value=int(libpymo.TensorQuantizerOpMode.updateStats),
                                   trainable=False, dtype=tf.int32)

            sess.run([mode_var.initializer, tensor_quant_ref.initializer, encoding_min.initializer,
                      encoding_max.initializer, bit_width.initializer, use_symmetric_encoding.initializer])

            pass_through_op_output = zero_out_module.qc_quantize(name='quant_op', in_tensor=inp,
                                                                 op_mode=mode_var,
                                                                 tensor_quantizer_reference=tensor_quant_ref,
                                                                 encoding_min=encoding_min,
                                                                 encoding_max=encoding_max,
                                                                 bit_width=bit_width,
                                                                 use_symmetric_encoding=use_symmetric_encoding)

    inp_tensor = sess.graph.get_tensor_by_name('input:0')
    inp_data = np.random.rand(10)

    # get the output
    print(inp_data)
    out_data = sess.run(pass_through_op_output, feed_dict={inp_tensor: inp_data})
    print(out_data)

    # compare the qc_quantize op's output with its input: in updateStats mode the op
    # is a passthrough
    self.assertTrue(np.allclose(out_data, inp_data))

    # compute encodings
    self.assertFalse(tensor_quantizer.isEncodingValid)
    encoding = tensor_quantizer.computeEncoding(bitwidth, use_symm_encoding, False, False)
    self.assertTrue(tensor_quantizer.isEncodingValid)
    print('min=', encoding.min, ', max=', encoding.max)

    # get the output in quantize-dequantize mode
    inp_data = np.random.rand(10) * 2
    print(inp_data)
    mode_var.load(int(libpymo.TensorQuantizerOpMode.quantizeDequantize), sess)
    out_data = sess.run(pass_through_op_output, feed_dict={inp_tensor: inp_data})
    print(out_data)

    # compare the qc_quantize op's output with its input: the quantized-dequantized
    # output should no longer match the input exactly
    self.assertFalse(np.allclose(out_data, inp_data))
    sess.close()
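
# The CPU tests above exercise the typical quantizer-op lifecycle, summarized here
# for reference (a sketch of the convention these tests follow, not an API in
# libpymo):
#   1. op_mode = updateStats        -> op is passthrough; the quantizer gathers range stats
#   2. tensor_quantizer.computeEncoding(bw, symmetric, ...) -> encoding becomes valid
#   3. op_mode = quantizeDequantize -> op output is the quantize-dequantized input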