def test_linear_quant_batchedmatmul_8bit(self):
    """Quantize a batched-matmul layer to 8 bits and compare with float math.

    Builds a single-layer network holding a 32x32 weight matrix plus bias,
    runs ``quantize_weights(mlmodel, 8)`` on it, and then checks that:

    * the quantized layer stores its weights in ``rawValue`` and no longer
      in ``floatValue``;
    * the quantized model's prediction has the same shape as, and is within
      an absolute tolerance of 0.1 of, the unquantized ``data @ W + bias``.
    """
    # Fixed seed so the random weights/inputs are reproducible across runs.
    np.random.seed(1988)
    W = np.random.rand(32, 32) * 2.0 - 1
    bias = np.random.rand(32)

    input_features = [("data", datatypes.Array(2, 32))]
    output_features = [("out", None)]
    builder = NeuralNetworkBuilder(
        input_features,
        output_features,
        disable_rank5_shape_mapping=True,
    )
    builder.add_batched_mat_mul(
        name="batched_matmul",
        input_names=["data"],
        output_name="out",
        weight_matrix_rows=32,
        weight_matrix_columns=32,
        W=W,
        bias=bias,
    )
    mlmodel = MLModel(builder.spec)

    q_mlmodel = quantize_weights(mlmodel, 8)
    q_spec = q_mlmodel.get_spec()
    q_layer = q_spec.neuralNetwork.layers[0].batchedMatmul

    # After quantization the float weights must be gone and the packed
    # representation must be populated.
    self.assertEqual(len(q_layer.weights.floatValue), 0)
    self.assertGreater(len(q_layer.weights.rawValue), 0)

    data = np.random.rand(2, 32)
    data_dict = {"data": data}
    out = q_mlmodel.predict(data_dict, useCPUOnly=True)["out"]

    # Reference result computed with the original (unquantized) weights.
    expected_out = np.matmul(data, W) + bias
    self.assertEqual(out.shape, expected_out.shape)
    # rtol / equal_nan are spelled out to keep the same acceptance criterion
    # as np.allclose(..., atol=0.1); assert_allclose reports the mismatching
    # elements on failure.
    np.testing.assert_allclose(
        out.flatten(),
        expected_out.flatten(),
        rtol=1e-5,
        atol=0.1,
        equal_nan=False,
    )
def test_linear_quant_batchedmatmul_5bit(self):
    """Run a batched-matmul layer built from pre-quantized 5-bit weights.

    The 2x3 weight matrix is supplied as packed 5-bit bytes together with
    linear ``quant_scale`` / ``quant_bias`` vectors (one entry per weight
    column). The model prediction is compared with a reference computed
    from the dequantized weights ``quant_scale * W + quant_bias``.
    """
    # Quantized weight codes; every value fits in 5 bits (0..31).
    W = np.array([[31, 20, 11], [1, 0, 8]], dtype=np.uint8)
    quant_scale = np.array([[10.0, 2.0, 3.0]])
    quant_bias = np.array([[-2.0, -10.0, 6.0]])
    # The (1, 3) scale/bias rows broadcast across both rows of W.
    W_unquantized = quant_scale * W + quant_bias
    bias = np.array([1.0, 2.0, 3.0])

    input_features = [('data', datatypes.Array(2, 2))]
    output_features = [('out', None)]
    builder = NeuralNetworkBuilder(
        input_features,
        output_features,
        disable_rank5_shape_mapping=True,
    )
    builder.add_batched_mat_mul(
        name='batched_matmul',
        input_names=['data'],
        output_name='out',
        weight_matrix_rows=2,
        weight_matrix_columns=3,
        W=_convert_array_to_nbit_quantized_bytes(W.flatten(), 5).tobytes(),
        bias=bias,
        is_quantized_weight=True,
        quantization_type='linear',
        nbits=5,
        quant_scale=quant_scale.flatten(),
        quant_bias=quant_bias.flatten(),
    )
    mlmodel = MLModel(builder.spec)

    data = np.array([[5, 6], [10, 12]], dtype=np.float32)
    data_dict = {'data': data}
    out = mlmodel.predict(data_dict, useCPUOnly=True)['out']

    # Reference result from the dequantized weights.
    expected_out = np.matmul(data, W_unquantized) + bias
    self.assertEqual(out.shape, expected_out.shape)
    # Tolerances mirror np.allclose's defaults so the acceptance criterion
    # is unchanged; assert_allclose reports the mismatching elements.
    np.testing.assert_allclose(
        out.flatten(),
        expected_out.flatten(),
        rtol=1e-5,
        atol=1e-8,
        equal_nan=False,
    )