def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    gc,
    dc,
):
    # X and W have scale 1, so exactly represented after quantization
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channel 0 is all X_min (with a single X_max entry in channel 1 to
    # cover the full range) to avoid overflow from vpmaddubsw when multiplied
    # with W_min and W_max
    X[:, 0] = X_min
    X[0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from vpmaddubsw instruction used in
    # fbgemm
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        X_min,
        X_max,
        W,
        W_min,
        W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP"),
        ("FC", "DNNLOWP_16"),
        ("Int8FC", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                X.min(), X.max()
            )
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # the fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
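
# Hedged sketch (not part of the test suite): the condition that
# avoid_vpmaddubsw_overflow_fc guards against. vpmaddubsw multiplies a uint8
# vector by an int8 vector and adds adjacent pairs of products into int16
# lanes with saturation, so under the scale-1 quantization used above every
# pairwise sum x0*w0 + x1*w1 must fit in [-32768, 32767].
# has_vpmaddubsw_overflow is a hypothetical helper, relying on the
# module-level numpy import.
def has_vpmaddubsw_overflow(X, X_min, W, W_min):
    if X.size == 0 or W.size == 0:
        return False
    X_q = (X - X_min).astype(np.int64)  # uint8 codes, 0..255 (scale 1)
    W_q = (W - W_min - 128).astype(np.int64)  # int8 codes, -128..127 (scale 1)
    for j in range(0, X_q.shape[1] - 1, 2):
        # outer product over (batch, output_channels) for channel pair (j, j+1)
        pair = (
            X_q[:, j][:, None] * W_q[:, j][None, :]
            + X_q[:, j + 1][:, None] * W_q[:, j + 1][None, :]
        )
        if pair.max() > 32767 or pair.min() < -32768:
            return True
    return False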
def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    prepack_weight,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    fuse_relu,
    output_packed_bias,
    gc,
    dc,
):
    # X and W have scale 1, so exactly represented after quantization
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channel 0 is all X_min (with a single X_max entry in channel 1 to
    # cover the full range) to avoid overflow from vpmaddubsw when multiplied
    # with W_min and W_max
    X[:, 0] = X_min
    if batch_size != 0:
        X[0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from vpmaddubsw instruction used in
    # fbgemm
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        X_min,
        X_max,
        W,
        W_min,
        W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [("FC", "")]
    if fuse_relu:
        op_engine_list += [("Int8FCRelu", "DNNLOWP")]
    else:
        op_engine_list += [
            ("FC", "DNNLOWP"),
            ("FC", "DNNLOWP_16"),
            ("Int8FC", "DNNLOWP"),
        ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        X_min = 0 if X.size == 0 else X.min()
        X_max = 0 if X.size == 0 else X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(
            X_min, X_max, preserve_activation_sparsity
        )
        w_q_param = None
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8FCPackWeight",
                inputs,
                ["W_packed", "B_q32"]
                if do_dequantize and output_packed_bias
                else ["W_packed"],
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed"
                if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # the fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(
                fc, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([fc])
        if fuse_relu and "DNNLOWP" not in engine:
            net.Relu(["Y"], "Y")

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
        )

        if output_packed_bias and do_prepack_weight and do_dequantize:
            bias_int32 = self.ws.blobs["B_q32"].fetch()
            if do_quantize_weight:
                np.testing.assert_equal(
                    bias_int32[0],
                    np.round(b / (x_q_param.scale * w_q_param.scale)),
                )
            np.testing.assert_equal(bias_int32[0].dtype, np.int32)

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
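
# Hedged sketch: the relation the B_q32 assertion above checks. The packed
# int32 bias is quantized with scale = input_scale * weight_scale, so
# b_q32 = round(b / (in_scale * w_scale)). quantize_bias_sketch is a
# hypothetical helper for illustration, not part of dnnlowp_utils.
def quantize_bias_sketch(b, in_scale, w_scale):
    return np.round(b / (in_scale * w_scale)).astype(np.int32)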
def test_rowwise_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    gc,
    dc,
):
    print("@given M ", batch_size, " K ", input_channels, " N ", output_channels)
    print("@given in_quantized ", in_quantized, " out_quantized ", out_quantized)

    # X has scale 1, so exactly represented after quantization
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0:2] = X_min
    X[0, 2] = X_max

    # Each row of W has scale 1 but with different offset, so row-wise
    # quantization shouldn't have any input quantization error.
    W = np.zeros((output_channels, input_channels))
    W = W.astype(np.float32)
    for i in range(output_channels):
        W_min = -100 + i
        W_max = W_min + 255
        W[i, :] = np.round(np.random.rand(input_channels) * (W_max - W_min) + W_min)
        W[i, 0] = W_min
        W[i, 1] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            1,
            X,
            X_min,
            X_max,
            W[i : i + 1],
            W_min,
            W_max,
        )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP_ROWWISE"),
        ("FC", "DNNLOWP_ROWWISE_16"),
        ("Int8FC", "DNNLOWP_ROWWISE"),
        ("Int8FCRowWise", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        fc = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
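
# Hedged sketch of the row-wise scheme exercised above: every output row of W
# gets its own (scale, zero_point), so rows that all have scale 1 but
# different offsets quantize losslessly. rowwise_quantize_sketch is a
# hypothetical illustration, not the DNNLOWP_ROWWISE implementation.
def rowwise_quantize_sketch(W):
    W_q = np.empty(W.shape, dtype=np.int8)
    qparams = []
    for i, row in enumerate(W):
        # rows in the test above always span 255, so scale > 0
        scale = (row.max() - row.min()) / 255.0
        zero_point = -128 - int(np.round(row.min() / scale))
        W_q[i] = np.clip(np.round(row / scale) + zero_point, -128, 127)
        qparams.append((scale, zero_point))
    return W_q, qparams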
def test_dnnlowp_batch_matmul_int(self, m, n, k, batch_size, gc, dc):
    # A and B have scale 1, so exactly represented after quantization
    A_min = -77
    A_max = A_min + 255
    A = np.round(np.random.rand(batch_size, m, k) * 255 + A_min)
    A = A.astype(np.float32)
    # input channel 0 of the first batch is all A_min (with a single A_max
    # entry in channel 1) to avoid overflow from vpmaddubsw when multiplied
    # with B_min and B_max
    A[0, :, 0] = A_min
    A[0, 0, 1] = A_max

    B_min = -100
    B_max = B_min + 255
    B = np.round(np.random.rand(batch_size, n, k) * 255 + B_min)
    B = B.astype(np.float32)
    B[0, 0, 0] = B_min
    B[0, 1, 0] = B_max

    for i in range(batch_size):
        avoid_vpmaddubsw_overflow_fc(
            m, k, n, A[i], A_min, A_max, B[i], B_min, B_max
        )

    for trans_a, trans_b in product([0, 1], [0, 1]):
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("BatchMatMul", ""),
            ("BatchMatMul", "DNNLOWP"),
            ("BatchMatMul", "DNNLOWP_16"),
            ("Int8BatchMatMul", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            if "DNNLOWP" in engine:
                quantize_A = core.CreateOperator(
                    "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_A])

                quantize_B = core.CreateOperator(
                    "Quantize", ["B"], ["B_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_B])

            batch_matmul = core.CreateOperator(
                op_type,
                [
                    "A_q" if "DNNLOWP" in engine else "A",
                    "B_q" if "DNNLOWP" in engine else "B",
                ],
                ["Y_q" if "DNNLOWP" in engine else "Y"],
                trans_a=trans_a,
                trans_b=trans_b,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([batch_matmul])

            if "DNNLOWP" in engine:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("A").feed(
                np.transpose(A, (0, 2, 1)) if trans_a else A, device_option=gc
            )
            self.ws.create_blob("B").feed(
                B if trans_b else np.transpose(B, (0, 2, 1)), device_option=gc
            )
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
            )

        check_quantized_results_close(outputs)
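
# Hedged sketch: the fp32 result the quantized engines are compared against.
# With A of shape (batch, m, k) and B of shape (batch, n, k) as generated
# above, each batch computes A[i] @ B[i].T; the trans_a/trans_b branches only
# pre-transpose the fed blobs so the operator recovers the same product.
# batch_matmul_reference is a hypothetical helper, not used by the test.
def batch_matmul_reference(A, B):
    return np.einsum("bmk,bnk->bmn", A, B)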
def test_dnnlowp_batch_matmul_int_constant_B(
    self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
):
    batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
    batch_dims_A = batch_dims[-C_1:]
    batch_dims_B = batch_dims[-C_2:]
    A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
    B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

    if np.prod(batch_dims) > 0:
        for index in np.ndindex(batch_dims_A):
            # When both input and output are float, each input of the batch has
            # scale 1 but with different offset, so input-wise quantization
            # shouldn't have any input quantization error
            # A_min = -77 if (A_quantized or out_quantized) else -77 + i
            A_min = -77
            A_max = A_min + 255
            A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
            # input channel 0 is all A_min (with a single A_max entry in
            # channel 1) to avoid overflow from vpmaddubsw when multiplied
            # with B_min and B_max
            A[index][:, 0] = A_min
            A[index][0, 1] = A_max

        i = 0
        for index in np.ndindex(batch_dims_B):
            # When the weight is quantized lazily, each input of the batch has
            # scale 1 but with a different offset, so input-wise quantization
            # shouldn't have any input quantization error.
            B_min = -100 if B_quantized else -100 + i
            # B_min = -100
            B_max = B_min + 255
            B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
            B[index][0, 0] = B_min
            B[index][1, 0] = B_max

            if C_1 > C_2:
                # A has more dims
                for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                    avoid_vpmaddubsw_overflow_fc(
                        m,
                        k,
                        n,
                        A[outer_index] if C_2 == 0 else A[outer_index + index],
                        A_min,
                        A_max,
                        B[index],
                        B_min,
                        B_max,
                    )
            else:
                avoid_vpmaddubsw_overflow_fc(
                    m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                )
            i += 1

    for trans_a, trans_b in product([0, 1], [0, 1]):
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("BatchMatMul", ""),
            ("BatchMatMul", "DNNLOWP"),
            ("Int8BatchMatMul", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize_A = "DNNLOWP" in engine and A_quantized
            do_quantize_B = "DNNLOWP" in engine and B_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize_A:
                quantize_A = core.CreateOperator(
                    "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_A])

            if do_quantize_B:
                int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    B if trans_b else B.swapaxes(-1, -2), "B_q"
                )
                net.Proto().op.extend([int8_given_tensor_fill])

            batch_matmul = core.CreateOperator(
                op_type,
                ["A_q" if do_quantize_A else "A", "B_q" if do_quantize_B else "B"],
                ["Y_q" if do_dequantize else "Y"],
                trans_a=trans_a,
                trans_b=trans_b,
                broadcast=True,
                constant_B=True,
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_B:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # the fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    batch_matmul, outputs[0][0]
                )
            net.Proto().op.extend([batch_matmul])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("A").feed(
                A.swapaxes(-1, -2) if trans_a else A, device_option=gc
            )
            self.ws.create_blob("B").feed(
                B if trans_b else B.swapaxes(-1, -2), device_option=gc
            )
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
            )

        if np.prod(batch_dims) > 0:
            check_quantized_results_close(outputs)
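
# Hedged sketch: what broadcast=True means for the shapes above. batch_dims_A
# and batch_dims_B are both suffixes of the same batch_dims, so the trailing
# batch dimensions align exactly as NumPy broadcasts them in matmul.
# constant_b_reference is a hypothetical helper, not used by the test.
def constant_b_reference(A, B):
    # A: batch_dims_A + (m, k); B: batch_dims_B + (n, k) -> (..., m, n)
    return np.matmul(A, B.swapaxes(-1, -2))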
def test_rowwise_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    prepack_weight,
    gc,
    dc,
):
    print("@given M ", batch_size, " K ", input_channels, " N ", output_channels)
    print("@given in_quantized ", in_quantized, " out_quantized ", out_quantized)

    # X has scale 1, so exactly represented after quantization
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0:2] = X_min
    X[0, 2] = X_max

    # Each row of W has scale 1 (doubled to 2 for even rows below) but a
    # different offset, so row-wise quantization shouldn't have any input
    # quantization error.
    W = np.zeros((output_channels, input_channels))
    W = W.astype(np.float32)
    for i in range(output_channels):
        W_min = -100 + i
        W_max = W_min + 255
        W[i, :] = np.round(np.random.rand(input_channels) * (W_max - W_min) + W_min)
        W[i, 0] = W_min
        W[i, 1] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            1,
            X,
            X_min,
            X_max,
            W[i : i + 1],
            W_min,
            W_max,
        )

        if i % 2 == 0:
            # stretch even rows so they have scale 2 instead of 1
            W[i, :] = (W[i, :] - W_min) * 2 + W_min

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP_ROWWISE"),
        ("FC", "DNNLOWP_ROWWISE_16"),
        ("Int8FC", "DNNLOWP_ROWWISE"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_prepack_weight = engine == "DNNLOWP_ROWWISE" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        x_q_param = dnnlowp_utils.choose_quantization_params(X.min(), X.max())
        if do_prepack_weight:
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8FCPackWeight",
                inputs,
                ["W_packed"],
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        if do_prepack_weight:
            # When pre-packed quantized weight is provided, we can't rescale
            # the output dynamically by looking at the range of output of
            # each batch, so here we provide the range of output observed
            # from the fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
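
# Hedged sketch: why Int8FCPackWeight takes in_scale in the row-wise flow.
# When the bias is packed alongside the weight, output channel i would be
# quantized with scale = in_scale * w_scale[i], one weight scale per row; a
# hypothetical per-row analogue of the b_q32 relation asserted in the
# prepacked per-tensor test earlier. Not part of the test utilities.
def rowwise_quantize_bias_sketch(b, in_scale, w_scales):
    return np.round(b / (in_scale * np.asarray(w_scales))).astype(np.int32)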