def _test_dnnlowp_nd_int(
    self,
    stride,
    pad,
    kernels,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    gc,
    dc,
):
    """Run an N-d convolution through several engines and compare the results.

    The dimensionality is taken from ``len(kernels)``; the scalar ``stride``,
    ``pad``, ``dilation`` and ``size`` are replicated once per dimension.
    The convolution is executed with the default engine (``""``), with
    ``DNNLOWP_16`` and as ``Int8Conv`` with ``DNNLOWP``; every output is
    collected and handed to ``check_quantized_results_close``.

    ``dc`` is accepted but not used by this helper.
    """
    assume(group == 1 or dilation == 1)

    ndim = len(kernels)

    def per_dim(value):
        # Replicate a scalar hyper-parameter once per spatial dimension.
        return (value,) * ndim

    X, W, b = generate_convnd_inputs(
        per_dim(stride),
        per_dim(pad),
        kernels,
        per_dim(dilation),
        per_dim(size),
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    results = []

    variants = (
        ("Conv", ""),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    )
    for op_type, engine in variants:
        test_net = core.Net("test_net")

        is_lowp = "DNNLOWP" in engine
        # Non-DNNLOWP engines are fed NCHW data even when the test order is
        # NHWC; the result is converted back after the run.
        run_as_nchw = order == "NHWC" and not is_lowp
        if run_as_nchw:
            x_feed = nhwc2nchw(X)
            w_feed = nhwc2nchw(W)
        else:
            x_feed, w_feed = X, W

        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # So weights are only pre-quantized once at least one output has been
        # collected to compute the output scale/zp from.
        quantize_weight = engine == "DNNLOWP" and bool(results)

        if is_lowp:
            x_blob, y_blob = "X_q", "Y_q"
            quantize_op = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            test_net.Proto().op.extend([quantize_op])
        else:
            x_blob, y_blob = "X", "Y"

        if quantize_weight:
            w_blob, b_blob = "W_q", "b_q"
            weight_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            test_net.Proto().op.extend([weight_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                X.min(), X.max()
            )
            bias_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            test_net.Proto().op.extend([bias_fill])
        else:
            w_blob, b_blob = "W", "b"

        conv_op = core.CreateOperator(
            op_type,
            [x_blob, w_blob, b_blob],
            [y_blob],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if run_as_nchw else order,
            dequantize_output=not is_lowp,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if quantize_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected result).
            dnnlowp_utils.add_quantization_param_args(conv_op, results[0].Y)
        test_net.Proto().op.extend([conv_op])

        if is_lowp:
            dequantize_op = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            test_net.Proto().op.extend([dequantize_op])

        self.ws.create_blob("X").feed(x_feed, device_option=gc)
        self.ws.create_blob("W").feed(w_feed, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(test_net)

        Y = self.ws.blobs["Y"].fetch()
        if run_as_nchw:
            Y = nchw2nhwc(Y)
        results.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(results)
def test_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare 2-d convolution outputs across the fp32 and DNNLOWP engines.

    Inputs come from ``generate_conv_inputs``. The convolution is run with
    the default engine, ``DNNLOWP``, ``DNNLOWP_16`` and as ``Int8Conv`` with
    ``DNNLOWP``. ``in_quantized`` / ``out_quantized`` / ``weight_quantized``
    decide whether explicit Quantize / Dequantize operators and pre-quantized
    weight and bias blobs are used for the DNNLOWP engines. All outputs are
    passed to ``check_quantized_results_close``.

    ``dc`` is accepted but not used by this test.
    """
    assume(group == 1 or dilation == 1)

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    results = []

    variants = (
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    )
    for op_type, engine in variants:
        test_net = core.Net("test_net")

        is_lowp = "DNNLOWP" in engine
        quantize_input = is_lowp and in_quantized
        quantize_output = is_lowp and out_quantized
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Require at least one collected output to compute output scale/zp.
        quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and bool(results)
        )

        if quantize_input:
            quantize_op = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            test_net.Proto().op.extend([quantize_op])

        if quantize_weight:
            weight_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            test_net.Proto().op.extend([weight_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                X.min(), X.max()
            )
            bias_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            test_net.Proto().op.extend([bias_fill])

        x_blob = "X_q" if quantize_input else "X"
        if quantize_weight:
            w_blob, b_blob = "W_q", "b_q"
        else:
            w_blob, b_blob = "W", "b"
        y_blob = "Y_q" if quantize_output else "Y"

        conv_op = core.CreateOperator(
            op_type,
            [x_blob, w_blob, b_blob],
            [y_blob],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not quantize_output,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if quantize_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected result).
            dnnlowp_utils.add_quantization_param_args(
                conv_op, results[0].Y, preserve_activation_sparsity
            )
        test_net.Proto().op.extend([conv_op])

        if quantize_output:
            dequantize_op = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            test_net.Proto().op.extend([dequantize_op])

        for blob_name, blob_value in (("X", X), ("W", W), ("b", b)):
            self.ws.create_blob(blob_name).feed(blob_value, device_option=gc)
        self.ws.run(test_net)

        results.append(
            Output(
                Y=self.ws.blobs["Y"].fetch(),
                op_type=op_type,
                engine=engine,
                order=order,
            )
        )

    check_quantized_results_close(results, symmetric=preserve_activation_sparsity)
def test_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 convolution with the DNNLOWP_ACC16 engine on integer data.

    X and W are built by hand (small integers plus the extreme values of the
    8-bit range), the convolution is run with the default engine, as ``Conv``
    with ``DNNLOWP_ACC16`` and as ``Int8Conv`` with ``DNNLOWP_ACC16``, and
    all outputs are passed to ``check_quantized_results_close``.

    An empty batch (``batch_size == 0``) is tolerated while building X and
    while choosing the input quantization parameters: the single-element
    write into batch 0 is skipped and the input range falls back to
    ``(0, 0)`` instead of reducing over an empty array.

    ``dc`` is accepted but not used by this test.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly represented after
    # quantization. This is made sure by having at least one 0 and one 255
    # for unsigned 8-bit tensors, and at least one -128 and one 127 for
    # signed 8-bit tensors.
    # Since fbgemm_acc16 accumulates to 16-bit, to avoid overflow we use
    # small numbers except for those 0, 255, -128, and 127 for this test.
    # We also make sure 255, -128, or 127 are not multiplied together by
    # putting them in different input channels and making the corresponding
    # input channel in the other matrix 0.
    # For example, we put 255 in input channel 1 in X, so we make the
    # corresponding input channel in W all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        # Indexing batch 0 of an empty batch would raise IndexError.
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            # Quantize/Dequantize always use the plain "DNNLOWP" engine here,
            # not the engine of the convolution under test.
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias. min()/max() of a zero-size array raise ValueError, so an
            # empty input falls back to the range (0, 0).
            x_range_min = 0 if X.size == 0 else X.min()
            x_range_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                x_range_min, x_range_max, preserve_activation_sparsity
            )
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (always the first collected output,
            # because the first engine in the list is the default one).
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0].Y, preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    gc,
    dc,
):
    """Compare fully-connected outputs across the fp32 and DNNLOWP engines.

    X and W are random integers spanning a 255-wide range. The FC is run
    with the default engine, ``DNNLOWP``, ``DNNLOWP_16`` and as ``Int8FC``
    with ``DNNLOWP``; all outputs go to ``check_quantized_results_close``.

    An empty batch (``batch_size == 0``) is tolerated while building X and
    while choosing the input quantization parameters: the write into row 0
    is skipped and the input range falls back to ``(0, 0)`` instead of
    reducing over an empty array.

    ``dc`` is accepted but not used by this test.
    """
    # X and W have scale 1, so exactly represented after quantization
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0] = X_min
    if batch_size != 0:
        # Row 0 does not exist for an empty batch (would raise IndexError).
        X[0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from vpmaddubsw instruction used in
    # fbgemm
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        X_min,
        X_max,
        W,
        W_min,
        W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP"),
        ("FC", "DNNLOWP_16"),
        ("Int8FC", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        # Weights are only pre-quantized once a reference output exists,
        # because its range is attached to the FC operator below.
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias. min()/max() of a zero-size array raise ValueError, so an
            # empty input falls back to the range (0, 0).
            x_range_min = 0 if X.size == 0 else X.min()
            x_range_max = 0 if X.size == 0 else X.max()
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                x_range_min, x_range_max
            )
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected output).
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0].Y)
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    prepack_weight,
    nbits_in_non_outlier,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 convolution with DNNLOWP_ACC16 using outlier handling.

    When ``nbits_in_non_outlier`` is 0 the inputs come from
    ``generate_conv_inputs``; otherwise X, W and b are built by hand from
    small integers plus the range extremes. The convolution is run with the
    default engine, as ``Conv`` with ``DNNLOWP_ACC16`` and as ``Int8Conv``
    with ``DNNLOWP_ACC16``, optionally with pre-quantized and/or pre-packed
    weights created in a separate init net. All outputs are passed to
    ``check_quantized_results_close``.

    An empty batch (``batch_size == 0``) is tolerated while building X and
    while choosing the input quantization parameters: the single-element
    write into batch 0 is skipped and the input range falls back to
    ``(0, 0)`` instead of reducing over an empty array.

    ``dc`` is accepted but not used by this test.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )
    else:
        # NOTE(review): X and W are laid out channels-last here and are not
        # converted when order == "NCHW" — confirm this test is only driven
        # with order == "NHWC".
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            # Indexing batch 0 of an empty batch would raise IndexError.
            X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(
                output_channels, kernel, kernel, input_channels_per_group
            )
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        # Weight/bias fill and packing operators go into init_net, which is
        # run before the main net.
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = "DNNLOWP" in engine and weight_quantized
        do_prepack_weight = "DNNLOWP" in engine and prepack_weight

        if do_quantize:
            # Quantize/Dequantize always use the plain "DNNLOWP" engine here,
            # not the engine of the convolution under test.
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        # Computed for every engine because both the bias fill and the
        # packing operator need it. min()/max() of a zero-size array raise
        # ValueError, so an empty input falls back to the range (0, 0).
        x_range_min = 0 if X.size == 0 else X.min()
        x_range_max = 0 if X.size == 0 else X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(
            x_range_min, x_range_max, preserve_activation_sparsity
        )
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                nbits_in_non_outlier=nbits_in_non_outlier,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed"
                if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation. outputs is never empty here: all
            # three flags require a DNNLOWP engine, and the default engine
            # comes first in op_engine_list.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0].Y, preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_group_norm(
    self,
    N,
    G,
    K,
    H,
    W,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    gc,
    dc,
):
    """Compare GroupNorm outputs across the fp32 and DNNLOWP engines.

    The input has ``C = G * K`` channels and is generated as ``(N, C, H, W)``
    before being converted with ``utils.NCHW2NHWC`` when ``order`` is
    ``"NHWC"``. GroupNorm is run with the default engine, with ``DNNLOWP``
    and as ``Int8GroupNorm`` with ``DNNLOWP``; all outputs are passed to
    ``check_quantized_results_close`` with ``atol_scale=2.0``.

    ``dc`` is accepted but not used by this test.
    """
    C = G * K

    X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
    if order == "NHWC":
        X = utils.NCHW2NHWC(X)

    gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
    beta = np.random.randn(C).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    results = []

    variants = (
        ("GroupNorm", ""),
        ("GroupNorm", "DNNLOWP"),
        ("Int8GroupNorm", "DNNLOWP"),
    )
    for op_type, engine in variants:
        test_net = core.Net("test_net")

        is_lowp = "DNNLOWP" in engine
        quantize_input = is_lowp and in_quantized
        quantize_output = is_lowp and out_quantized
        # gamma/beta are only pre-quantized once a reference output exists,
        # because its range is attached to the operator below.
        quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and bool(results)
        )

        if quantize_input:
            quantize_op = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            test_net.Proto().op.extend([quantize_op])

        if quantize_weight:
            gamma_fill, gamma_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                gamma, "gamma_q"
            )
            test_net.Proto().op.extend([gamma_fill])

            # An empty X has no min/max; fall back to the range (0, 0).
            if X.size == 0:
                x_lo = 0
                x_hi = 0
            else:
                x_lo = X.min()
                x_hi = X.max()
            X_q_param = dnnlowp_utils.choose_quantization_params(x_lo, x_hi)
            beta_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                beta, "beta_q", X_q_param, gamma_q_param
            )
            test_net.Proto().op.extend([beta_fill])

        x_blob = "X_q" if quantize_input else "X"
        if quantize_weight:
            gamma_blob, beta_blob = "gamma_q", "beta_q"
        else:
            gamma_blob, beta_blob = "gamma", "beta"
        y_blob = "Y_q" if quantize_output else "Y"

        group_norm_op = core.CreateOperator(
            op_type,
            [x_blob, gamma_blob, beta_blob],
            [y_blob],
            dequantize_output=0 if quantize_output else 1,
            group=G,
            order=order,
            is_test=True,
            engine=engine,
            device_option=gc,
        )
        if quantize_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected result).
            dnnlowp_utils.add_quantization_param_args(group_norm_op, results[0].Y)
        test_net.Proto().op.extend([group_norm_op])

        if quantize_output:
            dequantize_op = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            test_net.Proto().op.extend([dequantize_op])

        for blob_name, blob_value in (("X", X), ("gamma", gamma), ("beta", beta)):
            self.ws.create_blob(blob_name).feed(blob_value, device_option=gc)
        self.ws.run(test_net)

        fetched = self.ws.blobs["Y"].fetch()
        results.append(Output(Y=fetched, op_type=op_type, engine=engine))

    check_quantized_results_close(results, atol_scale=2.0)
def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    prepack_weight,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    fuse_relu,
    output_packed_bias,
    gc,
    dc,
):
    """Compare fully-connected outputs across the fp32 and DNNLOWP engines.

    The fp32 ``FC`` reference always runs first. With ``fuse_relu`` the only
    other variant is ``Int8FCRelu`` on ``DNNLOWP`` (and a Relu is appended
    to the non-DNNLOWP net); otherwise ``FC`` on ``DNNLOWP`` and
    ``DNNLOWP_16`` and ``Int8FC`` on ``DNNLOWP`` are run. Weights may be
    pre-quantized and/or pre-packed in an init net. When a packed bias is
    produced it is checked for value (if the weight was pre-quantized) and
    for int32 dtype. All outputs go to ``check_quantized_results_close``.

    ``dc`` is accepted but not used by this test.
    """
    # X and W have scale 1, so exactly represented after quantization
    x_lo = 0 if preserve_activation_sparsity else -77
    x_hi = x_lo + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (x_hi - x_lo) + x_lo
    ).astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0] = x_lo
    if batch_size != 0:
        X[0, 1] = x_hi

    if preserve_weight_sparsity:
        w_lo, w_hi = -128, 100
    else:
        w_lo = -100
        w_hi = w_lo + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (w_hi - w_lo) + w_lo
    ).astype(np.float32)
    W[0, 0] = w_lo
    W[1, 0] = w_hi

    # Make sure we won't have overflows from vpmaddubsw instruction used in
    # fbgemm
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        x_lo,
        x_hi,
        W,
        w_lo,
        w_hi,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    if fuse_relu:
        lowp_variants = [("Int8FCRelu", "DNNLOWP")]
    else:
        lowp_variants = [
            ("FC", "DNNLOWP"),
            ("FC", "DNNLOWP_16"),
            ("Int8FC", "DNNLOWP"),
        ]
    variants = [("FC", "")] + lowp_variants

    for op_type, engine in variants:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        is_lowp = "DNNLOWP" in engine
        quantize_input = is_lowp and in_quantized
        quantize_output = is_lowp and out_quantized
        quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and bool(outputs)
        )
        prepack = engine == "DNNLOWP" and prepack_weight

        if quantize_input:
            quantize_op = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize_op])

        # Observed input range; an empty X has no min/max, so use (0, 0).
        if X.size == 0:
            observed_lo = 0
            observed_hi = 0
        else:
            observed_lo = X.min()
            observed_hi = X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(
            observed_lo, observed_hi, preserve_activation_sparsity
        )

        w_q_param = None
        if quantize_weight:
            weight_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([weight_fill])

            # Bias
            bias_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([bias_fill])

        if quantize_weight:
            w_blob, b_blob = "W_q", "b_q"
        else:
            w_blob, b_blob = "W", "b"

        if prepack:
            pack_inputs = [w_blob]
            if quantize_output:
                pack_inputs += [b_blob]
            if quantize_output and output_packed_bias:
                pack_outputs = ["W_packed", "B_q32"]
            else:
                pack_outputs = ["W_packed"]
            pack_op = core.CreateOperator(
                "Int8FCPackWeight",
                pack_inputs,
                pack_outputs,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack_op])

        fc_op = core.CreateOperator(
            op_type,
            [
                "X_q" if quantize_input else "X",
                "W_packed" if prepack else w_blob,
                b_blob,
            ],
            ["Y_q" if quantize_output else "Y"],
            dequantize_output=not quantize_output,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            device_option=gc,
        )
        if quantize_weight or prepack:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected output).
            dnnlowp_utils.add_quantization_param_args(
                fc_op, outputs[0].Y, preserve_activation_sparsity
            )
        net.Proto().op.extend([fc_op])

        # The reference FC has no fused Relu, so add one explicitly.
        if fuse_relu and not is_lowp:
            net.Relu(["Y"], "Y")

        if quantize_output:
            dequantize_op = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize_op])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
        )

        # Check the packed bias emitted by Int8FCPackWeight, if any.
        if output_packed_bias and prepack and quantize_output:
            packed_bias = self.ws.blobs["B_q32"].fetch()
            if quantize_weight:
                expected_bias = np.round(b / (x_q_param.scale * w_q_param.scale))
                np.testing.assert_equal(packed_bias[0], expected_bias)
            np.testing.assert_equal(packed_bias[0].dtype, np.int32)

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_batch_matmul_int_constant_B(
    self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
):
    """Compare BatchMatMul outputs across engines with B treated as constant.

    A gets ``C_1`` and B gets ``C_2`` leading batch dimensions taken from the
    tail of a shared random ``batch_dims`` tuple (each entry drawn from
    ``np.random.randint(3, ...)``, so entries may be 0). For every
    combination of ``trans_a`` / ``trans_b`` the product is computed with
    the default engine, with ``DNNLOWP`` and as ``Int8BatchMatMul`` with
    ``DNNLOWP`` (``constant_B=True``, ``broadcast=True``), and the outputs
    are passed to ``check_quantized_results_close`` unless one of the batch
    dimensions is 0.

    ``dc`` is accepted but not used by this test.
    """
    batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
    # NOTE(review): batch_dims[-0:] is the whole tuple, so C_1 == 0 (or
    # C_2 == 0) selects every batch dimension rather than none — confirm
    # this is intended.
    batch_dims_A = batch_dims[-C_1:]
    batch_dims_B = batch_dims[-C_2:]
    A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
    B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

    # A and B stay all-zero when any batch dimension is 0.
    if np.prod(batch_dims) > 0:
        for index in np.ndindex(batch_dims_A):
            # When both input and output are float, each input of the batch has
            # scale 1 but with different offset, so input-wise quantization
            # shouldn't have any input quantization error
            # A_min = -77 if (A_quantized or out_quantized) else -77 + i
            A_min = -77
            A_max = A_min + 255
            A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
            # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw
            # when multiplied with B_min and B_max
            A[index][:, 0] = A_min
            A[index][0, 1] = A_max

        # i counts the B matrices processed so far; it shifts B_min per
        # matrix when B is not pre-quantized.
        i = 0
        for index in np.ndindex(batch_dims_B):
            # When weight is quantized in a lazy manner, each input of the batch has
            # scale 1 but with different offset, so input-wise quantization
            # shouldn't have any input quantization error when weight is quantized
            # in a lazy manner.
            B_min = -100 if B_quantized else -100 + i
            # B_min = -100
            B_max = B_min + 255
            B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
            B[index][0, 0] = B_min
            B[index][1, 0] = B_max

            # A_min and A_max below are the values left over from the A loop
            # above (they are the same for every A matrix).
            if C_1 > C_2:
                # A has more dims
                for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                    avoid_vpmaddubsw_overflow_fc(
                        m,
                        k,
                        n,
                        A[outer_index] if C_2 == 0 else A[outer_index + index],
                        A_min,
                        A_max,
                        B[index],
                        B_min,
                        B_max,
                    )
            else:
                avoid_vpmaddubsw_overflow_fc(
                    m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                )
            i += 1

    for trans_a, trans_b in product([0, 1], [0, 1]):
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        # Results are collected (and checked) separately per transpose
        # combination.
        outputs = []

        op_engine_list = [
            ("BatchMatMul", ""),
            ("BatchMatMul", "DNNLOWP"),
            ("Int8BatchMatMul", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize_A = "DNNLOWP" in engine and A_quantized
            do_quantize_B = "DNNLOWP" in engine and B_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize_A:
                quantize_A = core.CreateOperator(
                    "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_A])

            if do_quantize_B:
                # B is stored as (..., n, k); it is passed as-is when
                # trans_b is set and with its last two axes swapped
                # otherwise, matching the "B" blob fed below.
                int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    B if trans_b else B.swapaxes(-1, -2), "B_q"
                )
                net.Proto().op.extend([int8_given_tensor_fill])

            batch_matmul = core.CreateOperator(
                op_type,
                ["A_q" if do_quantize_A else "A", "B_q" if do_quantize_B else "B"],
                ["Y_q" if do_dequantize else "Y"],
                trans_a=trans_a,
                trans_b=trans_b,
                broadcast=True,
                constant_B=True,
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_B:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    batch_matmul, outputs[0][0]
                )
            net.Proto().op.extend([batch_matmul])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            # A is stored as (..., m, k); its last two axes are swapped when
            # trans_a is set.
            self.ws.create_blob("A").feed(
                A.swapaxes(-1, -2) if trans_a else A, device_option=gc
            )
            self.ws.create_blob("B").feed(
                B if trans_b else B.swapaxes(-1, -2), device_option=gc
            )
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
            )

        # Skip the comparison when a batch dimension is 0.
        if np.prod(batch_dims) > 0:
            check_quantized_results_close(outputs)
def test_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    weight_quantized,
    prepack_weight,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare 2-d convolution outputs across the fp32 and DNNLOWP engines.

    Inputs come from ``generate_conv_inputs``. The convolution is run with
    the default engine, ``DNNLOWP``, ``DNNLOWP_16`` and as ``Int8Conv`` with
    ``DNNLOWP``. DNNLOWP engines always get explicit Quantize / Dequantize
    operators; weights may additionally be pre-quantized and/or pre-packed
    in an init net. Weight pre-packing is only exercised for NHWC order.
    Each net is executed through ``run_conv_or_fc`` and the collected
    outputs are passed to ``check_quantized_results_close``.

    ``dc`` is accepted but not used by this test.
    """
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    variants = (
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    )
    for op_type, engine in variants:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        # Input quantization and output dequantization both follow the
        # engine: every DNNLOWP variant gets explicit Quantize/Dequantize.
        is_lowp = "DNNLOWP" in engine
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Require at least one collected output to compute output scale/zp.
        quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and bool(outputs)
        )
        prepack = engine == "DNNLOWP" and prepack_weight

        if is_lowp:
            quantize_op = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize_op])

        x_q_param = dnnlowp_utils.choose_quantization_params(
            X.min(), X.max(), preserve_activation_sparsity
        )

        if quantize_weight:
            w_blob, b_blob = "W_q", "b_q"
            weight_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([weight_fill])

            # Bias
            bias_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([bias_fill])
        else:
            w_blob, b_blob = "W", "b"

        if prepack:
            pack_inputs = [w_blob]
            if is_lowp:
                pack_inputs += [b_blob]
            pack_op = core.CreateOperator(
                "Int8ConvPackWeight",
                pack_inputs,
                ["W_packed"],
                group=group,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack_op])

        conv_op = core.CreateOperator(
            op_type,
            [
                "X_q" if is_lowp else "X",
                "W_packed" if prepack else w_blob,
                b_blob,
            ],
            ["Y_q" if is_lowp else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if quantize_weight or prepack:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation (the first collected output).
            dnnlowp_utils.add_quantization_param_args(
                conv_op, outputs[0].Y, preserve_activation_sparsity
            )
        net.Proto().op.extend([conv_op])

        if is_lowp:
            dequantize_op = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize_op])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)