def test_dnnlowp_group_norm(
    self, N, G, K, H, W, order, in_quantized, out_quantized, weight_quantized, gc, dc
):
    C = G * K

    X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
    if order == "NHWC":
        X = np.transpose(X, [0, 2, 3, 1])
    gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
    beta = np.random.randn(C).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("GroupNorm", ""),
        ("GroupNorm", "DNNLOWP"),
        ("Int8GroupNorm", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, gamma_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                gamma, "gamma_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            X_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                beta, "beta_q", X_q_param, gamma_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        group_norm = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "gamma_q" if do_quantize_weight else "gamma",
                "beta_q" if do_quantize_weight else "beta",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=0 if do_dequantize else 1,
            group=G,
            order=order,
            is_test=True,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(group_norm, outputs[0][0])
        net.Proto().op.extend([group_norm])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("gamma").feed(gamma, device_option=gc)
        self.ws.create_blob("beta").feed(beta, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs, atol_scale=2.0)
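
# The tests in this file compare each DNNLOWP engine against the fp32
# operator run in the same loop. For reference, here is a minimal NumPy
# sketch (not part of the original test suite; `_group_norm_ref` is a
# hypothetical helper) of the fp32 group-norm computation that the quantized
# GroupNorm above is checked against, assuming NCHW layout:
def _group_norm_ref(X, gamma, beta, group, epsilon=1e-5):
    N, C, H, W = X.shape
    # Normalize over each group of C // group channels and all spatial
    # positions.
    Xg = X.reshape(N, group, -1)
    mean = Xg.mean(axis=2, keepdims=True)
    var = Xg.var(axis=2, keepdims=True)
    Y = ((Xg - mean) / np.sqrt(var + epsilon)).reshape(N, C, H, W)
    # Per-channel affine transform with the learned gamma/beta.
    return Y * gamma.reshape(1, C, 1, 1) + beta.reshape(1, C, 1, 1)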
def test_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    nbits_in_non_outlier,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )
    else:
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128

        if order == "NCHW":
            X = nhwc2nchw(X)
            W = nhwc2nchw(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = "DNNLOWP" in engine and weight_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after
    # quantization. This is ensured by having at least one 0 and one 255 for
    # unsigned 8-bit tensors, and at least one -128 and one 127 for signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates in 16 bits, this test uses small numbers
    # except for those 0, 255, -128, and 127 to avoid overflow.
    # We also make sure 255, -128, and 127 are never multiplied together, by
    # putting them in different input channels and zeroing the corresponding
    # input channel in the other matrix. For example, we put 255 in input
    # channel 1 of X, so we make the corresponding input channel of W all
    # zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = nhwc2nchw(X)
        W = nhwc2nchw(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
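
# Illustration (not from the original test suite) of why the inputs above are
# kept within a few units of the zero point: fbgemm's ACC16 kernels accumulate
# int8 * int8 products in a signed 16-bit accumulator, so even a short run of
# worst-case products overflows. `_would_overflow_int16` is a hypothetical
# helper for this sketch.
def _would_overflow_int16(products):
    # Accumulate exactly and report whether any partial sum leaves the
    # int16 range [-32768, 32767].
    acc = 0
    for p in products:
        acc += p
        if not -32768 <= acc <= 32767:
            return True
    return False

# Two worst-case products fit (2 * 127 * 127 = 32258), but three do not
# (3 * 127 * 127 = 48387), hence the careful placement of 255/-128/127 above.
assert not _would_overflow_int16([127 * 127] * 2)
assert _would_overflow_int16([127 * 127] * 3)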
def test_groupwise_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        groupwise_quantization=True,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
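
# A minimal sketch (an assumption about the semantics, not the library's
# implementation) of the quantize_groupwise=1 path exercised above: each
# group's slice of output channels gets its own weight scale instead of one
# scale for the whole tensor, so groups with very different ranges lose less
# precision. `_groupwise_weight_scales` is a hypothetical helper.
def _groupwise_weight_scales(W, group):
    # W: (output_channels, kh, kw, input_channels_per_group) in NHWC layout.
    out_per_group = W.shape[0] // group
    scales = []
    for g in range(group):
        Wg = W[g * out_per_group : (g + 1) * out_per_group]
        # One int8 scale per group, derived from that group's range.
        scales.append((Wg.max() - Wg.min()) / 255.0)
    return scales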
def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    gc,
    dc,
):
    # X and W have scale 1, so they are exactly representable after
    # quantization.
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # Input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max.
    X[:, 0] = X_min
    X[0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from the vpmaddubsw instruction used
    # in fbgemm.
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        X_min,
        X_max,
        W,
        W_min,
        W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP"),
        ("FC", "DNNLOWP_16"),
        ("Int8FC", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
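
# A rough sketch (an assumption, not the actual implementation) of what a
# helper like hardcode_scale_zp.choose_quantization_params computes for
# asymmetric uint8 quantization of a float range [fmin, fmax]:
def _choose_scale_zero_point(fmin, fmax, qmin=0, qmax=255):
    # The representable range must include 0 so that zero stays exact.
    fmin, fmax = min(fmin, 0.0), max(fmax, 0.0)
    scale = (fmax - fmin) / (qmax - qmin)
    zero_point = int(round(qmin - fmin / scale)) if scale > 0 else 0
    return scale, zero_point

# E.g. the FC test's X range [-77, 178] spans exactly 255, giving scale 1 and
# zero_point 77, which is why X is exactly representable after quantization.
assert _choose_scale_zero_point(-77.0, 178.0) == (1.0, 77)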
def test_dnnlowp_conv3d_int(
    self,
    stride,
    pad,
    temporal_kernels,
    spatial_kernels,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    kernels = (temporal_kernels,) + (spatial_kernels,) * 2
    ndim = len(kernels)

    X, W, b = generate_convnd_inputs(
        (stride,) * ndim,
        (pad,) * ndim,
        kernels,
        (dilation,) * ndim,
        (size,) * ndim,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"
        if fall_back_to_NCHW:
            X_nchw = nhwc2nchw(X)
            W_nchw = nhwc2nchw(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            dequantize_output=not do_dequantize,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = nchw2nhwc(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_depthwise_3x3_conv(
    self,
    stride,
    size,
    group,
    batch_size,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    relu,
    gc,
    dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    if relu:
        op_engine_list = [
            ("Conv", ""),
            ("ConvRelu", "DNNLOWP"),
            ("Int8ConvRelu", "DNNLOWP"),
        ]
    else:
        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Int8Conv", "DNNLOWP"),
        ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        preserve_activation_sparsity_int = 1 if preserve_activation_sparsity else 0
        preserve_weight_sparsity_int = 1 if preserve_weight_sparsity else 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity_int,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity_int,
            preserve_weight_sparsity=preserve_weight_sparsity_int,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])
        elif relu:
            relu_op = core.CreateOperator(
                "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([relu_op])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_depthwise_3x3x3_conv(
    self,
    stride,
    size,
    group,
    batch_size,
    fuse_relu,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_convnd_inputs(
        (stride,) * 3,
        (pad,) * 3,
        (kernel,) * 3,
        (dilation,) * 3,
        (size,) * 3,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op = "ConvRelu" if fuse_relu else "Conv"
    op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine
        if fall_back_to_NCHW:
            X_nchw = nhwc2nchw(X)
            W_nchw = nhwc2nchw(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        preserve_activation_sparsity_int = 1 if preserve_activation_sparsity else 0
        preserve_weight_sparsity_int = 1 if preserve_weight_sparsity else 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity_int,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * 3,
            kernels=[kernel] * 3,
            dilations=[dilation] * 3,
            pads=[pad] * (3 * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity_int,
            preserve_weight_sparsity=preserve_weight_sparsity_int,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = nchw2nhwc(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after
    # quantization. This is ensured by having at least one 0 and one 255 for
    # unsigned 8-bit tensors, and at least one -128 and one 127 for signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates in 16 bits, this test uses small numbers
    # except for those 0, 255, -128, and 127 to avoid overflow.
    # We also make sure 255, -128, and 127 are never multiplied together, by
    # putting them in different input channels and zeroing the corresponding
    # input channel in the other matrix. For example, we put 255 in input
    # channel 1 of X, so we make the corresponding input channel of W all
    # zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        W[g * output_channels_per_group, 0, 0, 0] = W_min
        W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
        if not preserve_weight_sparsity:
            W[g * output_channels_per_group : (g + 1) * output_channels_per_group] += g

    if order == "NCHW":
        X = nhwc2nchw(X)
        W = nhwc2nchw(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    nbits_in_non_outlier,
    share_col_buffer,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            True,  # group-wise
        )
    else:
        X_min = -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            W[g * output_channels_per_group : (g + 1) * output_channels_per_group] += g

        if order == "NCHW":
            X = nhwc2nchw(X)
            W = nhwc2nchw(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
def test_dnnlowp_batch_matmul_int_constant_B(
    self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
):
    batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
    batch_dims_A = batch_dims[-C_1:]
    batch_dims_B = batch_dims[-C_2:]
    A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
    B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

    if np.prod(batch_dims) > 0:
        for index in np.ndindex(batch_dims_A):
            # When both input and output are float, each input of the batch
            # has scale 1 but a different offset, so input-wise quantization
            # shouldn't have any input quantization error.
            # A_min = -77 if (A_quantized or out_quantized) else -77 + i
            A_min = -77
            A_max = A_min + 255
            A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
            # input channels 0 and 1 are all A_min to avoid overflow from
            # vpmaddubsw when multiplied with B_min and B_max
            A[index][:, 0] = A_min
            A[index][0, 1] = A_max

        i = 0
        for index in np.ndindex(batch_dims_B):
            # When the weight is quantized lazily, each input of the batch
            # has scale 1 but a different offset, so input-wise quantization
            # shouldn't have any input quantization error.
            B_min = -100 if B_quantized else -100 + i
            # B_min = -100
            B_max = B_min + 255
            B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
            B[index][0, 0] = B_min
            B[index][1, 0] = B_max

            if C_1 > C_2:
                # A has more dims
                for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                    avoid_vpmaddubsw_overflow_fc(
                        m,
                        k,
                        n,
                        A[outer_index] if C_2 == 0 else A[outer_index + index],
                        A_min,
                        A_max,
                        B[index],
                        B_min,
                        B_max,
                    )
            else:
                avoid_vpmaddubsw_overflow_fc(
                    m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                )

            i += 1

    for trans_a, trans_b in product([0, 1], [0, 1]):
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("BatchMatMul", ""),
            ("BatchMatMul", "DNNLOWP"),
            ("Int8BatchMatMul", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize_A = "DNNLOWP" in engine and A_quantized
            do_quantize_B = "DNNLOWP" in engine and B_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize_A:
                quantize_A = core.CreateOperator(
                    "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_A])

            if do_quantize_B:
                int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    B if trans_b else B.swapaxes(-1, -2), "B_q"
                )
                net.Proto().op.extend([int8_given_tensor_fill])

            batch_matmul = core.CreateOperator(
                op_type,
                [
                    "A_q" if do_quantize_A else "A",
                    "B_q" if do_quantize_B else "B",
                ],
                ["Y_q" if do_dequantize else "Y"],
                trans_a=trans_a,
                trans_b=trans_b,
                broadcast=True,
                constant_B=True,
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_B:
                # When a quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of each batch's
                # output, so here we provide the output range observed from
                # the fp32 reference implementation.
                dnnlowp_utils.add_quantization_param_args(
                    batch_matmul, outputs[0][0]
                )
            net.Proto().op.extend([batch_matmul])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("A").feed(
                A.swapaxes(-1, -2) if trans_a else A, device_option=gc
            )
            self.ws.create_blob("B").feed(
                B if trans_b else B.swapaxes(-1, -2), device_option=gc
            )
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
            )

        if np.prod(batch_dims) > 0:
            check_quantized_results_close(outputs)
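
# Illustration (hypothetical helper, not from the original test suite) of the
# round-trip error these comparisons tolerate: quantizing to uint8 and back
# perturbs each value by at most half a quantization step (scale / 2).
def _quantize_dequantize(X, scale, zero_point):
    Xq = np.clip(np.round(X / scale) + zero_point, 0, 255)
    return (Xq - zero_point) * scale

# With scale 1 and the integer-valued inputs constructed above, the round
# trip is exact, so only the matmul's accumulated quantization error remains
# for check_quantized_results_close to bound.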