def test_dnnlowp_dequantize(self, size, gc, dc): min_ = -10.0 max_ = 20.0 X = (np.random.rand(size) * (max_ - min_) + min_).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_type_list = ["Dequantize", "Int8Dequantize"] engine = "DNNLOWP" outputs.append(Output(X, op_type="", engine="")) for op_type in op_type_list: net = core.Net("test_net") quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) dequantize = core.CreateOperator(op_type, ["X_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
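# --- Added sketch (not from the test suite): the uint8 affine quantization
# round trip that the Quantize/Dequantize pair above exercises. The helper
# names are illustrative, not the DNNLOWP API; parameter choice mirrors the
# usual (max - min) / 255 rule. Reconstruction error is bounded by scale / 2.
import numpy as np

def _choose_scale_zp(x_min, x_max, qmin=0, qmax=255):
    scale = (x_max - x_min) / (qmax - qmin)
    zero_point = int(np.clip(np.round(qmin - x_min / scale), qmin, qmax))
    return scale, zero_point

def _quantize(x, scale, zp, qmin=0, qmax=255):
    return np.clip(np.round(x / scale) + zp, qmin, qmax).astype(np.uint8)

def _dequantize(x_q, scale, zp):
    return scale * (x_q.astype(np.float32) - zp)

x = (np.random.rand(100) * 30.0 - 10.0).astype(np.float32)  # min_=-10, max_=20 as above
scale, zp = _choose_scale_zp(-10.0, 20.0)
x_hat = _dequantize(_quantize(x, scale, zp), scale, zp)
assert np.abs(x - x_hat).max() <= scale / 2 + 1e-6  # only rounding error remains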
def test_dnnlowp_gather(self, dim1, dim2, is_empty, in_quantized, out_quantized, gc, dc): if is_empty: dim2 = 0 # FIXME: DNNLOWP Gather doesn't support quantized input with # dequantized output if in_quantized: out_quantized = True data = (np.random.rand(dim1) * 2 - 1).astype(np.float32) index = np.floor(np.random.rand(dim2) * dim1).astype(np.int32) Output = collections.namedtuple("Output", ["out", "op_type", "engine"]) outputs = [] op_engine_list = [ ("Gather", ""), ("Gather", "DNNLOWP"), ("Int8Gather", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize_data = core.CreateOperator("Quantize", ["data"], ["data_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize_data]) gather = core.CreateOperator( op_type, ["data_q" if do_quantize else "data", "index"], ["out_q" if do_dequantize else "out"], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) net.Proto().op.extend([gather]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["out_q"], ["out"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("data").feed(data, device_option=gc) self.ws.create_blob("index").feed(index, device_option=gc) self.ws.run(net) outputs.append( Output(out=self.ws.blobs["out"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs, ref=data)
def test_dnnlowp_relu(self, size, gc, dc): min_ = -10. max_ = 10. scale = (max_ - min_) / 255 zero_point = int(np.round(-min_ / scale)) X = (np.random.rand(size) * (max_ - min_) + min_).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("Relu", ""), ("Relu", "DNNLOWP"), ("Int8Relu", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") if engine == "DNNLOWP": quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc, Y_scale=scale, Y_zero_point=zero_point, ) net.Proto().op.extend([quantize]) relu = core.CreateOperator( op_type, ["X_q" if engine == "DNNLOWP" else "X"], ["Y_q" if engine == "DNNLOWP" else "Y"], engine=engine, device_option=gc, ) net.Proto().op.extend([relu]) if engine == "DNNLOWP": dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc, ) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) # Y = max(0, X) so the only error is quantization of inputs check_quantized_results_close(outputs, ref=X)
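# --- Added sketch (assumption about the arithmetic, not DNNLOWP source): since
# the test above pins Y_scale/Y_zero_point to the input's parameters, quantized
# ReLU reduces to an elementwise max against the zero point -- real 0.0 maps
# exactly onto zero_point, so the only remaining error is input quantization.
import numpy as np

def _quantized_relu(x_q, zero_point):
    # max(0, x) in the real domain == max(zero_point, x_q) in the quantized domain
    return np.maximum(x_q, np.uint8(zero_point))

scale, zero_point = 20.0 / 255, 128  # matches min_=-10, max_=10, zp=round(-min_/scale)
x_q = np.random.randint(0, 256, size=100).astype(np.uint8)
y = scale * (_quantized_relu(x_q, zero_point).astype(np.float32) - zero_point)
assert (y >= 0.0).all()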
def test_dnnlowp_elementwise_sum_int(self, N, M, gc, dc): # All inputs have scale 1, so exactly represented after quantization inputs = M * [None] X_names = M * [None] X_q_names = M * [None] for i in range(M): X = np.random.randint(-128, 127, N, np.int8).astype(np.float32) X[0] = -128 X[-1] = 127 inputs[i] = X X_names[i] = chr(ord("A") + i) X_q_names[i] = X_names[i] + "_q" Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [("Sum", ""), ("Sum", "DNNLOWP"), ("Int8Sum", "DNNLOWP")] for op_type, engine in op_engine_list: net = core.Net("test_net") if engine == "DNNLOWP": for i in range(M): quantize = core.CreateOperator( "Quantize", X_names[i], X_q_names[i], engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) sum_ = core.CreateOperator( op_type, X_q_names if engine == "DNNLOWP" else X_names, ["Y_q" if engine == "DNNLOWP" else "Y"], engine=engine, device_option=gc, ) net.Proto().op.extend([sum_]) if engine == "DNNLOWP": dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) for i in range(M): self.ws.create_blob(X_names[i]).feed(inputs[i], device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def test_dnnlowp_elementwise_add_broadcast_axis(self, gc, dc): for bdim, axis in [ ((3, 4), 1), # broadcasting intermediate dimensions ((2, ), 0), # broadcasting the first dimension ((1, 4, 1), 1) ]: # broadcasting with single elem dimensions at both ends min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_) A = A.astype(np.float32) B = np.round(np.random.rand(*bdim) * 255 / 2 - 64).astype( np.float32) A.flat[0] = min_ A.flat[1] = max_ B.flat[0] = -64 B.flat[1] = 127. / 2 Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("Add", ""), ("Add", "DNNLOWP"), ("Int8Add", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") add = core.CreateOperator( op_type, ['A', 'B'], ['Y'], engine=engine, device_option=gc, broadcast=1, axis=axis, dequantize_output=1, ) net.Proto().op.extend([add]) self.ws.create_blob('A').feed(A, device_option=gc) self.ws.create_blob('B').feed(B, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
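# --- Added sketch (assumption, not the DNNLOWP kernels): a quantized Add must
# requantize, because the sum's real range generally exceeds either input's.
# Keeping A and B integer-valued above makes the float reference path exact, so
# only this requantization step contributes error.
import numpy as np

def _quantized_add(q_a, s_a, z_a, q_b, s_b, z_b, s_y, z_y):
    # dequantize both operands, add in the real domain, requantize with Y's params
    real = s_a * (q_a.astype(np.int32) - z_a) + s_b * (q_b.astype(np.int32) - z_b)
    return np.clip(np.round(real / s_y) + z_y, 0, 255).astype(np.uint8)

# e.g. reals -90 and 68 sum to -22; with s_y=1, z_y=128 that lands on 106
q_y = _quantized_add(np.array([10], np.uint8), 1.0, 100,
                     np.array([200], np.uint8), 0.5, 64, 1.0, 128)
assert q_y[0] == 106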
def test_dnnlowp_elementwise_mul_broadcast_axis(self, gc, dc): for bdim, axis in [ ((3, 4), 1), # broadcasting intermediate dimensions ((2, ), 0), # broadcasting the first dimension ((1, 4, 1), 1), ]: # broadcasting with single elem dimensions at both ends min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_) A = A.astype(np.float32) B = np.round(np.random.rand(*bdim) * 255 - 128).astype(np.float32) A.flat[0] = min_ A.flat[1] = max_ B.flat[0] = -128 B.flat[1] = 127 Output = collections.namedtuple("Output", ["Y", "engine"]) outputs = [] engine_list = ["", "DNNLOWP"] for engine in engine_list: net = core.Net("test_net") mul = core.CreateOperator( "Mul", ["A", "B"], ["Y"], engine=engine, device_option=gc, broadcast=1, axis=axis, dequantize_output=1, ) net.Proto().op.extend([mul]) self.ws.create_blob("A").feed(A, device_option=gc) self.ws.create_blob("B").feed(B, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), engine=engine)) check_quantized_results_close(outputs)
def test_dnnlowp_elementwise_add_broadcast(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_) A = A.astype(np.float32) A[0, 0, 0, 0] = min_ A[0, 0, 0, 1] = max_ B = np.round(np.random.rand(4, 5) * 255 / 2 - 64).astype(np.float32) B[0, 0] = -64 B[0, 1] = 127. / 2 Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("Add", ""), ("Add", "DNNLOWP"), ("Int8Add", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") add = core.CreateOperator( op_type, ['A', 'B'], ['Y'], engine=engine, device_option=gc, broadcast=1, dequantize_output=1, ) net.Proto().op.extend([add]) self.ws.create_blob('A').feed(A, device_option=gc) self.ws.create_blob('B').feed(B, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def test_dnnlowp_elementwise_mul_broadcast(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_) A = A.astype(np.float32) A[0, 0, 0, 0] = min_ A[0, 0, 0, 1] = max_ B = np.round(np.random.rand(4, 5) * 255 - 128).astype(np.float32) B[0, 0] = -128 B[0, 1] = 127 Output = collections.namedtuple("Output", ["Y", "engine"]) outputs = [] engine_list = ['', 'DNNLOWP'] for engine in engine_list: net = core.Net("test_net") mul = core.CreateOperator( "Mul", ['A', 'B'], ['Y'], engine=engine, device_option=gc, broadcast=1, dequantize_output=1, ) net.Proto().op.extend([mul]) self.ws.create_blob('A').feed(A, device_option=gc) self.ws.create_blob('B').feed(B, device_option=gc) self.ws.run(net) outputs.append(Output( Y=self.ws.blobs["Y"].fetch(), engine=engine)) check_quantized_results_close(outputs)
def test_groupwise_dnnlowp_conv_relu_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, gc, dc, ): assume(group == 1 or dilation == 1) X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, True, # group-wise ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("ConvRelu", "DNNLOWP"), ("ConvRelu", "DNNLOWP_16"), ("Int8ConvRelu", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") if "DNNLOWP" in engine: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize]) conv = core.CreateOperator( op_type, ["X_q", "W", "b"], ["Y_q"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, engine=engine, group=group, quantize_groupwise=1, device_option=gc, ) # groupwise quantization only works with static quantization # so we need to set quantization parameters dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc ) net.Proto().op.extend([dequantize]) else: conv = core.CreateOperator( op_type, ["X", "W", "b"], ["Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, engine=engine, group=group, device_option=gc, ) net.Proto().op.extend([conv]) relu = core.CreateOperator( "Relu", ["Y"], ["Y"], engine=engine, device_option=gc ) net.Proto().op.extend([relu]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
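# --- Added sketch (hypothetical helper mirroring what
# dnnlowp_utils.add_quantization_param_args is used for above): group-wise
# quantization requires static output quantization, so the tests derive
# Y_scale/Y_zero_point from the fp32 output of the first (non-DNNLOWP) run and
# attach them to the operator as arguments.
import numpy as np
from caffe2.python import utils

def _add_static_output_qparams(op, y_ref, qmax=255):
    y_min, y_max = float(np.min(y_ref)), float(np.max(y_ref))
    scale = max((y_max - y_min) / qmax, 1e-8)
    zero_point = int(np.clip(np.round(-y_min / scale), 0, qmax))
    op.arg.extend([utils.MakeArgument("Y_scale", scale),
                   utils.MakeArgument("Y_zero_point", zero_point)])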
def test_groupwise_dnnlowp_conv_acc16_outlier( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, prepack_weight, nbits_in_non_outlier, share_col_buffer, gc, dc, ): assume(group == 1 or dilation == 1) assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group X_min = -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min if batch_size != 0: X[0, 0, 0, 1] = X_max W_min = -100 W_max = W_min + 255 W = (np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4 - 2 + W_min + 128) W = np.round(W).astype(np.float32) W[..., 1] = W_min + 128 # "zeros" for g in range(group): W[g * output_channels_per_group, 0, 0, 0] = W_min W[g * output_channels_per_group + 1, 0, 0, 0] = W_max W[g * output_channels_per_group:(g + 1) * output_channels_per_group, ] += g if order == "NCHW": X = utils.NHWC2NCHW(X) W = utils.NHWC2NCHW(W) b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine do_prepack_weight = "DNNLOWP" in engine and prepack_weight if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc) net.Proto().op.extend([quantize]) if do_prepack_weight: X_min = 0 if X.size == 0 else X.min() X_max = 0 if X.size == 0 else X.max() x_q_param = dnnlowp_utils.choose_quantization_params( X_min, X_max) inputs = ["W"] if do_dequantize: inputs += ["b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, nbits_in_non_outlier=nbits_in_non_outlier, engine=engine, group=group, quantize_groupwise=1, in_scale=x_q_param.scale, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else "W", "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, nbits_in_non_outlier=nbits_in_non_outlier, shared_buffer=(1 if share_col_buffer else 0), engine=engine, group=group, quantize_groupwise=1, device_option=gc, ) if do_dequantize or do_prepack_weight: # groupwise quantization only works with static quantization # so we need to set quantization parameters dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc) net.Proto().op.extend([dequantize]) run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine, order, gc, outputs) check_quantized_results_close(outputs)
def _test_dnnlowp_nd_int( self, stride, pad, kernels, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, gc, dc, ): assume(group == 1 or dilation == 1) ndim = len(kernels) X, W, b = generate_convnd_inputs( (stride, ) * ndim, (pad, ) * ndim, kernels, (dilation, ) * ndim, (size, ) * ndim, group, input_channels_per_group, output_channels_per_group, batch_size, order, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP")] for op_type, engine in op_engine_list: net = core.Net("test_net") fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC" if fall_back_to_NCHW: X_nchw = nhwc2nchw(X) W_nchw = nhwc2nchw(W) do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine # If output scale/zp aren't set, they get computed from the ref fp32 op # in DNNLOWP, which isn't possible when we quantize input weights. # Make sure at least one output is collected to compute output # scale/zp. do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0 if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q") net.Proto().op.extend([int8_given_tensor_fill]) # Bias x_q_param = hardcode_scale_zp.choose_quantization_params( X.min(), X.max()) int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) net.Proto().op.extend([int8_bias_tensor_fill]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_q" if do_quantize_weight else "W", "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], strides=[stride] * ndim, kernels=kernels, dilations=[dilation] * ndim, pads=[pad] * (ndim * 2), order="NCHW" if fall_back_to_NCHW else order, dequantize_output=not do_dequantize, engine=engine, group=group, device_option=gc, ) if do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X, device_option=gc) self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() if fall_back_to_NCHW: Y = nchw2nhwc(Y) outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
def test_dnnlowp_conv_acc16_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, weight_quantized, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group # X and W have scale 1, so exactly represented after quantization # This was made sure by having at least one 0 and one 255 for unsigned # 8-bit tensors, and at least one -128 and one 127 for signed 8-bit # tensors. # Since fbgemm_acc16 accumulates to 16 bits, to avoid overflow we use # small numbers except for those 0, 255, -128, and 127 in this test. # We also make sure 255, -128, or 127 are not multiplied together by # putting them in different input channels and the corresponding input # channel in the other matrix is 0. # For example, we put 255 in input channel 1 in X, so we make the # corresponding input channel in W all zeros. X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min X[0, 0, 0, 1] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = ( np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4 - 2 + W_min + 128 ) W = np.round(W).astype(np.float32) W[0, 0, 0, 0] = W_min W[1, 0, 0, 0] = W_max W[..., 1] = W_min + 128 # "zeros" if order == "NCHW": X = utils.NHWC2NCHW(X) W = utils.NHWC2NCHW(W) # No input quantization error in bias b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = ( "DNNLOWP" in engine and weight_quantized and len(outputs) > 0 ) if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine="DNNLOWP", device_option=gc, ) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity ) net.Proto().op.extend([int8_given_tensor_fill]) # Bias x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity ) int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param ) net.Proto().op.extend([int8_bias_tensor_fill]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_q" if do_quantize_weight else "W", "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_dequantize or do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity ) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
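# --- Added illustration of the overflow the comments above guard against:
# vpmaddubsw multiplies uint8 activations by int8 weights and adds adjacent
# pairs into int16, and fbgemm's acc16 kernels keep accumulating in 16 bits,
# so two large products already overflow.
import numpy as np

a = np.array([255, 255], dtype=np.int32)  # uint8 activations at the top of the range
w = np.array([127, 127], dtype=np.int32)  # int8 weights at the top of the range
pairwise_sum = int((a * w).sum())         # 64770, what vpmaddubsw would produce
assert pairwise_sum > np.iinfo(np.int16).max  # int16 tops out at 32767
# hence the tests keep magnitudes small and place 255 / 127 / -128 in channels
# whose counterpart in the other operand is zero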
def test_dnnlowp_depthwise_3x3x3_conv( self, stride, size, group, batch_size, prepack_weight, fuse_relu, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): pad = 1 kernel = 3 dilation = 1 input_channels_per_group = 1 output_channels_per_group = 1 order = "NHWC" X, W, b = generate_convnd_inputs( (stride, ) * 3, (pad, ) * 3, (kernel, ) * 3, (dilation, ) * 3, (size, ) * 3, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op = "ConvRelu" if fuse_relu else "Conv" op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") # TODO: no fall back to NCHW fall_back_to_NCHW = "DNNLOWP" not in engine if fall_back_to_NCHW: X_nchw = nhwc2nchw(X) W_nchw = nhwc2nchw(W) do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) if do_prepack_weight: x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity) inputs = ["W"] if do_dequantize: inputs += ["b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, ["X_q" if do_quantize else "X", "W", "b"], ["Y_q" if do_dequantize else "Y"], strides=[stride] * 3, kernels=[kernel] * 3, dilations=[dilation] * 3, pads=[pad] * (3 * 2), order="NCHW" if fall_back_to_NCHW else order, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_dequantize or do_prepack_weight: dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X, device_option=gc) self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() if fall_back_to_NCHW: Y = nchw2nhwc(Y) outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_batch_matmul_int_constant_B( self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc ): batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2))) batch_dims_A = batch_dims[-C_1:] batch_dims_B = batch_dims[-C_2:] A = np.zeros(batch_dims_A + (m, k)).astype(np.float32) B = np.zeros(batch_dims_B + (n, k)).astype(np.float32) if np.prod(batch_dims) > 0: for index in np.ndindex(batch_dims_A): # When both input and output are float, each input of the batch has # scale 1 but with a different offset, so input-wise quantization # shouldn't have any input quantization error # A_min = -77 if (A_quantized or out_quantized) else -77 + i A_min = -77 A_max = A_min + 255 A[index] = np.round(np.random.rand(m, k) * 255 + A_min) # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw # when multiplied with B_min and B_max A[index][:, 0] = A_min A[index][0, 1] = A_max i = 0 for index in np.ndindex(batch_dims_B): # When weight is quantized in a lazy manner, each input of the batch has # scale 1 but with a different offset, so input-wise quantization # shouldn't have any input quantization error. B_min = -100 if B_quantized else -100 + i # B_min = -100 B_max = B_min + 255 B[index] = np.round(np.random.rand(n, k) * 255 + B_min) B[index][0, 0] = B_min B[index][1, 0] = B_max if C_1 > C_2: # A has more dims for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]): avoid_vpmaddubsw_overflow_fc( m, k, n, A[outer_index] if C_2 == 0 else A[outer_index + index], A_min, A_max, B[index], B_min, B_max, ) else: avoid_vpmaddubsw_overflow_fc( m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max ) i += 1 for trans_a, trans_b in product([0, 1], [0, 1]): Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("BatchMatMul", ""), ("BatchMatMul", "DNNLOWP"), ("Int8BatchMatMul", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize_A = "DNNLOWP" in engine and A_quantized do_quantize_B = "DNNLOWP" in engine and B_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize_A: quantize_A = core.CreateOperator( "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize_A]) if do_quantize_B: int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill( B if trans_b else B.swapaxes(-1, -2), "B_q" ) net.Proto().op.extend([int8_given_tensor_fill]) batch_matmul = core.CreateOperator( op_type, ["A_q" if do_quantize_A else "A", "B_q" if do_quantize_B else "B"], ["Y_q" if do_dequantize else "Y"], trans_a=trans_a, trans_b=trans_b, broadcast=True, constant_B=True, dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) if do_quantize_B: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( batch_matmul, outputs[0][0] ) net.Proto().op.extend([batch_matmul]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("A").feed( A.swapaxes(-1, -2) if trans_a else A, device_option=gc ) self.ws.create_blob("B").feed( B if trans_b else B.swapaxes(-1, -2), device_option=gc ) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine) ) if np.prod(batch_dims) > 0: check_quantized_results_close(outputs)
def test_dnnlowp_elementwise_mul_int(self, N, in_quantized, out_quantized, in_place, gc, dc): # FIXME: DNNLOWP Mul doesn't support in-place operation and # dequantize_output=1 at the same time if in_place[0] or in_place[1]: in_quantized = True out_quantized = True # All inputs have scale 1, so exactly represented after quantization min_ = -100 max_ = min_ + 255 A = np.round(np.random.rand(N) * (max_ - min_) + min_) A = A.astype(np.float32) A[0] = min_ A[1] = max_ B = np.round(np.random.rand(N) * 255 - 128).astype(np.float32) B[0] = -128 B[1] = 127 Output = collections.namedtuple("Output", ["Y", "engine"]) outputs = [] engine_list = ['', 'DNNLOWP'] for engine in engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize_A = core.CreateOperator( "Quantize", ['A'], ['A_q'], engine=engine, device_option=gc, ) net.Proto().op.extend([quantize_A]) quantize_B = core.CreateOperator( "Quantize", ['B'], ['B_q'], engine=engine, device_option=gc, ) net.Proto().op.extend([quantize_B]) out = 'Y' if in_place[0]: out = 'A' elif in_place[1]: out = 'B' mul = core.CreateOperator( "Mul", ['A_q', 'B_q'] if do_quantize else ['A', 'B'], [(out + '_q') if do_dequantize else out], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) net.Proto().op.extend([mul]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", [out + '_q'], [out], engine=engine, device_option=gc, ) net.Proto().op.extend([dequantize]) self.ws.create_blob('A').feed(A, device_option=gc) self.ws.create_blob('B').feed(B, device_option=gc) self.ws.run(net) outputs.append(Output(Y=self.ws.blobs[out].fetch(), engine=engine)) check_quantized_results_close(outputs)
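# --- Added sketch (assumption, not DNNLOWP internals): Mul's real result is
# s_a*(q_a-z_a) * s_b*(q_b-z_b), so with both scales equal to 1 -- guaranteed
# by the integer-valued A and B above -- the product is exactly representable
# and only the final requantization to Y's parameters can round.
import numpy as np

def _quantized_mul(q_a, s_a, z_a, q_b, s_b, z_b, s_y, z_y):
    real = (s_a * (q_a.astype(np.int32) - z_a)) * (s_b * (q_b.astype(np.int32) - z_b))
    return np.clip(np.round(real / s_y) + z_y, 0, 255).astype(np.uint8)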
def test_dnnlowp_spatial_bn_int( self, size, input_channels, output_channels, batch_size, order, in_quantized, out_quantized, gc, dc, ): X_min = -77 X_max = X_min + 255 X = np.round(np.random.rand(batch_size, size, size, input_channels) * (X_max - X_min) + X_min).astype(np.float32) X[0, 0, 0, 0] = X_min X[0, 0, 0, 1] = X_max epsilon = np.abs(np.random.rand()) scale = np.random.rand(input_channels).astype(np.float32) bias = np.random.rand(input_channels).astype(np.float32) mean = np.random.rand(input_channels).astype(np.float32) var = np.random.rand(input_channels).astype(np.float32) if order == "NCHW": X = utils.NHWC2NCHW(X) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("SpatialBN", ""), ("SpatialBN", "DNNLOWP"), ("Int8SpatialBN", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine) net.Proto().op.extend([quantize]) bn = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "scale", "bias", "mean", "var" ], ["Y_q" if do_dequantize else "Y"], is_test=True, epsilon=epsilon, order=order, engine=engine, dequantize_output=not do_dequantize, ) net.Proto().op.extend([bn]) if "DNNLOWP" in engine: dnnlowp_utils.add_quantization_param_args(bn, outputs[0][0]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("scale").feed(scale, device_option=gc) self.ws.create_blob("bias").feed(bias, device_option=gc) self.ws.create_blob("mean").feed(mean, device_option=gc) self.ws.create_blob("var").feed(var, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def test_dnnlowp_fully_connected_int( self, input_channels, output_channels, batch_size, in_quantized, out_quantized, weight_quantized, prepack_weight, preserve_activation_sparsity, preserve_weight_sparsity, fuse_relu, output_packed_bias, gc, dc, ): # X and W have scale 1, so exactly represented after quantization X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X = np.round( np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min ) X = X.astype(np.float32) # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw # when multiplied with W_min and W_max X[:, 0] = X_min if batch_size != 0: X[0, 1] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = np.round( np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min ) W = W.astype(np.float32) W[0, 0] = W_min W[1, 0] = W_max # Make sure we won't have overflows from vpmaddubsw instruction used in # fbgemm avoid_vpmaddubsw_overflow_fc( batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max, ) b = np.random.randn(output_channels).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [("FC", "")] if fuse_relu: op_engine_list += [("Int8FCRelu", "DNNLOWP")] else: op_engine_list += [ ("FC", "DNNLOWP"), ("FC", "DNNLOWP_16"), ("Int8FC", "DNNLOWP"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = ( engine == "DNNLOWP" and weight_quantized and len(outputs) > 0 ) do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) X_min = 0 if X.size == 0 else X.min() X_max = 0 if X.size == 0 else X.max() x_q_param = dnnlowp_utils.choose_quantization_params( X_min, X_max, preserve_activation_sparsity ) w_q_param = None if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity ) init_net.Proto().op.extend([int8_given_tensor_fill]) # Bias int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param ) init_net.Proto().op.extend([int8_bias_tensor_fill]) if do_prepack_weight: inputs = ["W_q" if do_quantize_weight else "W"] if do_dequantize: inputs += ["b_q" if do_quantize_weight else "b"] pack = core.CreateOperator( "Int8FCPackWeight", inputs, ["W_packed", "B_q32"] if do_dequantize and output_packed_bias else ["W_packed"], preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) fc = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else ("W_q" if do_quantize_weight else "W"), "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], dequantize_output=not do_dequantize, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, device_option=gc, ) if do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( fc, outputs[0][0], preserve_activation_sparsity ) net.Proto().op.extend([fc]) if fuse_relu and "DNNLOWP" not in engine: net.Relu(["Y"], "Y") if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc ) net.Proto().op.extend([dequantize]) run_conv_or_fc( self, init_net, net, X, W, b, op_type, engine, None, gc, outputs ) if output_packed_bias and do_prepack_weight and do_dequantize: bias_int32 = self.ws.blobs["B_q32"].fetch() if do_quantize_weight: np.testing.assert_equal(bias_int32[0], np.round(b / (x_q_param.scale * w_q_param.scale))) np.testing.assert_equal(bias_int32[0].dtype, np.int32) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
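# --- Added sketch of the bias packing the B_q32 assertion above checks: with
# activation scale s_x and weight scale s_w, the bias folded into the int32
# accumulator is round(b / (s_x * s_w)). The helper name is illustrative, not
# the fbgemm API.
import numpy as np

def _quantize_bias(b, x_scale, w_scale):
    return np.round(b / (x_scale * w_scale)).astype(np.int32)

assert _quantize_bias(np.random.randn(8).astype(np.float32), 0.1, 0.05).dtype == np.int32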
def test_dnnlowp_batch_matmul_int(self, m, n, k, batch_size, gc, dc): # A and B have scale 1, so exactly represented after quantization A_min = -77 A_max = A_min + 255 A = np.round(np.random.rand(batch_size, m, k) * 255 + A_min) A = A.astype(np.float32) # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw # when multiplied with B_min and B_max A[0, :, 0] = A_min A[0, 0, 1] = A_max B_min = -100 B_max = B_min + 255 B = np.round(np.random.rand(batch_size, n, k) * 255 + B_min) B = B.astype(np.float32) B[0, 0, 0] = B_min B[0, 1, 0] = B_max for i in range(batch_size): avoid_vpmaddubsw_overflow_fc( m, k, n, A[i,], A_min, A_max, B[i,], B_min, B_max ) for trans_a, trans_b in product([0, 1], [0, 1]): Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("BatchMatMul", ""), ("BatchMatMul", "DNNLOWP"), ("BatchMatMul", "DNNLOWP_16"), ("Int8BatchMatMul", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") if "DNNLOWP" in engine: quantize_A = core.CreateOperator( "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize_A]) quantize_B = core.CreateOperator( "Quantize", ["B"], ["B_q"], engine=engine, device_option=gc ) net.Proto().op.extend([quantize_B]) batch_matmul = core.CreateOperator( op_type, [ "A_q" if "DNNLOWP" in engine else "A", "B_q" if "DNNLOWP" in engine else "B", ], ["Y_q" if "DNNLOWP" in engine else "Y"], trans_a=trans_a, trans_b=trans_b, engine=engine, device_option=gc, ) net.Proto().op.extend([batch_matmul]) if "DNNLOWP" in engine: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("A").feed( np.transpose(A, (0, 2, 1)) if trans_a else A, device_option=gc ) self.ws.create_blob("B").feed( B if trans_b else np.transpose(B, (0, 2, 1)), device_option=gc ) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine) ) check_quantized_results_close(outputs)
def test_dnnlowp_depthwise_3x3_conv( self, stride, size, group, batch_size, prepack_weight, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, relu, gc, dc, ): pad = 1 kernel = 3 dilation = 1 input_channels_per_group = 1 output_channels_per_group = 1 order = "NHWC" X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] if relu: op_engine_list = [ ("Conv", ""), ("ConvRelu", "DNNLOWP"), ("Int8ConvRelu", "DNNLOWP"), ] else: op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP"), ("Int8Conv", "DNNLOWP"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine do_dequantize = "DNNLOWP" in engine do_prepack_weight = engine == "DNNLOWP" and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) if do_prepack_weight: x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity) inputs = ["W"] if do_dequantize: inputs += ["b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, ["X_q" if do_quantize else "X", "W", "b"], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_dequantize or do_prepack_weight: dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) elif relu: relu_op = core.CreateOperator("Relu", ["Y"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([relu_op]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_rowwise_dnnlowp_fully_connected_int( self, input_channels, output_channels, batch_size, in_quantized, out_quantized, gc, dc, ): print("@given M ", batch_size, " K ", input_channels, " N ", output_channels) print("@given in_quantized ", in_quantized, " out_quantized ", out_quantized) # X has scale 1, so exactly represented after quantization X_min = -77 X_max = X_min + 255 X = np.round( np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min) X = X.astype(np.float32) # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw # when multiplied with W_min and W_max X[:, 0:2] = X_min X[0, 2] = X_max # Each row of W has scale 1 but with different offset, so row-wise # quantization shouldn't have any input quantization error. W = np.zeros((output_channels, input_channels)) W = W.astype(np.float32) for i in range(output_channels): W_min = -100 + i W_max = W_min + 255 W[i, :] = np.round( np.random.rand(input_channels) * (W_max - W_min) + W_min) W[i, 0] = W_min W[i, 1] = W_max # Make sure we won't have overflows from vpmaddubsw instruction used in # fbgemm avoid_vpmaddubsw_overflow_fc( batch_size, input_channels, 1, X, X_min, X_max, W[i:i + 1, ], W_min, W_max, ) b = np.random.randn(output_channels).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("FC", ""), ("FC", "DNNLOWP_ROWWISE"), ("FC", "DNNLOWP_ROWWISE_16"), ("Int8FC", "DNNLOWP_ROWWISE"), ("Int8FCRowWise", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) fc = core.CreateOperator( op_type, ["X_q" if do_quantize else "X", "W", "b"], ["Y_q" if do_dequantize else "Y"], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) net.Proto().op.extend([fc]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
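# --- Added sketch (hypothetical helper) of the row-wise scheme that
# DNNLOWP_ROWWISE exercises: each output channel (row of W) gets its own scale
# and zero point, which is why the test can give every row a different offset
# and still expect no weight quantization error.
import numpy as np

def _rowwise_quantize(w, qmin=-128, qmax=127):
    row_min, row_max = w.min(axis=1), w.max(axis=1)
    scales = np.maximum((row_max - row_min) / (qmax - qmin), 1e-8)
    zps = np.round(qmin - row_min / scales).astype(np.int32)
    w_q = np.clip(np.round(w / scales[:, None]) + zps[:, None], qmin, qmax)
    return w_q.astype(np.int8), scales, zps

# a row spanning exactly 256 consecutive integers quantizes losslessly at scale 1
w = np.arange(-100.0, 156.0, dtype=np.float32).reshape(1, -1)
w_q, scales, zps = _rowwise_quantize(w)
assert np.allclose(scales[:, None] * (w_q.astype(np.float32) - zps[:, None]), w)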
def test_dnnlowp_conv_acc16_outlier( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, weight_quantized, prepack_weight, nbits_in_non_outlier, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group if nbits_in_non_outlier == 0: X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) else: X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min X[0, 0, 0, 1] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = ( np.random.rand( output_channels, kernel, kernel, input_channels_per_group ) * 4 - 2 + W_min + 128 ) W = np.round(W).astype(np.float32) W[0, 0, 0, 0] = W_min W[1, 0, 0, 0] = W_max W[..., 1] = W_min + 128 # No input quantization error in bias b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = "DNNLOWP" in engine and weight_quantized do_prepack_weight = "DNNLOWP" in engine and prepack_weight if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine="DNNLOWP", device_option=gc, ) net.Proto().op.extend([quantize]) x_q_param = dnnlowp_utils.choose_quantization_params( X.min(), X.max(), preserve_activation_sparsity ) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity ) init_net.Proto().op.extend([int8_given_tensor_fill]) # Bias int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param ) init_net.Proto().op.extend([int8_bias_tensor_fill]) if do_prepack_weight: inputs = ["W_q" if do_quantize_weight else "W"] if do_dequantize: inputs += ["b_q" if do_quantize_weight else "b"] pack = core.CreateOperator( "Int8ConvPackWeight", inputs, ["W_packed"], group=group, nbits_in_non_outlier=nbits_in_non_outlier, preserve_weight_sparsity=preserve_weight_sparsity, in_scale=x_q_param.scale, engine=engine, ) init_net.Proto().op.extend([pack]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_packed" if do_prepack_weight else ("W_q" if do_quantize_weight else "W"), "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, nbits_in_non_outlier=nbits_in_non_outlier, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_dequantize or do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity ) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(init_net) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_group_norm(self, N, G, K, H, W, order, in_quantized, out_quantized, weight_quantized, gc, dc): C = G * K X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0 if order == "NHWC": X = np.transpose(X, [0, 2, 3, 1]) gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0 beta = np.random.randn(C).astype(np.float32) - 0.5 Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [("GroupNorm", ""), ("GroupNorm", "DNNLOWP"), ("Int8GroupNorm", "DNNLOWP")] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = (engine == "DNNLOWP" and weight_quantized and len(outputs) > 0) if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, gamma_q_param = ( dnnlowp_utils.create_int8_given_tensor_fill( gamma, "gamma_q")) net.Proto().op.extend([int8_given_tensor_fill]) X_q_param = hardcode_scale_zp.choose_quantization_params( X.min(), X.max()) int8_bias_tensor_fill = ( dnnlowp_utils.create_int8_bias_tensor_fill( beta, "beta_q", X_q_param, gamma_q_param)) net.Proto().op.extend([int8_bias_tensor_fill]) group_norm = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "gamma_q" if do_quantize_weight else "gamma", "beta_q" if do_quantize_weight else "beta" ], ["Y_q" if do_dequantize else "Y"], dequantize_output=0 if do_dequantize else 1, group=G, order=order, is_test=True, engine=engine, device_option=gc, ) if do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( group_norm, outputs[0][0]) net.Proto().op.extend([group_norm]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc, ) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("gamma").feed(gamma, device_option=gc) self.ws.create_blob("beta").feed(beta, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs, atol_scale=2.0)
def test_dnnlowp_fully_connected_int( self, input_channels, output_channels, batch_size, in_quantized, out_quantized, weight_quantized, gc, dc, ): # X and W have scale 1, so exactly represented after quantization X_min = -77 X_max = X_min + 255 X = np.round( np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min) X = X.astype(np.float32) # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw # when multiplied with W_min and W_max X[:, 0] = X_min X[0, 1] = X_max W_min = -100 W_max = W_min + 255 W = np.round( np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min) W = W.astype(np.float32) W[0, 0] = W_min W[1, 0] = W_max # Make sure we won't have overflows from vpmaddubsw instruction used in # fbgemm avoid_vpmaddubsw_overflow_fc( batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max, ) b = np.random.randn(output_channels).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] op_engine_list = [ ("FC", ""), ("FC", "DNNLOWP"), ("FC", "DNNLOWP_16"), ("Int8FC", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized do_quantize_weight = (engine == "DNNLOWP" and weight_quantized and len(outputs) > 0) if do_quantize: quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine, device_option=gc) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q") net.Proto().op.extend([int8_given_tensor_fill]) # Bias x_q_param = hardcode_scale_zp.choose_quantization_params( X.min(), X.max()) int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) net.Proto().op.extend([int8_bias_tensor_fill]) fc = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_q" if do_quantize_weight else "W", "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], dequantize_output=not do_dequantize, engine=engine, device_option=gc, ) if do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0]) net.Proto().op.extend([fc]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) outputs.append( Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)) check_quantized_results_close(outputs)
def test_groupwise_dnnlowp_conv_acc16_outlier( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, nbits_in_non_outlier, share_col_buffer, gc, dc, ): if group > 1: dilation = 1 assume(size >= dilation * (kernel - 1) + 1) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group if nbits_in_non_outlier == 0: X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, True, # group-wise ) else: X_min = -77 X_max = X_min + 255 X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min X = np.round(X).astype(np.float32) X[..., 0] = X_min X[0, 0, 0, 1] = X_max W_min = -100 W_max = W_min + 255 W = ( np.random.rand( output_channels, kernel, kernel, input_channels_per_group ) * 4 - 2 + W_min + 128 ) W = np.round(W).astype(np.float32) W[..., 1] = W_min + 128 # "zeros" for g in range(group): W[g * output_channels_per_group, 0, 0, 0] = W_min W[g * output_channels_per_group + 1, 0, 0, 0] = W_max W[ g * output_channels_per_group : (g + 1) * output_channels_per_group, ] += g if order == "NCHW": X = nhwc2nchw(X) W = nhwc2nchw(W) # No input quantization error in bias b = np.round(np.random.randn(output_channels)).astype(np.float32) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP_ACC16"), ("Int8Conv", "DNNLOWP_ACC16"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc ) net.Proto().op.extend([quantize]) conv = core.CreateOperator( op_type, ["X_q" if do_quantize else "X", "W", "b"], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, nbits_in_non_outlier=nbits_in_non_outlier, shared_buffer=(1 if share_col_buffer else 0), engine=engine, group=group, quantize_groupwise=1, device_option=gc, ) if do_dequantize: # groupwise quantization only works with static quantization # so we need to set quantization parameters dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0]) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator( "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc ) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs)
def test_dnnlowp_conv_int( self, stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, in_quantized, out_quantized, weight_quantized, share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc, ): assume(group == 1 or dilation == 1) X, W, b = generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, ) Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) outputs = [] op_engine_list = [ ("Conv", ""), ("Conv", "DNNLOWP"), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP"), ] for op_type, engine in op_engine_list: net = core.Net("test_net") do_quantize = "DNNLOWP" in engine and in_quantized do_dequantize = "DNNLOWP" in engine and out_quantized # If output scale/zp aren't set, they get computed from the ref fp32 op # in DNNLOWP, which isn't possible when we quantize input weights. # Make sure at least one output is collected to compute output # scale/zp. do_quantize_weight = (engine == "DNNLOWP" and weight_quantized and len(outputs) > 0) if do_quantize: quantize = core.CreateOperator( "Quantize", ["X"], ["X_q"], preserve_activation_sparsity=preserve_activation_sparsity, engine=engine, device_option=gc, ) net.Proto().op.extend([quantize]) if do_quantize_weight: int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill( W, "W_q", preserve_weight_sparsity) net.Proto().op.extend([int8_given_tensor_fill]) # Bias x_q_param = hardcode_scale_zp.choose_quantization_params( X.min(), X.max()) int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill( b, "b_q", x_q_param, w_q_param) net.Proto().op.extend([int8_bias_tensor_fill]) conv = core.CreateOperator( op_type, [ "X_q" if do_quantize else "X", "W_q" if do_quantize_weight else "W", "b_q" if do_quantize_weight else "b", ], ["Y_q" if do_dequantize else "Y"], stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order, dequantize_output=not do_dequantize, shared_buffer=(1 if share_col_buffer else 0), preserve_activation_sparsity=preserve_activation_sparsity, preserve_weight_sparsity=preserve_weight_sparsity, engine=engine, group=group, device_option=gc, ) if do_quantize_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each # batch, so here we provide the range of output observed from # fp32 reference implementation dnnlowp_utils.add_quantization_param_args( conv, outputs[0][0], preserve_activation_sparsity) net.Proto().op.extend([conv]) if do_dequantize: dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc) net.Proto().op.extend([dequantize]) self.ws.create_blob("X").feed(X, device_option=gc) self.ws.create_blob("W").feed(W, device_option=gc) self.ws.create_blob("b").feed(b, device_option=gc) self.ws.run(net) Y = self.ws.blobs["Y"].fetch() outputs.append( Output(Y=Y, op_type=op_type, engine=engine, order=order)) check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_average_pool(
    self,
    ndim,
    stride,
    pad,
    kernel,
    size,
    input_channels,
    batch_size,
    order,
    in_quantized,
    gc,
    dc,
):
    kernel = 2  # Only kernel size 2 is supported, so override the drawn value
    assume(kernel <= size)
    assume(pad < kernel)

    C = input_channels
    N = batch_size
    strides = (stride,) * ndim
    pads = (pad,) * (ndim * 2)
    kernels = (kernel,) * ndim
    sizes = (size,) * ndim

    # X has scale 1, so no input quantization error
    min_ = -100
    max_ = min_ + 255
    if order == "NCHW":
        X = np.round(np.random.rand(*((N, C) + sizes)) * (max_ - min_) + min_)
        X = X.astype(np.float32)
        X[(0,) * (ndim + 2)] = min_
        X[(0,) * (ndim + 1) + (1,)] = max_
    elif order == "NHWC":
        X = np.round(np.random.rand(*((N,) + sizes + (C,))) * (max_ - min_) + min_)
        X = X.astype(np.float32)
        X[(0,) * (ndim + 2)] = min_
        X[(0, 1) + (0,) * ndim] = max_

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("AveragePool", ""),
        ("AveragePool", "DNNLOWP"),
        ("Int8AveragePool", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        avg_pool = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X"],
            ["Y_q" if engine == "DNNLOWP" else "Y"],
            strides=strides,
            kernels=kernels,
            pads=pads,
            order=order,
            engine=engine,
            device_option=gc,
        )
        net.Proto().op.extend([avg_pool])

        if engine == "DNNLOWP":
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
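
# Why "X has scale 1" above removes input quantization error (sketch, not
# part of the test): with min_ = -100 and max_ = min_ + 255, the chosen scale
# is (max_ - min_) / 255 = 1, so every integer-valued entry of X maps to an
# int8 code and back with no rounding; the only error left comes from the
# pooling division itself.
def scale_one_roundtrip_sketch():
    min_, max_ = -100.0, 155.0
    scale = (max_ - min_) / 255  # == 1.0
    zero_point = int(np.round(-min_ / scale))  # == 100
    x = np.round(np.random.rand(10) * (max_ - min_) + min_)
    x_q = x / scale + zero_point  # codes land in [0, 255]
    assert np.array_equal((x_q - zero_point) * scale, x)  # exact round trip
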
def test_groupwise_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after
    # quantization. We ensure this by placing at least one 0 and one 255 in
    # unsigned 8-bit tensors, and at least one -128 and one 127 in signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates to 16 bits, we use small numbers in this
    # test (except for those 0, 255, -128, and 127 values) to avoid overflow.
    # We also make sure 255, -128, or 127 are never multiplied together by
    # putting them in different input channels and zeroing out the
    # corresponding input channel of the other matrix. For example, 255 goes
    # into input channel 1 of X, so the corresponding input channel of W is
    # all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        W[g * output_channels_per_group, 0, 0, 0] = W_min
        W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
        if not preserve_weight_sparsity:
            W[
                g * output_channels_per_group : (g + 1) * output_channels_per_group,
            ] += g

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # Group-wise quantization only works with static quantization,
            # so we need to set the output quantization parameters.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, None, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
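
# A back-of-the-envelope check (not part of the test) of the overflow
# reasoning above: with 16-bit accumulation, a running sum of 8-bit products
# must stay within [-2^15, 2^15 - 1]. The band width of 4 mirrors the "* 4"
# used when drawing X and W above; the exact quantized ranges differ, so
# treat this as illustrative only.
def acc16_headroom_sketch(kernel, input_channels_per_group, q_band=4):
    reduction_len = kernel * kernel * input_channels_per_group
    worst_sum = reduction_len * q_band * q_band  # |x_q|, |w_q| <= q_band
    return worst_sum < 2 ** 15  # True for the sizes this test draws
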
def test_dnnlowp_max_pool(
    self,
    stride,
    pad,
    kernel,
    size,
    input_channels,
    batch_size,
    order,
    in_quantized,
    gc,
    dc,
):
    assume(kernel <= size)
    assume(pad < kernel)

    C = input_channels
    N = batch_size
    H = W = size

    min_ = -10
    max_ = 20
    if order == "NCHW":
        X = np.round(np.random.rand(N, C, H, W) * (max_ - min_) + min_)
    elif order == "NHWC":
        X = np.round(np.random.rand(N, H, W, C) * (max_ - min_) + min_)
    X = X.astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("MaxPool", ""),
        ("MaxPool", "DNNLOWP"),
        ("Int8MaxPool", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        max_pool = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X"],
            ["Y_q" if engine == "DNNLOWP" else "Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            order=order,
            engine=engine,
            device_option=gc,
        )
        net.Proto().op.extend([max_pool])

        if engine == "DNNLOWP":
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    # Y_i = max(X_j) so the only error is in quantization of inputs
    check_quantized_results_close(outputs, ref=X)
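
# Why max pooling adds no quantization error of its own (sketch, not part of
# the test): q(x) = round(x / scale) + zero_point is monotonic, so taking the
# max over quantized codes selects the same element as taking the max over
# the real values, and the output simply inherits the input's parameters.
def max_commutes_with_quantization_sketch(x, scale, zero_point):
    x_q = np.round(x / scale) + zero_point
    return x_q.max() == np.round(x.max() / scale) + zero_point
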
def test_groupwise_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    prepack_weight,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        groupwise_quantization=True,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            x_q_param = hardcode_scale_zp.choose_quantization_params(
                X.min(), X.max()
            )
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                quantize_groupwise=1,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            # Group-wise quantization only works with static quantization,
            # so we need to set the output quantization parameters.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
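
# A sketch (not part of the test) of the group-wise idea that
# quantize_groupwise=1 exercises: each group of output channels gets its own
# weight (scale, zero_point) chosen from that group's value range, instead of
# one set of parameters for the whole tensor. Hypothetical helper; the weight
# layout follows the NHWC convention used in these tests.
def groupwise_weight_scales_sketch(W, group):
    # W: (output_channels, kernel, kernel, input_channels_per_group)
    per_group = np.split(W, group, axis=0)
    return [(g.min(), g.max(), (g.max() - g.min()) / 255) for g in per_group]
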
def test_dnnlowp_concat_int(self, dim1, dim2, in_quantized, out_quantized, gc, dc):
    # X has scale 1, so exactly represented after quantization
    min_ = -100
    max_ = min_ + 255
    X = np.round(np.random.rand(dim1, dim2) * (max_ - min_) + min_)
    X = X.astype(np.float32)
    X[0, 0] = min_
    X[0, 1] = max_

    # Y has scale 1/2, so exactly represented after quantization
    Y = np.round(np.random.rand(dim1, dim2) * 255 / 2 - 64)
    Y = Y.astype(np.float32)
    Y[0, 0] = -64
    Y[0, 1] = 127.0 / 2

    Output = collections.namedtuple("Output", ["Z", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("Concat", ""),
        ("Concat", "DNNLOWP"),
        ("Int8Concat", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize_x = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            quantize_y = core.CreateOperator(
                "Quantize", ["Y"], ["Y_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize_x, quantize_y])

        concat = core.CreateOperator(
            op_type,
            ["X_q", "Y_q"] if do_quantize else ["X", "Y"],
            ["Z_q" if do_dequantize else "Z", "split"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
            axis=0,
        )
        net.Proto().op.extend([concat])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Z_q"], ["Z"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("Y").feed(Y, device_option=gc)
        self.ws.create_blob("split")
        self.ws.run(net)
        outputs.append(
            Output(Z=self.ws.blobs["Z"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
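
# A sketch (not part of the test) of what Int8Concat must do when its inputs
# carry different quantization parameters, as X (scale 1) and Y (scale 1/2)
# do above: requantize each input to the common output (scale, zero_point)
# before copying. Hypothetical helper; the real DNNLOWP kernels do this with
# integer-only arithmetic.
def requantize_sketch(q, in_scale, in_zp, out_scale, out_zp):
    real = (q.astype(np.float32) - in_zp) * in_scale  # dequantize
    q_out = np.round(real / out_scale) + out_zp  # quantize to output params
    return np.clip(q_out, 0, 255).astype(np.uint8)
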