def test_convolution_separate_stride_pad_layout(
        self, op_type, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r,
        kernel, size, input_channels, output_channels, batch_size, engine,
        use_bias, gc, dc):
    """Run ``op_type`` in NCHW and NHWC order and compare the two outputs.

    Random data is generated once in NHWC layout. For the NCHW run the input
    and the filter are converted with ``utils.NHWC2NCHW``. The NHWC result is
    converted to NCHW and must match the NCHW result (atol/rtol 1e-4).
    """
    x_nhwc = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w_nhwc = np.random.rand(
        output_channels, kernel, kernel, input_channels
    ).astype(np.float32) - 0.5
    bias = np.random.rand(output_channels).astype(np.float32) - 0.5

    op_inputs = ["X", "w", "b"] if use_bias else ["X", "w"]
    results = {}
    for order in ["NCHW", "NHWC"]:
        op = core.CreateOperator(
            op_type,
            op_inputs,
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            kernel=kernel,
            pad_t=pad_t,
            pad_l=pad_l,
            pad_b=pad_b,
            pad_r=pad_r,
            order=order,
            engine=engine,
            device_option=gc,
        )
        if order == "NCHW":
            x_fed, w_fed = utils.NHWC2NCHW(x_nhwc), utils.NHWC2NCHW(w_nhwc)
        else:
            x_fed, w_fed = x_nhwc, w_nhwc
        # The bias blob is fed even when the operator does not consume it.
        for blob_name, value in (("X", x_fed), ("w", w_fed), ("b", bias)):
            self.ws.create_blob(blob_name).feed(value, device_option=gc)
        self.ws.run(op)
        results[order] = self.ws.blobs["Y"].fetch()

    np.testing.assert_allclose(
        results["NCHW"],
        utils.NHWC2NCHW(results["NHWC"]),
        atol=1e-4,
        rtol=1e-4,
    )
def test_conv_separate_stride_pad_gradients(
        self, stride_h, stride_w, pad_h, pad_w, kernel, size,
        input_channels, output_channels, batch_size, order, engine,
        shared_buffer, use_bias, deformable_group, gc, dc):
    """Device and gradient checks for DeformConv with separate strides/pads.

    The operator is created with ``pad_t = pad_b = pad_h`` and
    ``pad_l = pad_r = pad_w``, i.e. the padding is applied on both sides of
    each spatial dimension. Configurations that the operator is expected to
    reject (kernel larger than the padded input, or channel counts not
    divisible by ``deformable_group``) must raise ``RuntimeError``; all other
    configurations go through the device check and a gradient check for
    every input.
    """
    op = core.CreateOperator(
        "DeformConv",
        ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"],
        ["Y"],
        stride_h=stride_h,
        stride_w=stride_w,
        pad_t=pad_h,
        pad_l=pad_w,
        pad_b=pad_h,
        pad_r=pad_w,
        kernel=kernel,
        order=order,
        engine=engine,
        shared_buffer=int(shared_buffer),
        deformable_group=deformable_group,
    )
    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    output_size = _conv_2d_output_size(size, kernel, pad_h, pad_w, 1,
                                       stride_h, stride_w)
    o = _conv_2d_random_offsets(batch_size, kernel, output_size,
                                deformable_group)
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5
    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        w = utils.NHWC2NCHW(w)

    inputs = [X, o, w, b] if use_bias else [X, o, w]

    # Error handling path.
    # Padding is applied on both sides of each dimension, so the padded
    # extent is size + 2 * pad (matching the check in test_conv_gradients).
    kernel_too_large = (
        size + 2 * pad_h < kernel or size + 2 * pad_w < kernel
    )
    bad_grouping = (
        input_channels % deformable_group != 0
        or output_channels % deformable_group != 0
    )
    if kernel_too_large or bad_grouping:
        with self.assertRaises(RuntimeError):
            self.assertDeviceChecks(dc, op, inputs, [0])
        return

    self.assertDeviceChecks(dc, op, inputs, [0])
    for i in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, i, [0])
def test_conv_gradients(
        self, stride, pad, kernel, dilation, size, input_channels,
        output_channels, batch_size, order, engine, use_bias,
        deformable_group, gc, dc):
    """Device and gradient checks for DeformConv with uniform stride/pad.

    Configurations where the dilated kernel exceeds the padded input, or
    where a channel count is not divisible by ``deformable_group``, must
    raise ``RuntimeError``. Everything else is device-checked and then
    gradient-checked for every input.
    """
    dilated_kernel = dilation * (kernel - 1) + 1

    if gc.device_type == caffe2_pb2.CUDA and engine == 'CUDNN':
        assume(_cudnn_supports(dilation=(dilation > 1),
                               nhwc=(order == 'NHWC')))
    assume(engine != "MKLDNN" or use_bias is True)

    op = core.CreateOperator(
        "DeformConv",
        ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        dilation=dilation,
        pad=pad,
        order=order,
        engine=engine,
        deformable_group=deformable_group,
    )

    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    output_size = _conv_2d_output_size(size, kernel, pad, pad,
                                       dilation, stride, stride)
    o = _conv_2d_random_offsets(batch_size, kernel, output_size,
                                deformable_group)
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5
    if order == "NCHW":
        X, w = utils.NHWC2NCHW(X), utils.NHWC2NCHW(w)

    inputs = [X, o, w]
    if use_bias:
        inputs.append(b)

    # Error handling path: all three rejected configurations are handled the
    # same way, so they are folded into a single guard.
    expect_failure = (
        size + pad + pad < dilated_kernel
        or input_channels % deformable_group != 0
        or output_channels % deformable_group != 0
    )
    if expect_failure:
        with self.assertRaises(RuntimeError):
            self.assertDeviceChecks(dc, op, inputs, [0])
        return

    self.assertDeviceChecks(dc, op, inputs, [0])
    for input_index in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, input_index, [0])
def test_convolution_transpose_separate_stride_pad_adj_layout(
        self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel,
        adj_h, adj_w, size, input_channels, output_channels, batch_size,
        engine, use_bias, gc, dc):
    """Compare NCHW and NHWC ConvTranspose with separate stride/pad/adj.

    Both layouts are device-checked, then run in the workspace. The NCHW
    output shape is checked against the expected output height/width, and
    the two layouts must agree within 1e-4.
    """
    assume(adj_h < stride_h)
    assume(adj_w < stride_w)

    x_nhwc = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w_nhwc = np.random.rand(
        input_channels, kernel, kernel, output_channels
    ).astype(np.float32) - 0.5
    bias = np.random.rand(output_channels).astype(np.float32) - 0.5

    results = {}
    for order in ["NCHW", "NHWC"]:
        op = core.CreateOperator(
            "ConvTranspose",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            kernel=kernel,
            pad_t=pad_t,
            pad_l=pad_l,
            pad_b=pad_b,
            pad_r=pad_r,
            adj_h=adj_h,
            adj_w=adj_w,
            order=order,
            engine=engine,
            device_option=gc,
        )
        if order == "NCHW":
            x_fed, w_fed = utils.NHWC2NCHW(x_nhwc), utils.NHWC2NCHW(w_nhwc)
        else:
            x_fed, w_fed = x_nhwc, w_nhwc

        check_inputs = [x_fed, w_fed, bias] if use_bias else [x_fed, w_fed]
        self.assertDeviceChecks(dc, op, check_inputs, [0])

        for blob_name, value in (("X", x_fed), ("w", w_fed), ("b", bias)):
            self.ws.create_blob(blob_name).feed(value, device_option=gc)
        self.ws.run(op)
        results[order] = self.ws.blobs["Y"].fetch()

    expected_h = (size - 1) * stride_h + kernel + adj_h - pad_t - pad_b
    expected_w = (size - 1) * stride_w + kernel + adj_w - pad_l - pad_r
    self.assertEqual(
        results["NCHW"].shape,
        (batch_size, output_channels, expected_h, expected_w))
    np.testing.assert_allclose(
        results["NCHW"],
        utils.NHWC2NCHW(results["NHWC"]),
        atol=1e-4,
        rtol=1e-4,
    )
def test_convolution_transpose_layout(
        self, stride, pad, kernel, adj, size, input_channels,
        output_channels, batch_size, engine, shared_buffer, use_bias,
        gc, dc):
    """Compare NCHW and NHWC ConvTranspose with uniform stride/pad/adj.

    Both layouts are device-checked and run in the workspace. The NCHW
    output shape is checked against the expected square output size, and
    the two layouts must agree within 1e-4.
    """
    assume(adj < stride)

    x_nhwc = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w_nhwc = np.random.rand(
        input_channels, kernel, kernel, output_channels
    ).astype(np.float32) - 0.5
    bias = np.random.rand(output_channels).astype(np.float32) - 0.5

    results = {}
    for order in ["NCHW", "NHWC"]:
        # MIOPEN doesn't work with NHWC, fallback to use normal hip
        use_default_engine = hiputl.run_in_hip(gc, dc) and order == "NHWC"
        tmp_engine = "" if use_default_engine else engine

        op = core.CreateOperator(
            "ConvTranspose",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * 2,
            kernels=[kernel] * 2,
            pads=[pad] * 4,
            adjs=[adj] * 2,
            order=order,
            engine=tmp_engine,
            shared_buffer=int(shared_buffer),
            device_option=gc,
        )
        if order == "NCHW":
            x_fed, w_fed = utils.NHWC2NCHW(x_nhwc), utils.NHWC2NCHW(w_nhwc)
        else:
            x_fed, w_fed = x_nhwc, w_nhwc

        check_inputs = [x_fed, w_fed, bias] if use_bias else [x_fed, w_fed]
        self.assertDeviceChecks(dc, op, check_inputs, [0])

        for blob_name, value in (("X", x_fed), ("w", w_fed), ("b", bias)):
            self.ws.create_blob(blob_name).feed(value, device_option=gc)
        self.ws.run(op)
        results[order] = self.ws.blobs["Y"].fetch()

    expected_size = (size - 1) * stride + kernel + adj - 2 * pad
    self.assertEqual(
        results["NCHW"].shape,
        (batch_size, output_channels, expected_size, expected_size))
    np.testing.assert_allclose(
        results["NCHW"],
        utils.NHWC2NCHW(results["NHWC"]),
        atol=1e-4,
        rtol=1e-4,
    )
def test_group_convolution(
        self, stride, pad, kernel, size, group, input_channels_per_group,
        output_channels_per_group, batch_size, order, engine, use_bias,
        gc, dc):
    """Device and gradient checks for grouped ``Conv``.

    Unsupported combinations are filtered out with ``assume``. For grouped
    NHWC convolution outside HIP, the device check is restricted to CPU
    devices.
    """
    assume(size >= kernel)

    in_hip = hiputl.run_in_hip(gc, dc)
    if in_hip and order == "NHWC":
        assume(group == 1 and engine != "CUDNN")
    elif not in_hip:
        # TODO: Group conv in NHWC not implemented for GPU yet.
        assume(group == 1 or order == "NCHW"
               or gc.device_type == caffe2_pb2.CPU)
        if group != 1 and order == "NHWC":
            dc = [dev for dev in dc if dev.device_type == caffe2_pb2.CPU]

    # Group conv not implemented with EIGEN engine.
    assume(group == 1 or engine != "EIGEN")

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    op = core.CreateOperator(
        "Conv",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        pad=pad,
        order=order,
        engine=engine,
        group=group,
    )

    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels_per_group
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5
    if order == "NCHW":
        X, w = utils.NHWC2NCHW(X), utils.NHWC2NCHW(w)

    inputs = [X, w]
    if use_bias:
        inputs.append(b)

    self.assertDeviceChecks(dc, op, inputs, [0])
    for input_index in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, input_index, [0])
def test_convolution_transpose_separate_stride_pad_adj_gradient(
        self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel,
        adj_h, adj_w, size, input_channels, output_channels, batch_size,
        order, engine, use_bias, compute_dX, gc, dc):
    """Device and gradient checks for ConvTranspose (separate stride/pad/adj).

    The gradient with respect to the filter is always checked. The bias
    gradient is checked when ``use_bias`` is set, and the input gradient when
    ``compute_dX`` is set (otherwise the op is created with
    ``no_gradient_to_input``).
    """
    assume(adj_h < stride_h)
    assume(adj_w < stride_w)

    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        input_channels, kernel, kernel, output_channels
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5

    op = core.CreateOperator(
        "ConvTranspose",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride_h=stride_h,
        stride_w=stride_w,
        kernel=kernel,
        pad_t=pad_t,
        pad_l=pad_l,
        pad_b=pad_b,
        pad_r=pad_r,
        adj_h=adj_h,
        adj_w=adj_w,
        order=order,
        engine=engine,
        no_gradient_to_input=not compute_dX,
    )
    if order == "NCHW":
        X, w = utils.NHWC2NCHW(X), utils.NHWC2NCHW(w)

    inputs = [X, w, b] if use_bias else [X, w]
    self.assertDeviceChecks(dc, op, inputs, [0])

    # Input indices to gradient-check, in order: w (1), then b (2) if
    # present, then X (0) if its gradient is computed.
    outputs_to_check = [1]
    if use_bias:
        outputs_to_check.append(2)
    if compute_dX:
        outputs_to_check.append(0)

    for input_index in outputs_to_check:
        self.assertGradientChecks(gc, op, inputs, input_index, [0])
def test_instance_norm_model_helper(
        self, N, C, H, W, order, epsilon, seed, is_test):
    """Build instance norm through ``brew`` and check blob shapes.

    After running the init net and the main net, the output (converted back
    to NCHW when needed) must have shape ``(N, C, H, W)``. When ``is_test``
    is set, the ``output_s`` and ``output_b`` blobs must exist with shape
    ``(C,)``.
    """
    np.random.seed(seed)
    model = model_helper.ModelHelper(name="test_model")
    brew.instance_norm(
        model,
        'input',
        'output',
        C,
        epsilon=epsilon,
        order=order,
        is_test=is_test,
    )

    nhwc = order == 'NHWC'
    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if nhwc:
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)
    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    if is_test:
        scale = self.ws.blobs['output_s'].fetch()
        assert scale is not None
        assert scale.shape == (C, )
        bias = self.ws.blobs['output_b'].fetch()
        assert bias is not None
        assert bias.shape == (C, )

    output_blob = self.ws.blobs['output'].fetch()
    if nhwc:
        output_blob = utils.NHWC2NCHW(output_blob)
    assert output_blob.shape == (N, C, H, W)
def test_pooling(
        self, stride, pad, kernel, size, input_channels, batch_size,
        order, op_type, engine, gc, dc):
    """Device check (and gradient check, except MaxPool) for 2D pooling."""
    assume(pad < kernel)
    if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
        assume(order == "NCHW" and op_type != "LpPool")

    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        pad=pad,
        order=order,
        engine=engine,
    )

    data = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0])
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0])
def test_pooling_separate_stride_pad(
        self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel,
        size, input_channels, batch_size, order, op_type, gc, dc):
    """Device/gradient checks for pooling with per-side pads and strides."""
    assume(np.max([pad_t, pad_l, pad_b, pad_r]) < kernel)

    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        stride_h=stride_h,
        stride_w=stride_w,
        pad_t=pad_t,
        pad_l=pad_l,
        pad_b=pad_b,
        pad_r=pad_r,
        kernel=kernel,
        order=order,
    )

    data = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0])
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0])
def test_pooling_3d(
        self, stride, pad, kernel, size, input_channels, batch_size,
        order, op_type, engine, gc, dc):
    """Device/gradient checks for 3D pooling with a 0.001 threshold."""
    assume(pad < kernel)
    assume(size + pad + pad >= kernel)

    # Currently MIOpen Pooling only supports pooling with NCHW order.
    if hiputl.run_in_hip(gc, dc) and (
            workspace.GetHIPVersion() < 303 or order == "NHWC"):
        assume(engine != "CUDNN")

    # some case here could be calculated with global pooling, but instead
    # calculated with general implementation, slower but should still
    # be correct.
    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        strides=[stride] * 3,
        kernels=[kernel] * 3,
        pads=[pad] * 6,
        order=order,
        engine=engine,
    )

    data = np.random.rand(
        batch_size, size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0], threshold=0.001)
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0], threshold=0.001)
def test_convolution_transpose_gradients(
        self, stride, pad, kernel, adj, size, input_channels,
        output_channels, batch_size, order, engine, use_bias, compute_dX,
        gc, dc):
    """Device and gradient checks for ConvTranspose (uniform stride/pad/adj).

    The filter gradient is always checked. The bias gradient is checked when
    ``use_bias`` is set, and the input gradient when ``compute_dX`` is set.
    """
    assume(adj < stride)
    if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
        assume(order == "NCHW")

    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        input_channels, kernel, kernel, output_channels
    ).astype(np.float32) - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5

    op = core.CreateOperator(
        "ConvTranspose",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        pad=pad,
        adj=adj,
        order=order,
        engine=engine,
        no_gradient_to_input=not compute_dX,
    )
    if order == "NCHW":
        X, w = utils.NHWC2NCHW(X), utils.NHWC2NCHW(w)

    inputs = [X, w, b] if use_bias else [X, w]
    self.assertDeviceChecks(dc, op, inputs, [0])

    # Input indices to gradient-check, in order: w (1), then b (2) if
    # present, then X (0) if its gradient is computed.
    outputs_to_check = [1]
    if use_bias:
        outputs_to_check.append(2)
    if compute_dX:
        outputs_to_check.append(0)

    for input_index in outputs_to_check:
        self.assertGradientChecks(gc, op, inputs, input_index, [0])
def test_convolution_transpose_with_group(
        self, stride, pad, kernel, adj, size, input_channels,
        output_channels, batch_size, group, order, engine, shared_buffer,
        use_bias, gc, dc):
    """Device and gradient checks for grouped ConvTranspose.

    ``input_channels`` and ``output_channels`` are scaled by ``group`` to get
    the actual channel counts. For grouped NHWC the device check is
    restricted to CPU devices.
    """
    assume(adj < stride)
    # TODO: Group conv_transpose in NHWC not implemented for GPU yet.
    assume(group == 1 or order == "NCHW"
           or gc.device_type == caffe2_pb2.CPU)
    if group != 1 and order == "NHWC":
        dc = [dev for dev in dc if dev.device_type == caffe2_pb2.CPU]

    if hiputl.run_in_hip(gc, dc) and order == "NHWC":
        engine = ""

    op = core.CreateOperator(
        "ConvTranspose",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        pad=pad,
        adj=adj,
        group=group,
        order=order,
        engine=engine,
        shared_buffer=int(shared_buffer),
        device_option=gc,
    )

    total_in = input_channels * group
    total_out = output_channels * group

    X = np.random.rand(
        batch_size, size, size, total_in).astype(np.float32) - 0.5
    w = np.random.rand(
        total_in, kernel, kernel, int(total_out / group)
    ).astype(np.float32) - 0.5
    b = np.random.rand(total_out).astype(np.float32) - 0.5
    if order == "NCHW":
        X, w = utils.NHWC2NCHW(X), utils.NHWC2NCHW(w)

    inputs = [X, w]
    if use_bias:
        inputs.append(b)

    self.assertDeviceChecks(dc, op, inputs, [0])
    for input_index in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, input_index, [0])
def channel_shuffle_ref(X):
    """Reference channel shuffle; reads ``order`` and ``groups`` from the
    enclosing scope.

    The channel axis (axis 1 in NCHW) is split into
    ``(groups, channels // groups)``, the two sub-axes are swapped, and the
    result is flattened back to the original shape. NHWC input is converted
    to NCHW first and the result is converted back.
    """
    is_nhwc = order == "NHWC"
    if is_nhwc:
        X = utils.NHWC2NCHW(X)

    shape = X.shape
    grouped = X.reshape(
        shape[0], groups, shape[1] // groups, shape[2], shape[3])
    shuffled = grouped.transpose((0, 2, 1, 3, 4)).reshape(shape)

    return utils.NCHW2NHWC(shuffled) if is_nhwc else shuffled
def test_leaky_relu_layout(self, gc, dc, N, C, H, W, alpha, seed):
    """Run the op in NCHW and NHWC with the same seed and compare outputs."""
    results = {}
    for order in ('NCHW', 'NHWC'):
        # Re-seed so both layouts are built from the same random stream.
        np.random.seed(seed)
        self._feed_inputs(
            self._get_inputs(N, C, H, W, order), device_option=gc)
        op = self._get_op(device_option=gc, alpha=alpha, order=order)
        self.ws.run(op)
        results[order] = self.ws.blobs['output'].fetch()

    np.testing.assert_allclose(
        results['NCHW'],
        utils.NHWC2NCHW(results["NHWC"]),
        atol=1e-4,
        rtol=1e-4,
    )
def test_pooling_with_index(
        self, stride, pad, kernel, size, input_channels, batch_size,
        gc, dc):
    """Device check of the first output of ``MaxPoolWithIndex`` in NCHW."""
    assume(pad < kernel)

    op = core.CreateOperator(
        "MaxPoolWithIndex",
        ["X"],
        ["Y", "Y_index"],
        stride=stride,
        kernel=kernel,
        pad=pad,
        order="NCHW",
        deterministic=1,
    )

    data_nhwc = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32)
    # transpose due to order = NCHW
    data = utils.NHWC2NCHW(data_nhwc)

    self.assertDeviceChecks(dc, op, [data], [0])
def test_pooling_1d(
        self, stride, pad, kernel, size, input_channels, batch_size,
        order, op_type, gc, dc):
    """Device check (and gradient check, except MaxPool) for 1D pooling."""
    assume(pad < kernel)

    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        strides=[stride],
        kernels=[kernel],
        pads=[pad, pad],
        order=order,
        engine="",
    )

    data = np.random.rand(
        batch_size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0])
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0])
def test_instance_norm_layout(
        self, gc, dc, N, C, H, W, store_mean, store_inv_stdev, epsilon,
        seed):
    """Run the op in NCHW and NHWC with the same seed and compare outputs."""
    # force store_inv_stdev if store_mean to match existing forward pass
    # implementation
    store_inv_stdev |= store_mean

    results = {}
    for order in ('NCHW', 'NHWC'):
        # Re-seed so both layouts are built from the same random stream.
        np.random.seed(seed)
        self._feed_inputs(
            self._get_inputs(N, C, H, W, order), device_option=gc)
        op = self._get_op(
            device_option=gc,
            store_mean=store_mean,
            store_inv_stdev=store_inv_stdev,
            epsilon=epsilon,
            order=order,
        )
        self.ws.run(op)
        results[order] = self.ws.blobs['output'].fetch()

    np.testing.assert_allclose(
        results['NCHW'],
        utils.NHWC2NCHW(results["NHWC"]),
        atol=1e-4,
        rtol=1e-4,
    )
def test_leaky_relu_model_helper_helper(
        self, N, C, H, W, order, alpha, seed):
    """Build LeakyRelu through ``ModelHelper`` and check the output shape.

    The output (converted back to NCHW when ``order`` is NHWC) must have
    shape ``(N, C, H, W)``.
    """
    np.random.seed(seed)
    model = model_helper.ModelHelper(
        name="test_model", arg_scope={'order': order})
    model.LeakyRelu('input', 'output', alpha=alpha)

    nhwc = order == 'NHWC'
    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if nhwc:
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)
    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    output_blob = self.ws.blobs['output'].fetch()
    if nhwc:
        output_blob = utils.NHWC2NCHW(output_blob)
    assert output_blob.shape == (N, C, H, W)
def test_global_pooling(
        self, size, input_channels, batch_size, order, op_type, engine,
        gc, dc):
    """Device/gradient checks for 2D pooling with ``global_pooling=True``."""
    # CuDNN 5 does not support deterministic max pooling.
    assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool")
    if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
        assume(order == "NCHW" and op_type != "LpPool")

    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        order=order,
        engine=engine,
        global_pooling=True,
    )

    data = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0])
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0])
def test_global_pooling_3d(
        self, kernel, size, input_channels, batch_size, order, op_type,
        engine, gc, dc):
    """Device/gradient checks for 3D pooling with ``global_pooling=True``."""
    # Currently MIOpen Pooling only supports 2d pooling
    if hiputl.run_in_hip(gc, dc):
        assume(engine != "CUDNN")

    # pad and stride ignored because they will be inferred in global_pooling
    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        kernels=[kernel] * 3,
        order=order,
        global_pooling=True,
        engine=engine,
    )

    data = np.random.rand(
        batch_size, size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        data = utils.NHWC2NCHW(data)

    self.assertDeviceChecks(dc, op, [data], [0], threshold=0.001)
    if 'MaxPool' in op_type:
        # No gradient check for max-pooling variants.
        return
    self.assertGradientChecks(gc, op, [data], 0, [0], threshold=0.001)
def ref(input_blob, scale_blob, bias_blob):
    """NumPy reference for instance normalization.

    Reads ``order``, ``N``, ``C``, ``epsilon``, ``store_mean`` and
    ``store_inv_stdev`` from the enclosing scope. Always returns a tuple:
    the normalized blob, followed by the mean when either store flag is set,
    followed by the inverse standard deviation when ``store_inv_stdev`` is
    set.
    """
    is_nhwc = order == 'NHWC'
    if is_nhwc:
        input_blob = utils.NHWC2NCHW(input_blob)

    # Statistics are taken per (sample, channel) over all remaining elements.
    per_instance = input_blob.reshape((N, C, -1))
    mean_blob = per_instance.mean(axis=2)
    inv_stdev_blob = 1.0 / np.sqrt(per_instance.var(axis=2) + epsilon)

    # _bc indicates blobs that are reshaped for broadcast
    scale_bc = scale_blob[np.newaxis, :, np.newaxis, np.newaxis]
    mean_bc = mean_blob[:, :, np.newaxis, np.newaxis]
    inv_stdev_bc = inv_stdev_blob[:, :, np.newaxis, np.newaxis]
    bias_bc = bias_blob[np.newaxis, :, np.newaxis, np.newaxis]

    normalized_blob = (
        scale_bc * (input_blob - mean_bc) * inv_stdev_bc + bias_bc
    )
    if is_nhwc:
        normalized_blob = utils.NCHW2NHWC(normalized_blob)

    if store_inv_stdev:
        return normalized_blob, mean_blob, inv_stdev_blob
    if store_mean:
        return normalized_blob, mean_blob
    return normalized_blob,
def test_groupwise_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Compare fp32 Conv against groupwise-quantized DNNLOWP_ACC16 convs.

    Three (op_type, engine) combinations are run on the same integer-valued
    inputs; the collected outputs are compared with
    ``check_quantized_results_close``.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so exactly represented after quantization
    # This was made sure by having at least one 0 and one 255 for unsigned
    # 8-bit tensors, and at least one -128 and one 127 for signed 8-bit
    # tensors.
    # Since fbgemm_acc16 accumulates to 16-bit, To avoid overflow, we use
    # small numbers except for those 0, 255, -128, and 127, for this test
    # We also make sure 255, -128, or 127 are not multiplied together by
    # putting them in different input channels and the corresponding input
    # channel in other matrix is 0.
    # For example, we put 255 in input channel 1 in X, so we make the
    # corresponding input channel in W all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    W_min, W_max = (-128, 100) if preserve_weight_sparsity else (-100, 155)
    W = (
        np.random.rand(
            output_channels, kernel, kernel, input_channels_per_group)
        * 4 - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        first = g * output_channels_per_group
        last = first + output_channels_per_group
        W[first, 0, 0, 0] = W_min
        W[first + 1, 0, 0, 0] = W_max
        if not preserve_weight_sparsity:
            W[first:last, ] += g

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        uses_dnnlowp = "DNNLOWP" in engine
        do_quantize = uses_dnnlowp
        do_dequantize = uses_dnnlowp

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv_input = "X_q" if do_quantize else "X"
        conv_output = "Y_q" if do_dequantize else "Y"
        conv = core.CreateOperator(
            op_type,
            [conv_input, "W", "b"],
            [conv_output],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity)
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, None, net, X, W, b, op_type, engine, order, gc, outputs)

    check_quantized_results_close(
        outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    prepack_weight,
    nbits_in_non_outlier,
    share_col_buffer,
    gc,
    dc,
):
    """Groupwise DNNLOWP_ACC16 conv with ``nbits_in_non_outlier`` set.

    Runs fp32 Conv and two DNNLOWP_ACC16 variants (optionally with weights
    prepacked by ``Int8ConvPackWeight`` in an init net) and compares the
    collected outputs with ``check_quantized_results_close``.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    X_min = -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = (
        np.random.rand(
            output_channels, kernel, kernel, input_channels_per_group)
        * 4 - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        first = g * output_channels_per_group
        last = first + output_channels_per_group
        W[first, 0, 0, 0] = W_min
        W[first + 1, 0, 0, 0] = W_max
        W[first:last, ] += g

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        uses_dnnlowp = "DNNLOWP" in engine
        do_quantize = uses_dnnlowp
        do_dequantize = uses_dnnlowp
        do_prepack_weight = uses_dnnlowp and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            # Fall back to a zero range when X has no elements.
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max)
            pack_inputs = ["W", "b"] if do_dequantize else ["W"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                pack_inputs,
                ["W_packed"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                nbits_in_non_outlier=nbits_in_non_outlier,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                in_scale=x_q_param.scale,
            )
            init_net.Proto().op.extend([pack])

        conv_input = "X_q" if do_quantize else "X"
        conv_weight = "W_packed" if do_prepack_weight else "W"
        conv_output = "Y_q" if do_dequantize else "Y"
        conv = core.CreateOperator(
            op_type,
            [conv_input, conv_weight, "b"],
            [conv_output],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc,
            outputs)

    check_quantized_results_close(outputs)
def lc_2d_nhwc(X, W, b=None):
    """NHWC wrapper around ``lc_2d_nchw``.

    Converts ``X`` with ``utils.NHWC2NCHW``, permutes the axes of ``W`` with
    ``[0, 1, 2, 5, 3, 4]``, delegates to ``lc_2d_nchw`` and returns its first
    output converted back to NHWC, wrapped in a one-element list.
    """
    x_nchw = utils.NHWC2NCHW(X)
    w_nchw = np.transpose(W, [0, 1, 2, 5, 3, 4])
    nchw_result = lc_2d_nchw(x_nchw, w_nchw, b)[0]
    return [utils.NCHW2NHWC(nchw_result)]
def _test_dnnlowp_nd_int(
    self,
    stride,
    pad,
    kernels,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    prepack_weight,
    gc,
    dc,
):
    """Compare fp32 N-d Conv against DNNLOWP variants.

    The number of spatial dimensions is ``len(kernels)``. For engines that
    are not DNNLOWP, NHWC inputs are converted to NCHW before the run and
    the output is converted back. All outputs are collected and compared
    with ``check_quantized_results_close``.
    """
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")
    ndim = len(kernels)

    X, W, b = generate_convnd_inputs(
        (stride, ) * ndim,
        (pad, ) * ndim,
        kernels,
        (dilation, ) * ndim,
        (size, ) * ndim,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        uses_dnnlowp = "DNNLOWP" in engine
        fall_back_to_NCHW = not uses_dnnlowp and order == "NHWC"
        if fall_back_to_NCHW:
            X_fed, W_fed = utils.NHWC2NCHW(X), utils.NHWC2NCHW(W)
        else:
            X_fed, W_fed = X, W

        do_quantize = uses_dnnlowp
        do_dequantize = uses_dnnlowp
        # If output scale/zp aren't set, it gets computed from ref fp32 op
        # in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure atleast one output is collected to compute output
        # scale/zp.
        do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        x_q_param = dnnlowp_utils.choose_quantization_params(
            X.min(), X.max())

        weight_blob = "W_q" if do_quantize_weight else "W"
        bias_blob = "b_q" if do_quantize_weight else "b"

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = (
                dnnlowp_utils.create_int8_given_tensor_fill(W, "W_q"))
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = (
                dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param))
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            pack_inputs = [weight_blob]
            if do_dequantize:
                pack_inputs.append(bias_blob)
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                pack_inputs,
                ["W_packed"],
                group=group,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else weight_blob,
                bias_blob,
            ],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            dequantize_output=not do_dequantize,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        for blob_name, value in (("X", X_fed), ("W", W_fed), ("b", b)):
            self.ws.create_blob(blob_name).feed(value, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)

        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = utils.NCHW2NHWC(Y)
        outputs.append(
            Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Run a 2-D convolution with integer-valued inputs on several engines.

    The same X, W and b are fed to ``Conv`` with the default engine (used as
    the reference), ``Conv`` with ``DNNLOWP_ACC16`` and ``Int8Conv`` with
    ``DNNLOWP_ACC16``.  All collected outputs are handed to
    ``check_quantized_results_close``.

    ``in_quantized``, ``out_quantized`` and ``weight_quantized`` select
    whether explicit Quantize / Dequantize operators and pre-quantized
    weight/bias fills are added around the DNNLOWP convolutions.  ``gc`` is
    used as the device option of every operator and blob; ``dc`` is unused
    here.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly represented after
    # quantization.  This is ensured by having at least one 0 and one 255 for
    # unsigned 8-bit tensors, and at least one -128 and one 127 for signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates to 16 bits, to avoid overflow we use
    # small numbers except for those 0, 255, -128 and 127 in this test.
    # We also make sure 255, -128 or 127 are not multiplied together by
    # putting them in different input channels, with the corresponding input
    # channel in the other matrix set to 0.
    # For example, we put 255 in input channel 1 of X, so we make the
    # corresponding input channel of W all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    # An empty batch has no element to pin to X_max; indexing X[0, ...]
    # would raise IndexError.
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(
            output_channels, kernel, kernel, input_channels_per_group
        )
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"]
    )
    outputs = []

    # The first entry (default engine) is the reference; its output is read
    # back as outputs[0] by the later iterations.
    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = (
                dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            # X.min() / X.max() raise ValueError on a zero-size array, so
            # fall back to 0 for an empty batch.
            x_observed_min = 0 if X.size == 0 else X.min()
            x_observed_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                x_observed_min, x_observed_max, preserve_activation_sparsity
            )
            int8_bias_tensor_fill = (
                dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(
            Output(Y=Y, op_type=op_type, engine=engine, order=order)
        )

    check_quantized_results_close(
        outputs, symmetric=preserve_activation_sparsity
    )
def canonical(o):
    """Return ``o.Y``, passed through ``utils.NHWC2NCHW`` when ``o.order`` is "NHWC".

    For any other ``o.order`` value, ``o.Y`` is returned as-is.
    """
    if o.order != "NHWC":
        return o.Y
    return utils.NHWC2NCHW(o.Y)
def generate_convnd_inputs(
    strides,
    pads,
    kernels,
    dilations,
    sizes,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    groupwise_quantization=False,
    preserve_activation_sparsity=False,
    preserve_weight_sparsity=False,
):
    """Generate random integer-valued N-d convolution inputs (X, W, b).

    Args:
        strides, pads, kernels, dilations: per-dimension sequences; the
            example is rejected via ``assume`` unless each has
            ``len(sizes)`` entries.
        sizes: spatial extent of the input for each dimension.
        group: number of convolution groups.
        input_channels_per_group: input channels in each group.
        output_channels_per_group: output channels in each group.
        batch_size: number of samples; 0 is allowed.
        order: "NCHW" converts X and W with ``utils.NHWC2NCHW`` before
            returning; any other value leaves them channels-last.
        groupwise_quantization: place W_min / W_max in every group and, unless
            ``preserve_weight_sparsity`` is set, shift each group's weights
            by its group index.
        preserve_activation_sparsity: use [0, 255] for X instead of
            [-77, 178].
        preserve_weight_sparsity: use [-128, 100] for W instead of
            [-100, 155].

    Returns:
        Tuple ``(X, W, b)`` of float32 arrays.  X is built with shape
        ``(batch_size, *sizes, input_channels)``, W with shape
        ``(output_channels, *kernels, input_channels_per_group)`` and b with
        shape ``(output_channels,)``.

    Raises:
        AssertionError: if the total input or output channel count is not
            greater than 1, or the spatial sizes are too small for the
            depthwise group-wise pattern.
    """
    dim = len(sizes)
    assume(all(len(a) == dim for a in [strides, pads, kernels, dilations]))
    assume(
        all(
            sizes[d] >= dilations[d] * (kernels[d] - 1) + 1
            for d in range(dim)
        )
    )
    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group
    depthwise_convolution = (
        input_channels_per_group == 1 and output_channels_per_group == 1
    )

    assert input_channels > 1
    assert output_channels > 1

    # X and W have scale 1, so exactly represented after quantization
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X_range = X_max - X_min
    if depthwise_convolution and groupwise_quantization:
        # For depthwise convolution, it's not enough to set input channel 0
        # to all X_min to avoid overflow from vpmaddubsw
        X_range /= 2
    X = np.round(
        np.random.rand(*((batch_size,) + tuple(sizes) + (input_channels,)))
        * X_range
        + X_min
    )
    X = X.astype(np.float32)

    if (
        batch_size != 0
        and depthwise_convolution
        and groupwise_quantization
        and not preserve_activation_sparsity
    ):
        # Put X_max in a position not to be paired with any padded value.
        # Put X_min to all positions that can be paired with the X_max value.
        #
        # This is an example of a pattern for 3x3x3
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .  min
        #
        #  .   .   .   .   .
        #  .   .   .   .  min
        #  .  min max min  .
        # min  .   .   .   .
        #  .   .   .   .   .
        #
        # min  .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .

        # Make sure we have enough dimension
        assert X.shape[1] >= 3
        assert all(X.shape[d + 1] >= kernels[d] + 2 for d in range(1, dim))

        # Take subtensor we want to manipulate (first sample, channel 0);
        # basic indexing yields a view, so writes below land in X.
        X_sub = X[(0,) * (X.ndim - dim - 1) + (slice(None),) * dim + (0,)]

        # Put X_max in the middle of the subtensor
        X_sub[
            (1,) + tuple(kernels[d] // 2 + 1 for d in range(1, dim))
        ] = X_max

        # Put X_min to the positions that can be paired with X_max across
        # the slowest moving dimension.
        # The per-axis index lists must be wrapped in a tuple: since NumPy
        # 1.23 a bare list of lists is treated as ONE array index over the
        # first axis instead of one index per axis.
        X_sub[
            tuple([[0, 2]] + [[kernels[d] + 1, 0] for d in range(1, dim)])
        ] = X_min

        # Put X_min to other positions that can be paired with X_max
        for d1 in range(1, dim):
            # Length-1 and length-2 index lists broadcast against each other
            # once passed as a tuple.
            X_sub[
                tuple(
                    [[1]]
                    + [[kernels[d2] // 2 + 1] for d2 in range(1, d1)]
                    + [[kernels[d1] // 2, kernels[d1] // 2 + 2]]
                    + [[kernels[d2] + 1, 0] for d2 in range(d1 + 1, dim)]
                )
            ] = X_min
    else:
        # input channel 0 is all X_min to avoid overflow from vpmaddubsw when
        # multiplied with W_min and W_max
        X[..., 0] = X_min
        if batch_size != 0:
            X[(0,) * (X.ndim - 1) + (1,)] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = np.round(
        np.random.rand(
            *((output_channels,) + tuple(kernels) + (input_channels_per_group,))
        )
        * (W_max - W_min)
        + W_min
    )
    W = W.astype(np.float32)
    if groupwise_quantization:
        for g in range(group):
            W[(g * output_channels_per_group,) + (0,) * (W.ndim - 1)] = W_min
            if depthwise_convolution:
                W[
                    (g * output_channels_per_group, 1) + (0,) * (W.ndim - 2)
                ] = W_max
            else:
                assert output_channels_per_group > 1
                W[
                    (g * output_channels_per_group + 1,)
                    + (0,) * (W.ndim - 1)
                ] = W_max

            # Make sure each group has different ranges to really see the
            # effect of group-wise quantization.
            if not preserve_weight_sparsity:
                W[
                    g * output_channels_per_group:
                    (g + 1) * output_channels_per_group,
                ] += g
    else:
        W[(0,) + (0,) * (W.ndim - 1)] = W_min
        W[(1,) + (0,) * (W.ndim - 1)] = W_max

    different_range_per_group = (
        groupwise_quantization and not preserve_weight_sparsity
    )
    for g in range(group):
        # The helper is defined elsewhere in the file; it receives per-group
        # slices (views) of X and W — presumably it adjusts them in place,
        # TODO confirm against its definition.
        avoid_vpmaddubsw_overflow(
            strides,
            pads,
            kernels,
            dilations,
            sizes,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            X[
                ...,
                g * input_channels_per_group:
                (g + 1) * input_channels_per_group,
            ],
            X_min,
            X_max,
            W[
                g * output_channels_per_group:
                (g + 1) * output_channels_per_group,
            ],
            W_min + (g if different_range_per_group else 0),
            W_max + (g if different_range_per_group else 0),
        )

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.random.randn(output_channels).astype(np.float32)

    return X, W, b
def test_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    weight_quantized,
    prepack_weight,
    nbits_in_non_outlier,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    """Exercise DNNLOWP_ACC16 convolution with ``nbits_in_non_outlier`` set.

    Identical X, W and b are run through ``Conv`` on the default engine,
    ``Conv`` on ``DNNLOWP_ACC16`` and ``Int8Conv`` on ``DNNLOWP_ACC16``.
    DNNLOWP runs always wrap the convolution in Quantize / Dequantize and,
    depending on ``weight_quantized`` / ``prepack_weight``, add weight/bias
    fill and ``Int8ConvPackWeight`` operators to an init net.  The collected
    outputs are passed to ``check_quantized_results_close``.  ``dc`` is not
    used here.
    """
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # Activations: small integers near X_min, channel 0 pinned to X_min and a
    # single X_max placed in channel 1 (when there is any sample at all).
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    ).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    # Weights: small integers around W_min + 128, with one W_min, one W_max
    # and input channel 1 set to the "zero" value.
    W_min, W_max = (-128, 100) if preserve_weight_sparsity else (-100, 155)
    W = np.round(
        np.random.rand(
            output_channels, kernel, kernel, input_channels_per_group
        )
        * 4
        - 2
        + W_min
        + 128
    ).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple(
        "Output", ["Y", "op_type", "engine", "order"]
    )
    outputs = []
    shared_buffer_flag = 1 if share_col_buffer else 0

    for op_type, engine in (
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ):
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        uses_dnnlowp = "DNNLOWP" in engine
        do_quantize = uses_dnnlowp
        do_dequantize = uses_dnnlowp
        do_quantize_weight = uses_dnnlowp and weight_quantized
        do_prepack_weight = uses_dnnlowp and prepack_weight

        # Blob names consumed by the pack / conv operators below.
        weight_blob = "W_q" if do_quantize_weight else "W"
        bias_blob = "b_q" if do_quantize_weight else "b"

        if do_quantize:
            quantize_op = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize_op])

            # Range of the data actually fed in; an empty batch has no
            # min/max, so 0 is used for both.
            if X.size == 0:
                observed_min = 0
                observed_max = 0
            else:
                observed_min = X.min()
                observed_max = X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                observed_min, observed_max, preserve_activation_sparsity
            )

        if do_quantize_weight:
            weight_fill_op, w_q_param = (
                dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
            )
            init_net.Proto().op.extend([weight_fill_op])

            # Bias
            bias_fill_op = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([bias_fill_op])

        if do_prepack_weight:
            pack_inputs = [weight_blob]
            if do_dequantize:
                pack_inputs.append(bias_blob)
            pack_op = core.CreateOperator(
                "Int8ConvPackWeight",
                pack_inputs,
                ["W_packed"],
                group=group,
                nbits_in_non_outlier=nbits_in_non_outlier,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack_op])

        conv_inputs = [
            "X_q" if do_quantize else "X",
            "W_packed" if do_prepack_weight else weight_blob,
            bias_blob,
        ]
        conv_op = core.CreateOperator(
            op_type,
            conv_inputs,
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=shared_buffer_flag,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(
                conv_op, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv_op])

        if do_dequantize:
            dequantize_op = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([dequantize_op])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(
        outputs, symmetric=preserve_activation_sparsity
    )