def _nd_convolution(self, n, input_channels, output_channels, batch_size,
                    stride, size, kernel, dilation, pad, order, use_bias,
                    engine, force_algo_fwd, force_algo_dgrad,
                    force_algo_wgrad, gc, dc):
    dkernel = dilation * (kernel - 1) + 1
    for op_type in ["Conv", "Conv" + str(n) + "D"]:
        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * n,
            kernels=[kernel] * n,
            dilations=[dilation] * n,
            pads=[pad] * n * 2,
            order=order,
            engine=engine,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        input_dims = [batch_size, input_channels]
        input_dims.extend([size] * n)
        filter_dims = [output_channels, input_channels]
        filter_dims.extend([kernel] * n)

        X = np.random.rand(*input_dims).astype(np.float32) - 0.5
        w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NHWC":
            X = utils.NCHW2NHWC(X)
            w = utils.NCHW2NHWC(w)

        inputs = [X, w, b] if use_bias else [X, w]

        # The op is expected to fail when the padded input is smaller than
        # the dilated kernel extent.
        if size + 2 * pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
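# A minimal sketch (not part of the test suite) of the geometry the
# RuntimeError branch above checks: a dilated kernel spans
# dilation * (kernel - 1) + 1 input positions, and the convolution only has a
# valid output when the padded extent is at least that large. The helper name
# is hypothetical.
def _conv_output_size_sketch(size, kernel, stride, pad, dilation):
    dkernel = dilation * (kernel - 1) + 1  # effective (dilated) kernel extent
    padded = size + 2 * pad
    if padded < dkernel:
        raise RuntimeError("padded input smaller than dilated kernel")
    return (padded - dkernel) // stride + 1

# e.g. size=4, kernel=3, stride=1, pad=0, dilation=2 gives dkernel=5 > 4, so
# the operator is expected to fail, matching the assertRaises path above.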
def _conv_2d_shuffle_offsets(
    batch_size, kernel, dims, num_deformable_group, input_channels, output_channels
):
    o = []
    w0 = [[0 for x in range(kernel)] for y in range(kernel)]
    for y0 in range(0, kernel):
        for x0 in range(0, kernel):
            x = np.random.randint(0, kernel)
            y = np.random.randint(0, kernel)
            o.append(y - y0)
            o.append(x - x0)
            w0[y][x] += 1
    o = o * num_deformable_group
    e = []
    for v in o:
        e.append([[v] * int(dims[1])] * int(dims[0]))
    w0 = [[w0] * input_channels] * output_channels
    return (
        np.array([e] * batch_size).astype(np.float32),
        utils.NCHW2NHWC(np.array(w0).astype(np.float32)),
    )
def test_instance_norm_model_helper(self, N, C, H, W, order, epsilon, seed, is_test):
    np.random.seed(seed)
    model = model_helper.ModelHelper(name="test_model")
    brew.instance_norm(
        model, 'input', 'output', C, epsilon=epsilon, order=order, is_test=is_test
    )

    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)

    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    if is_test:
        scale = self.ws.blobs['output_s'].fetch()
        assert scale is not None
        assert scale.shape == (C, )
        bias = self.ws.blobs['output_b'].fetch()
        assert bias is not None
        assert bias.shape == (C, )

    output_blob = self.ws.blobs['output'].fetch()
    if order == 'NHWC':
        output_blob = utils.NHWC2NCHW(output_blob)

    assert output_blob.shape == (N, C, H, W)
def channel_shuffle_ref(X):
    if order == "NHWC":
        X = utils.NHWC2NCHW(X)
    Y_r = X.reshape(
        X.shape[0], groups, X.shape[1] // groups, X.shape[2], X.shape[3]
    )
    Y_trns = Y_r.transpose((0, 2, 1, 3, 4))
    Y_reshaped = Y_trns.reshape(X.shape)
    if order == "NHWC":
        Y_reshaped = utils.NCHW2NHWC(Y_reshaped)
    return Y_reshaped
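# A small standalone check (an illustrative example, not from the test file)
# that the reshape/transpose reference above really interleaves channels
# across groups: with 2 groups and 4 channels, channel order 0,1,2,3 becomes
# 0,2,1,3.
import numpy as np

groups_demo = 2
X_demo = np.arange(4).reshape(1, 4, 1, 1).astype(np.float32)  # NCHW
Y_demo = (
    X_demo.reshape(1, groups_demo, 4 // groups_demo, 1, 1)
    .transpose((0, 2, 1, 3, 4))
    .reshape(X_demo.shape)
)
assert Y_demo[0, :, 0, 0].tolist() == [0.0, 2.0, 1.0, 3.0]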
def _get_inputs(self, N, C, H, W, order):
    input_data = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        # Allocate in the same order as NCHW and transpose to make sure
        # the inputs are identical on freshly-seeded calls.
        input_data = utils.NCHW2NHWC(input_data)
    elif order != "NCHW":
        raise Exception('unknown order type ({})'.format(order))

    scale_data = np.random.rand(C).astype(np.float32)
    bias_data = np.random.rand(C).astype(np.float32)
    return input_data, scale_data, bias_data
def _get_inputs(self, N, C, H, W, order):
    input_data = np.random.rand(N, C, H, W).astype(np.float32) - 0.5

    # Push values away from zero so the gradient checker's finite differences
    # (default step size 0.05) do not straddle the kink at 0.
    input_data[np.logical_and(
        input_data >= 0, input_data <= 0.051)] = 0.051
    input_data[np.logical_and(
        input_data <= 0, input_data >= -0.051)] = -0.051

    if order == 'NHWC':
        input_data = utils.NCHW2NHWC(input_data)

    return input_data,
def test_channel_shuffle_fast_path(self, channels_per_group, n, gc, dc):
    order = "NHWC"
    groups = 4
    X = np.round(
        np.random.rand(n, channels_per_group * groups, 5, 6) * 255
    ).astype(np.float32)
    if n != 0:
        X[0, 0, 0, 0] = 0
        X[0, 0, 0, 1] = 255
    X = utils.NCHW2NHWC(X)

    net = core.Net("test_net")

    quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP")
    channel_shuffle = core.CreateOperator(
        "ChannelShuffle",
        ["X_q"],
        ["Y_q"],
        group=groups,
        kernel=1,
        order=order,
        engine="DNNLOWP",
    )
    dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP")
    net.Proto().op.extend([quantize, channel_shuffle, dequantize])

    workspace.FeedBlob("X", X)
    workspace.RunNetOnce(net)
    Y = workspace.FetchBlob("Y")

    def channel_shuffle_ref(X):
        if order == "NHWC":
            X = utils.NHWC2NCHW(X)
        Y_r = X.reshape(
            X.shape[0], groups, X.shape[1] // groups, X.shape[2], X.shape[3]
        )
        Y_trns = Y_r.transpose((0, 2, 1, 3, 4))
        Y_reshaped = Y_trns.reshape(X.shape)
        if order == "NHWC":
            Y_reshaped = utils.NCHW2NHWC(Y_reshaped)
        return Y_reshaped

    Y_ref = channel_shuffle_ref(X)
    np.testing.assert_allclose(Y, Y_ref)
def test_spatialbn_test_mode_3d(self, size, input_channels, batch_size, seed,
                                order, epsilon, inplace, engine, gc, dc):
    # Currently MIOPEN SpatialBN only supports 2D
    if hiputl.run_in_hip(gc, dc):
        assume(engine != "CUDNN")
    op = core.CreateOperator(
        "SpatialBN",
        ["X", "scale", "bias", "mean", "var"],
        ["X" if inplace else "Y"],
        order=order,
        is_test=True,
        epsilon=epsilon,
        engine=engine,
    )

    def reference_spatialbn_test(X, scale, bias, mean, var):
        if order == "NCHW":
            scale = scale[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            bias = bias[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            mean = mean[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
            var = var[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis]
        return ((X - mean) / np.sqrt(var + epsilon) * scale + bias, )

    np.random.seed(1701)
    scale = np.random.rand(input_channels).astype(np.float32) + 0.5
    bias = np.random.rand(input_channels).astype(np.float32) - 0.5
    mean = np.random.randn(input_channels).astype(np.float32)
    var = np.random.rand(input_channels).astype(np.float32) + 0.5
    X = np.random.rand(batch_size, input_channels, size, size, size)\
        .astype(np.float32) - 0.5

    if order == "NHWC":
        X = utils.NCHW2NHWC(X)
    self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var],
                               reference_spatialbn_test)
    self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0])
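# Quick sanity sketch (an added example, not part of the test) of why the
# reference above only reshapes the per-channel statistics for NCHW: in NHWC
# the channel axis is already last, so 1-D arrays of length C broadcast
# correctly without any reshaping.
import numpy as np

C = 3
x_nhwc = np.random.rand(2, 4, 4, 4, C).astype(np.float32)
mean = np.zeros(C, dtype=np.float32)
var = np.ones(C, dtype=np.float32)
scale = np.ones(C, dtype=np.float32)
bias = np.zeros(C, dtype=np.float32)
# broadcasts over the trailing (channel) axis
y = (x_nhwc - mean) / np.sqrt(var + 1e-5) * scale + bias
assert y.shape == x_nhwc.shape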
def test_leaky_relu_model_helper_helper(self, N, C, H, W, order, alpha, seed):
    np.random.seed(seed)
    arg_scope = {'order': order}
    model = model_helper.ModelHelper(name="test_model", arg_scope=arg_scope)
    model.LeakyRelu('input', 'output', alpha=alpha)

    input_blob = np.random.rand(N, C, H, W).astype(np.float32)
    if order == 'NHWC':
        input_blob = utils.NCHW2NHWC(input_blob)

    self.ws.create_blob('input').feed(input_blob)

    self.ws.create_net(model.param_init_net).run()
    self.ws.create_net(model.net).run()

    output_blob = self.ws.blobs['output'].fetch()
    if order == 'NHWC':
        output_blob = utils.NHWC2NCHW(output_blob)

    assert output_blob.shape == (N, C, H, W)
def ref(input_blob, scale_blob, bias_blob):
    if order == 'NHWC':
        input_blob = utils.NHWC2NCHW(input_blob)

    mean_blob = input_blob.reshape((N, C, -1)).mean(axis=2)
    inv_stdev_blob = 1.0 / \
        np.sqrt(input_blob.reshape((N, C, -1)).var(axis=2) + epsilon)

    # _bc indicates blobs that are reshaped for broadcast
    scale_bc = scale_blob[np.newaxis, :, np.newaxis, np.newaxis]
    mean_bc = mean_blob[:, :, np.newaxis, np.newaxis]
    inv_stdev_bc = inv_stdev_blob[:, :, np.newaxis, np.newaxis]
    bias_bc = bias_blob[np.newaxis, :, np.newaxis, np.newaxis]
    normalized_blob = scale_bc * (input_blob - mean_bc) * inv_stdev_bc \
        + bias_bc

    if order == 'NHWC':
        normalized_blob = utils.NCHW2NHWC(normalized_blob)

    if not store_mean and not store_inv_stdev:
        return normalized_blob,
    elif not store_inv_stdev:
        return normalized_blob, mean_blob
    else:
        return normalized_blob, mean_blob, inv_stdev_blob
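# Standalone sketch (an added example, not from the test file) checking the
# instance-norm math used in ref(): with scale=1 and bias=0, every (n, c)
# slice of the output should have roughly zero mean and unit variance.
import numpy as np

N_demo, C_demo, H_demo, W_demo = 2, 3, 4, 5
eps = 1e-5
x = np.random.rand(N_demo, C_demo, H_demo, W_demo).astype(np.float32)
mu = x.reshape(N_demo, C_demo, -1).mean(axis=2)[:, :, None, None]
inv_std = 1.0 / np.sqrt(
    x.reshape(N_demo, C_demo, -1).var(axis=2) + eps)[:, :, None, None]
y = (x - mu) * inv_std
assert np.allclose(y.reshape(N_demo, C_demo, -1).mean(axis=2), 0, atol=1e-5)
assert np.allclose(y.reshape(N_demo, C_demo, -1).var(axis=2), 1, atol=1e-3)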
def lc_2d_nhwc(X, W, b=None):
    XT = utils.NHWC2NCHW(X)
    WT = np.transpose(W, [0, 1, 2, 5, 3, 4])
    output = lc_2d_nchw(XT, WT, b)
    return [utils.NCHW2NHWC(output[0])]
def _test_dnnlowp_nd_int(
    self, stride, pad, kernels, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    prepack_weight, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    ndim = len(kernels)
    X, W, b = generate_convnd_inputs(
        (stride, ) * ndim,
        (pad, ) * ndim,
        kernels,
        (dilation, ) * ndim,
        (size, ) * ndim,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"), ("Int8Conv", "DNNLOWP")]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"
        if fall_back_to_NCHW:
            X_nchw = utils.NHWC2NCHW(X)
            W_nchw = utils.NHWC2NCHW(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        # If output scale/zp aren't set, it gets computed from ref fp32 op
        # in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        x_q_param = dnnlowp_utils.choose_quantization_params(X.min(), X.max())

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = \
                dnnlowp_utils.create_int8_given_tensor_fill(W, "W_q")
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            dequantize_output=not do_dequantize,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = utils.NCHW2NHWC(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
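# Rough sketch of what a uint8 quantization-parameter choice such as
# choose_quantization_params(min, max) typically amounts to. This is an
# illustrative assumption, not the dnnlowp_utils implementation, which may
# differ in details (zero-point nudging, sparsity preservation, etc.).
import numpy as np

def choose_quantization_params_sketch(x_min, x_max, qmin=0, qmax=255):
    # Make sure zero is exactly representable so zero padding stays exact.
    x_min = min(x_min, 0.0)
    x_max = max(x_max, 0.0)
    scale = (x_max - x_min) / (qmax - qmin) or 1.0
    zero_point = int(np.clip(round(qmin - x_min / scale), qmin, qmax))
    return scale, zero_point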
def test_dnnlowp_group_norm(
    self, N, G, K, H, W, order, in_quantized, out_quantized,
    weight_quantized, gc, dc,
):
    C = G * K

    X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
    if order == "NHWC":
        X = utils.NCHW2NHWC(X)

    gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
    beta = np.random.randn(C).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("GroupNorm", ""),
        ("GroupNorm", "DNNLOWP"),
        ("Int8GroupNorm", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, gamma_q_param = \
                dnnlowp_utils.create_int8_given_tensor_fill(gamma, "gamma_q")
            net.Proto().op.extend([int8_given_tensor_fill])

            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            X_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                beta, "beta_q", X_q_param, gamma_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        group_norm = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "gamma_q" if do_quantize_weight else "gamma",
                "beta_q" if do_quantize_weight else "beta",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=0 if do_dequantize else 1,
            group=G,
            order=order,
            is_test=True,
            engine=engine,
            device_option=gc,
        )

        if do_quantize_weight:
            # When quantized weight is provided, we can't rescale the
            # output dynamically by looking at the range of output of each
            # batch, so here we provide the range of output observed from
            # fp32 reference implementation
            dnnlowp_utils.add_quantization_param_args(group_norm, outputs[0][0])

        net.Proto().op.extend([group_norm])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("gamma").feed(gamma, device_option=gc)
        self.ws.create_blob("beta").feed(beta, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs, atol_scale=2.0)
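# For reference, a NumPy sketch of the fp32 GroupNorm that the quantized ops
# above are compared against (an illustrative assumption, NCHW layout only):
import numpy as np

def group_norm_ref_sketch(X, gamma, beta, G, eps=1e-5):
    N, C, H, W = X.shape
    # Normalize each group of C // G channels over its spatial extent.
    Xg = X.reshape(N, G, C // G, H, W)
    mu = Xg.mean(axis=(2, 3, 4), keepdims=True)
    var = Xg.var(axis=(2, 3, 4), keepdims=True)
    Y = ((Xg - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    # Per-channel affine transform.
    return Y * gamma[None, :, None, None] + beta[None, :, None, None]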
def _nd_convolution(
    self, n, input_channels_per_group, output_channels_per_group, batch_size,
    stride, size, kernel, dilation, pad, group, order, use_bias, engine,
    force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc,
):
    # TODO: Group conv in NHWC not implemented for GPU yet.
    # TODO: Group 1D conv in NCHW not implemented for GPU yet.
    assume(
        group == 1
        or (n != 1 and order == "NCHW")
        or gc.device_type == caffe2_pb2.CPU
    )
    if group != 1 and (n == 1 or order == "NHWC"):
        dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

    input_channels = group * input_channels_per_group
    output_channels = group * output_channels_per_group
    dkernel = dilation * (kernel - 1) + 1
    for op_type in ["Conv", "Conv" + str(n) + "D"]:
        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * n,
            kernels=[kernel] * n,
            dilations=[dilation] * n,
            pads=[pad] * n * 2,
            group=group,
            order=order,
            engine=engine,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        input_dims = [batch_size, input_channels]
        input_dims.extend([size] * n)
        filter_dims = [output_channels, input_channels // group]
        filter_dims.extend([kernel] * n)

        X = np.random.rand(*input_dims).astype(np.float32) - 0.5
        w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NHWC":
            X = utils.NCHW2NHWC(X)
            w = utils.NCHW2NHWC(w)

        inputs = [X, w, b] if use_bias else [X, w]

        # The op is expected to fail when the padded input is smaller than
        # the dilated kernel extent.
        if size + 2 * pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
def nchw2nhwc_ref(X):
    return (utils.NCHW2NHWC(X), )
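# The reference above relies on utils.NCHW2NHWC, which moves the channel axis
# to the end; for a 4-D tensor that is equivalent to the transpose below
# (shown as a sanity sketch, assuming the caffe2.python.utils helpers).
import numpy as np

x = np.random.rand(2, 3, 4, 5).astype(np.float32)  # NCHW
x_nhwc = np.transpose(x, (0, 2, 3, 1))             # NHWC
assert x_nhwc.shape == (2, 4, 5, 3)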
def test_dnnlowp_depthwise_3x3x3_conv(
    self, stride, size, group, batch_size, prepack_weight, fuse_relu,
    share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity,
    gc, dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_convnd_inputs(
        (stride,) * 3,
        (pad,) * 3,
        (kernel,) * 3,
        (dilation,) * 3,
        (size,) * 3,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op = "ConvRelu" if fuse_relu else "Conv"
    op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        # TODO: no fall back to NCHW
        fall_back_to_NCHW = "DNNLOWP" not in engine
        if fall_back_to_NCHW:
            X_nchw = utils.NHWC2NCHW(X)
            W_nchw = utils.NHWC2NCHW(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * 3,
            kernels=[kernel] * 3,
            dilations=[dilation] * 3,
            pads=[pad] * (3 * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(init_net)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = utils.NCHW2NHWC(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)