def test_dnnlowp_group_norm(
    self, N, G, K, H, W, order, in_quantized, out_quantized, weight_quantized, gc, dc
):
    C = G * K

    X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
    if order == "NHWC":
        X = np.transpose(X, [0, 2, 3, 1])
    gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
    beta = np.random.randn(C).astype(np.float32) - 0.5

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("GroupNorm", ""),
        ("GroupNorm", "DNNLOWP"),
        ("Int8GroupNorm", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, gamma_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                gamma, "gamma_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            X_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                beta, "beta_q", X_q_param, gamma_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        group_norm = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "gamma_q" if do_quantize_weight else "gamma",
                "beta_q" if do_quantize_weight else "beta",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=0 if do_dequantize else 1,
            group=G,
            order=order,
            is_test=True,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(group_norm, outputs[0][0])
        net.Proto().op.extend([group_norm])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("gamma").feed(gamma, device_option=gc)
        self.ws.create_blob("beta").feed(beta, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs, atol_scale=2.0)
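
# The tests in this file compare each DNNLOWP engine against the fp32
# operator run in the same loop. For reference, here is a minimal NumPy
# sketch (not part of the original test suite; `_group_norm_ref` is a
# hypothetical helper) of the fp32 group-norm computation that the quantized
# GroupNorm above is checked against, assuming NCHW layout:
def _group_norm_ref(X, gamma, beta, group, epsilon=1e-5):
    N, C, H, W = X.shape
    # Normalize over each group of C // group channels and all spatial
    # positions.
    Xg = X.reshape(N, group, -1)
    mean = Xg.mean(axis=2, keepdims=True)
    var = Xg.var(axis=2, keepdims=True)
    Y = ((Xg - mean) / np.sqrt(var + epsilon)).reshape(N, C, H, W)
    # Per-channel affine transform with the learned gamma/beta.
    return Y * gamma.reshape(1, C, 1, 1) + beta.reshape(1, C, 1, 1)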
def test_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    nbits_in_non_outlier,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )
    else:
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128

        if order == "NCHW":
            X = nhwc2nchw(X)
            W = nhwc2nchw(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = "DNNLOWP" in engine and weight_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after
    # quantization. This is ensured by having at least one 0 and one 255 for
    # unsigned 8-bit tensors, and at least one -128 and one 127 for signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates in 16 bits, this test uses small numbers
    # except for those 0, 255, -128, and 127 to avoid overflow.
    # We also make sure 255, -128, and 127 are never multiplied together, by
    # putting them in different input channels and zeroing the corresponding
    # input channel in the other matrix. For example, we put 255 in input
    # channel 1 of X, so we make the corresponding input channel of W all
    # zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = nhwc2nchw(X)
        W = nhwc2nchw(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
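
# Illustration (not from the original test suite) of why the inputs above are
# kept within a few units of the zero point: fbgemm's ACC16 kernels accumulate
# int8 * int8 products in a signed 16-bit accumulator, so even a short run of
# worst-case products overflows. `_would_overflow_int16` is a hypothetical
# helper for this sketch.
def _would_overflow_int16(products):
    # Accumulate exactly and report whether any partial sum leaves the
    # int16 range [-32768, 32767].
    acc = 0
    for p in products:
        acc += p
        if not -32768 <= acc <= 32767:
            return True
    return False

# Two worst-case products fit (2 * 127 * 127 = 32258), but three do not
# (3 * 127 * 127 = 48387), hence the careful placement of 255/-128/127 above.
assert not _would_overflow_int16([127 * 127] * 2)
assert _would_overflow_int16([127 * 127] * 3)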
def test_groupwise_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        groupwise_quantization=True,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize",
                ["Y_q"],
                ["Y"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
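
# A minimal sketch (an assumption about the semantics, not the library's
# implementation) of the quantize_groupwise=1 path exercised above: each
# group's slice of output channels gets its own weight scale instead of one
# scale for the whole tensor, so groups with very different ranges lose less
# precision. `_groupwise_weight_scales` is a hypothetical helper.
def _groupwise_weight_scales(W, group):
    # W: (output_channels, kh, kw, input_channels_per_group) in NHWC layout.
    out_per_group = W.shape[0] // group
    scales = []
    for g in range(group):
        Wg = W[g * out_per_group : (g + 1) * out_per_group]
        # One int8 scale per group, derived from that group's range.
        scales.append((Wg.max() - Wg.min()) / 255.0)
    return scales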
def test_dnnlowp_fully_connected_int(
    self,
    input_channels,
    output_channels,
    batch_size,
    in_quantized,
    out_quantized,
    weight_quantized,
    gc,
    dc,
):
    # X and W have scale 1, so they are exactly representable after
    # quantization.
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # Input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max.
    X[:, 0] = X_min
    X[0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from the vpmaddubsw instruction used
    # in fbgemm.
    avoid_vpmaddubsw_overflow_fc(
        batch_size,
        input_channels,
        output_channels,
        X,
        X_min,
        X_max,
        W,
        W_min,
        W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP"),
        ("FC", "DNNLOWP_16"),
        ("Int8FC", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        outputs.append(
            Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
        )

    check_quantized_results_close(outputs)
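
# A rough sketch (an assumption, not the actual implementation) of what a
# helper like hardcode_scale_zp.choose_quantization_params computes for
# asymmetric uint8 quantization of a float range [fmin, fmax]:
def _choose_scale_zero_point(fmin, fmax, qmin=0, qmax=255):
    # The representable range must include 0 so that zero stays exact.
    fmin, fmax = min(fmin, 0.0), max(fmax, 0.0)
    scale = (fmax - fmin) / (qmax - qmin)
    zero_point = int(round(qmin - fmin / scale)) if scale > 0 else 0
    return scale, zero_point

# E.g. the FC test's X range [-77, 178] spans exactly 255, giving scale 1 and
# zero_point 77, which is why X is exactly representable after quantization.
assert _choose_scale_zero_point(-77.0, 178.0) == (1.0, 77)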
def test_dnnlowp_conv3d_int(
    self,
    stride,
    pad,
    temporal_kernels,
    spatial_kernels,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    kernels = (temporal_kernels,) + (spatial_kernels,) * 2
    ndim = len(kernels)

    X, W, b = generate_convnd_inputs(
        (stride,) * ndim,
        (pad,) * ndim,
        kernels,
        (dilation,) * ndim,
        (size,) * ndim,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"
        if fall_back_to_NCHW:
            X_nchw = nhwc2nchw(X)
            W_nchw = nhwc2nchw(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q"
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * ndim,
            kernels=kernels,
            dilations=[dilation] * ndim,
            pads=[pad] * (ndim * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            dequantize_output=not do_dequantize,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = nchw2nhwc(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    weight_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        # If output scale/zp aren't set, they get computed from the ref fp32
        # op in DNNLOWP, which isn't possible when we quantize input weights.
        # Make sure at least one output is collected to compute output
        # scale/zp.
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            x_q_param = hardcode_scale_zp.choose_quantization_params(X.min(), X.max())
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32
            # reference implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_depthwise_3x3_conv(
    self,
    stride,
    size,
    group,
    batch_size,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    relu,
    gc,
    dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_conv_inputs(
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    if relu:
        op_engine_list = [
            ("Conv", ""),
            ("ConvRelu", "DNNLOWP"),
            ("Int8ConvRelu", "DNNLOWP"),
        ]
    else:
        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Int8Conv", "DNNLOWP"),
        ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        preserve_activation_sparsity_int = 1 if preserve_activation_sparsity else 0
        preserve_weight_sparsity_int = 1 if preserve_weight_sparsity else 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity_int,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity_int,
            preserve_weight_sparsity=preserve_weight_sparsity_int,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])
        elif relu:
            relu_op = core.CreateOperator(
                "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([relu_op])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_depthwise_3x3x3_conv(
    self,
    stride,
    size,
    group,
    batch_size,
    fuse_relu,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_convnd_inputs(
        (stride,) * 3,
        (pad,) * 3,
        (kernel,) * 3,
        (dilation,) * 3,
        (size,) * 3,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op = "ConvRelu" if fuse_relu else "Conv"
    op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        fall_back_to_NCHW = "DNNLOWP" not in engine
        if fall_back_to_NCHW:
            X_nchw = nhwc2nchw(X)
            W_nchw = nhwc2nchw(W)

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        preserve_activation_sparsity_int = 1 if preserve_activation_sparsity else 0
        preserve_weight_sparsity_int = 1 if preserve_weight_sparsity else 0

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity_int,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride] * 3,
            kernels=[kernel] * 3,
            dilations=[dilation] * 3,
            pads=[pad] * (3 * 2),
            order="NCHW" if fall_back_to_NCHW else order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity_int,
            preserve_weight_sparsity=preserve_weight_sparsity_int,
            engine=engine,
            group=group,
            device_option=gc,
        )
        if do_dequantize:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(
            X_nchw if fall_back_to_NCHW else X, device_option=gc
        )
        self.ws.create_blob("W").feed(
            W_nchw if fall_back_to_NCHW else W, device_option=gc
        )
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        if fall_back_to_NCHW:
            Y = nchw2nhwc(Y)
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_acc16_int(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    share_col_buffer,
    preserve_activation_sparsity,
    preserve_weight_sparsity,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after
    # quantization. This is ensured by having at least one 0 and one 255 for
    # unsigned 8-bit tensors, and at least one -128 and one 127 for signed
    # 8-bit tensors.
    # Since fbgemm_acc16 accumulates in 16 bits, this test uses small numbers
    # except for those 0, 255, -128, and 127 to avoid overflow.
    # We also make sure 255, -128, and 127 are never multiplied together, by
    # putting them in different input channels and zeroing the corresponding
    # input channel in the other matrix. For example, we put 255 in input
    # channel 1 of X, so we make the corresponding input channel of W all
    # zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
        * 4
        - 2
        + W_min
        + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        W[g * output_channels_per_group, 0, 0, 0] = W_min
        W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
        if not preserve_weight_sparsity:
            W[g * output_channels_per_group : (g + 1) * output_channels_per_group] += g

    if order == "NCHW":
        X = nhwc2nchw(X)
        W = nhwc2nchw(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize",
                ["X"],
                ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP",
                device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_acc16_outlier(
    self,
    stride,
    pad,
    kernel,
    dilation,
    size,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    in_quantized,
    out_quantized,
    nbits_in_non_outlier,
    share_col_buffer,
    gc,
    dc,
):
    if group > 1:
        dilation = 1
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    if nbits_in_non_outlier == 0:
        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            True,  # group-wise
        )
    else:
        X_min = -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            W[g * output_channels_per_group : (g + 1) * output_channels_per_group] += g

        if order == "NCHW":
            X = nhwc2nchw(X)
            W = nhwc2nchw(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            dequantize_output=not do_dequantize,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            engine=engine,
            group=group,
            quantize_groupwise=1,
            device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        self.ws.create_blob("X").feed(X, device_option=gc)
        self.ws.create_blob("W").feed(W, device_option=gc)
        self.ws.create_blob("b").feed(b, device_option=gc)
        self.ws.run(net)
        Y = self.ws.blobs["Y"].fetch()
        outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

    check_quantized_results_close(outputs)
def test_dnnlowp_batch_matmul_int_constant_B(
    self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
):
    batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
    batch_dims_A = batch_dims[-C_1:]
    batch_dims_B = batch_dims[-C_2:]
    A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
    B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

    if np.prod(batch_dims) > 0:
        for index in np.ndindex(batch_dims_A):
            # When both input and output are float, each input of the batch
            # has scale 1 but a different offset, so input-wise quantization
            # shouldn't have any input quantization error.
            # A_min = -77 if (A_quantized or out_quantized) else -77 + i
            A_min = -77
            A_max = A_min + 255
            A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
            # input channels 0 and 1 are all A_min to avoid overflow from
            # vpmaddubsw when multiplied with B_min and B_max
            A[index][:, 0] = A_min
            A[index][0, 1] = A_max

        i = 0
        for index in np.ndindex(batch_dims_B):
            # When the weight is quantized lazily, each input of the batch
            # has scale 1 but a different offset, so input-wise quantization
            # shouldn't have any input quantization error.
            B_min = -100 if B_quantized else -100 + i
            # B_min = -100
            B_max = B_min + 255
            B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
            B[index][0, 0] = B_min
            B[index][1, 0] = B_max

            if C_1 > C_2:
                # A has more dims
                for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                    avoid_vpmaddubsw_overflow_fc(
                        m,
                        k,
                        n,
                        A[outer_index] if C_2 == 0 else A[outer_index + index],
                        A_min,
                        A_max,
                        B[index],
                        B_min,
                        B_max,
                    )
            else:
                avoid_vpmaddubsw_overflow_fc(
                    m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                )

            i += 1

    for trans_a, trans_b in product([0, 1], [0, 1]):
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("BatchMatMul", ""),
            ("BatchMatMul", "DNNLOWP"),
            ("Int8BatchMatMul", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize_A = "DNNLOWP" in engine and A_quantized
            do_quantize_B = "DNNLOWP" in engine and B_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize_A:
                quantize_A = core.CreateOperator(
                    "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize_A])

            if do_quantize_B:
                int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    B if trans_b else B.swapaxes(-1, -2), "B_q"
                )
                net.Proto().op.extend([int8_given_tensor_fill])

            batch_matmul = core.CreateOperator(
                op_type,
                [
                    "A_q" if do_quantize_A else "A",
                    "B_q" if do_quantize_B else "B",
                ],
                ["Y_q" if do_dequantize else "Y"],
                trans_a=trans_a,
                trans_b=trans_b,
                broadcast=True,
                constant_B=True,
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_B:
                # When a quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of each batch's
                # output, so here we provide the output range observed from
                # the fp32 reference implementation.
                dnnlowp_utils.add_quantization_param_args(
                    batch_matmul, outputs[0][0]
                )
            net.Proto().op.extend([batch_matmul])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("A").feed(
                A.swapaxes(-1, -2) if trans_a else A, device_option=gc
            )
            self.ws.create_blob("B").feed(
                B if trans_b else B.swapaxes(-1, -2), device_option=gc
            )
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
            )

        if np.prod(batch_dims) > 0:
            check_quantized_results_close(outputs)
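
# Illustration (hypothetical helper, not from the original test suite) of the
# round-trip error these comparisons tolerate: quantizing to uint8 and back
# perturbs each value by at most half a quantization step (scale / 2).
def _quantize_dequantize(X, scale, zero_point):
    Xq = np.clip(np.round(X / scale) + zero_point, 0, 255)
    return (Xq - zero_point) * scale

# With scale 1 and the integer-valued inputs constructed above, the round
# trip is exact, so only the matmul's accumulated quantization error remains
# for check_quantized_results_close to bound.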