Ejemplo n.º 1
0
    def _test_dnnlowp_nd_int(
        self,
        stride,
        pad,
        kernels,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        ndim = len(kernels)

        X, W, b = generate_convnd_inputs(
            (stride, ) * ndim,
            (pad, ) * ndim,
            kernels,
            (dilation, ) * ndim,
            (size, ) * ndim,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"),
                          ("Int8Conv", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"

            if fall_back_to_NCHW:
                X_nchw = nhwc2nchw(X)
                W_nchw = nhwc2nchw(W)

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            # If output scale/zp aren't set, it gets computed from ref fp32 op
            # in DNNLOWP, which isn't possible when we quantize input weights.
            # Make sure atleast one output is collected to compute output
            # scale/zp.
            do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q")
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                strides=[stride] * ndim,
                kernels=kernels,
                dilations=[dilation] * ndim,
                pads=[pad] * (ndim * 2),
                order="NCHW" if fall_back_to_NCHW else order,
                dequantize_output=not do_dequantize,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X,
                                          device_option=gc)
            self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W,
                                          device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            if fall_back_to_NCHW:
                Y = nchw2nhwc(Y)
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs)
Ejemplo n.º 2
0
    def test_dnnlowp_conv_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Conv", "DNNLOWP_16"),
            ("Int8Conv", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            # If output scale/zp aren't set, it gets computed from ref fp32 op
            # in DNNLOWP, which isn't possible when we quantize input weights.
            # Make sure atleast one output is collected to compute output
            # scale/zp.
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity)
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Ejemplo n.º 3
0
    def test_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so exactly represented after quantization
        # This was made sure by having at least one 0 and one 255 for unsigned
        # 8-bit tensors, and at least one -128 and one 127 for signed 8-bit
        # tensors.
        # Since fbgemm_acc16 accumulates to 16-bit, To avoid overflow, we use
        # small numbers except for those 0, 255, -128, and 127, for this test
        # We also make sure 255, -128, or 127 are not multiplied together by
        # putting them in different input channels and the corresponding input
        # channel in other matrix is 0.
        # For example, we put 255 in input channel 1 in X, so we make the
        # corresponding input channel in W all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128  # "zeros"

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (
                "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
            )

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity
                )
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
Ejemplo n.º 4
0
    def test_dnnlowp_fully_connected_int(
        self,
        input_channels,
        output_channels,
        batch_size,
        in_quantized,
        out_quantized,
        weight_quantized,
        gc,
        dc,
    ):
        # X and W have scale 1, so exactly represented after quantization
        X_min = -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, input_channels) * (X_max - X_min) +
            X_min)
        X = X.astype(np.float32)
        # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
        # when multiplied with W_min and W_max
        X[:, 0] = X_min
        X[0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = np.round(
            np.random.rand(output_channels, input_channels) * (W_max - W_min) +
            W_min)
        W = W.astype(np.float32)
        W[0, 0] = W_min
        W[1, 0] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            output_channels,
            X,
            X_min,
            X_max,
            W,
            W_min,
            W_max,
        )

        b = np.random.randn(output_channels).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("FC", ""),
            ("FC", "DNNLOWP"),
            ("FC", "DNNLOWP_16"),
            ("Int8FC", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q")
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            fc = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
            net.Proto().op.extend([fc])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
Ejemplo n.º 5
0
    def test_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        if nbits_in_non_outlier == 0:
            X, W, b = generate_conv_inputs(
                stride,
                pad,
                kernel,
                dilation,
                size,
                group,
                input_channels_per_group,
                output_channels_per_group,
                batch_size,
                order,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
            )
        else:
            X_min = 0 if preserve_activation_sparsity else -77
            X_max = X_min + 255
            X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
            X = np.round(X).astype(np.float32)
            X[..., 0] = X_min
            X[0, 0, 0, 1] = X_max

            if preserve_weight_sparsity:
                W_min = -128
                W_max = 100
            else:
                W_min = -100
                W_max = W_min + 255
            W = (
                np.random.rand(
                    output_channels, kernel, kernel, input_channels_per_group
                )
                * 4
                - 2
                + W_min
                + 128
            )
            W = np.round(W).astype(np.float32)
            W[0, 0, 0, 0] = W_min
            W[1, 0, 0, 0] = W_max
            W[..., 1] = W_min + 128

            # No input quantization error in bias
            b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = "DNNLOWP" in engine and weight_quantized
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed"
                    if do_prepack_weight
                    else ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
    def test_dnnlowp_group_norm(
        self,
        N,
        G,
        K,
        H,
        W,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        gc,
        dc,
    ):
        C = G * K

        X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
        if order == "NHWC":
            X = utils.NCHW2NHWC(X)
        gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
        beta = np.random.randn(C).astype(np.float32) - 0.5

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("GroupNorm", ""),
            ("GroupNorm", "DNNLOWP"),
            ("Int8GroupNorm", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, gamma_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    gamma, "gamma_q")
                net.Proto().op.extend([int8_given_tensor_fill])

                X_min = 0 if X.size == 0 else X.min()
                X_max = 0 if X.size == 0 else X.max()
                X_q_param = dnnlowp_utils.choose_quantization_params(
                    X_min, X_max)
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    beta, "beta_q", X_q_param, gamma_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            group_norm = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "gamma_q" if do_quantize_weight else "gamma",
                    "beta_q" if do_quantize_weight else "beta",
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=0 if do_dequantize else 1,
                group=G,
                order=order,
                is_test=True,
                engine=engine,
                device_option=gc,
            )

            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    group_norm, outputs[0][0])

            net.Proto().op.extend([group_norm])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("gamma").feed(gamma, device_option=gc)
            self.ws.create_blob("beta").feed(beta, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs, atol_scale=2.0)
    def test_dnnlowp_fully_connected_int(
        self,
        input_channels,
        output_channels,
        batch_size,
        in_quantized,
        out_quantized,
        weight_quantized,
        prepack_weight,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        fuse_relu,
        output_packed_bias,
        gc,
        dc,
    ):
        # X and W have scale 1, so exactly represented after quantization
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
        )
        X = X.astype(np.float32)
        # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
        # when multiplied with W_min and W_max
        X[:, 0] = X_min
        if batch_size != 0:
            X[0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = np.round(
            np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
        )
        W = W.astype(np.float32)
        W[0, 0] = W_min
        W[1, 0] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            output_channels,
            X,
            X_min,
            X_max,
            W,
            W_min,
            W_max,
        )

        b = np.random.randn(output_channels).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [("FC", "")]
        if fuse_relu:
            op_engine_list += [("Int8FCRelu", "DNNLOWP")]
        else:
            op_engine_list += [
                ("FC", "DNNLOWP"),
                ("FC", "DNNLOWP_16"),
                ("Int8FC", "DNNLOWP"),
            ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (
                engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
            )
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max, preserve_activation_sparsity
            )
            w_q_param = None
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8FCPackWeight",
                    inputs,
                    ["W_packed", "B_q32"] if do_dequantize and output_packed_bias else ["W_packed"],
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            fc = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed"
                    if do_prepack_weight
                    else ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=not do_dequantize,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    fc, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([fc])
            if fuse_relu and "DNNLOWP" not in engine:
                net.Relu(["Y"], "Y")

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(
                self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
            )

            if output_packed_bias and do_prepack_weight and do_dequantize:
                bias_int32 = self.ws.blobs["B_q32"].fetch()
                if do_quantize_weight:
                    np.testing.assert_equal(bias_int32[0], np.round(b / (x_q_param.scale * w_q_param.scale)))
                np.testing.assert_equal(bias_int32[0].dtype, np.int32)

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
Ejemplo n.º 8
0
    def test_dnnlowp_batch_matmul_int_constant_B(
        self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
    ):
        batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
        batch_dims_A = batch_dims[-C_1:]
        batch_dims_B = batch_dims[-C_2:]
        A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
        B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

        if np.prod(batch_dims) > 0:
            for index in np.ndindex(batch_dims_A):
                # When both input and output are float, each input of the batch has
                # scale 1 but with different offset, so input-wise quantization
                # shouldn't have any input quantization error
                # A_min = -77 if (A_quantized or out_quantized) else -77 + i
                A_min = -77
                A_max = A_min + 255
                A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
                # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw
                # when multiplied with B_min and B_max
                A[index][:, 0] = A_min
                A[index][0, 1] = A_max

            i = 0
            for index in np.ndindex(batch_dims_B):
                # When weight is quantized in a lazy manner, each input of the batch has
                # scale 1 but with different offset, so input-wise quantization
                # shouldn't have any input quantization error when weight is quantized
                # in a lazy manner.
                B_min = -100 if B_quantized else -100 + i
                # B_min = -100
                B_max = B_min + 255
                B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
                B[index][0, 0] = B_min
                B[index][1, 0] = B_max

                if C_1 > C_2:
                    # A has more dims
                    for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                        avoid_vpmaddubsw_overflow_fc(
                            m,
                            k,
                            n,
                            A[outer_index] if C_2 == 0 else A[outer_index + index],
                            A_min,
                            A_max,
                            B[index],
                            B_min,
                            B_max,
                        )
                else:
                    avoid_vpmaddubsw_overflow_fc(
                        m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                    )
                i += 1

        for trans_a, trans_b in product([0, 1], [0, 1]):
            Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
            outputs = []

            op_engine_list = [
                ("BatchMatMul", ""),
                ("BatchMatMul", "DNNLOWP"),
                ("Int8BatchMatMul", "DNNLOWP"),
            ]

            for op_type, engine in op_engine_list:
                net = core.Net("test_net")

                do_quantize_A = "DNNLOWP" in engine and A_quantized
                do_quantize_B = "DNNLOWP" in engine and B_quantized
                do_dequantize = "DNNLOWP" in engine and out_quantized

                if do_quantize_A:
                    quantize_A = core.CreateOperator(
                        "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([quantize_A])

                if do_quantize_B:
                    int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                        B if trans_b else B.swapaxes(-1, -2), "B_q"
                    )
                    net.Proto().op.extend([int8_given_tensor_fill])

                batch_matmul = core.CreateOperator(
                    op_type,
                    ["A_q" if do_quantize_A else "A", "B_q" if do_quantize_B else "B"],
                    ["Y_q" if do_dequantize else "Y"],
                    trans_a=trans_a,
                    trans_b=trans_b,
                    broadcast=True,
                    constant_B=True,
                    dequantize_output=not do_dequantize,
                    engine=engine,
                    device_option=gc,
                )
                if do_quantize_B:
                    # When quantized weight is provided, we can't rescale the
                    # output dynamically by looking at the range of output of each
                    # batch, so here we provide the range of output observed from
                    # fp32 reference implementation
                    dnnlowp_utils.add_quantization_param_args(
                        batch_matmul, outputs[0][0]
                    )
                net.Proto().op.extend([batch_matmul])

                if do_dequantize:
                    dequantize = core.CreateOperator(
                        "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([dequantize])

                self.ws.create_blob("A").feed(
                    A.swapaxes(-1, -2) if trans_a else A, device_option=gc
                )
                self.ws.create_blob("B").feed(
                    B if trans_b else B.swapaxes(-1, -2), device_option=gc
                )
                self.ws.run(net)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
                )

            if np.prod(batch_dims) > 0:
                check_quantized_results_close(outputs)
Ejemplo n.º 9
0
    def test_dnnlowp_conv_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        weight_quantized,
        prepack_weight,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume((not prepack_weight) or order == "NHWC")

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Conv", "DNNLOWP_16"),
            ("Int8Conv", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            # If output scale/zp aren't set, it gets computed from ref fp32 op
            # in DNNLOWP, which isn't possible when we quantize input weights.
            # Make sure atleast one output is collected to compute output
            # scale/zp.
            do_quantize_weight = (
                engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
            )
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed"
                    if do_prepack_weight
                    else ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(
                self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
            )

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)