Example #1
def _do_bias_and_requantize(output, bias, input_scale, weight_scale,
                            output_scale, output_zero_point, with_relu):
    """Output processing for conv and linear"""
    # this is a vector for per channel case
    requant_input_scale = _expr.const(
        _get_numpy(input_scale) * _get_numpy(weight_scale))
    # Torch does the bias add and requantize scaling in fp32
    # refer to third_party/fbgemm/include/fbgemm/OutputProcessing-inl.h
    # Instead, we do bias add in int32 and use qnn requantize, which needs
    # integer input.
    # We observed no loss in accuracy doing it this way, and it is better
    # for TVM because bias quantization can be done at compile time.
    # The Torch way, in contrast, requires rounding the activation at runtime.

    if bias is not None:
        requantize_input = _op.nn.bias_add(output, bias)
    else:
        requantize_input = output

    requantized = relay.qnn.op.requantize(
        requantize_input,
        requant_input_scale,
        relay.const(0, "int32"),
        output_scale,
        output_zero_point,
        out_dtype="int32",
        axis=1,
    )
    clip_min = 0
    if with_relu:
        clip_min = _get_scalar(output_zero_point)

    clip = _op.tensor.clip(requantized, clip_min, 255.0)
    return _op.cast(clip, dtype="uint8")
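The requantize step maps the int32 accumulator (plus the int32 bias) onto the output scale and zero point. A rough NumPy sketch of the arithmetic performed by the qnn.requantize plus clip/cast sequence above, with made-up scales and zero point (the exact rounding mode of qnn.requantize may differ):

import numpy as np

acc = np.array([1234, -56, 789], dtype=np.int32)  # conv/linear accumulator + int32 bias
input_scale, weight_scale = 0.02, 0.005           # per-tensor case for simplicity
output_scale, output_zero_point = 0.1, 128

# effective scale = input_scale * weight_scale / output_scale
requant_scale = input_scale * weight_scale / output_scale
out = np.round(acc * requant_scale) + output_zero_point
print(np.clip(out, 0, 255).astype(np.uint8))      # [129 128 129]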
Example #2
    def _impl(inputs, _):
        # refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
        # math for calculating the output scale and zp is already done
        # during _add_output_quant_params_to_scalar_op above
        assert len(inputs) == 6, "Input quant params not found in op inputs"
        other_val = inputs[1]  # scalar

        if other_val > 0.0:
            # only scale change
            return inputs[0]
        if other_val == 0.0:
            shape = infer_shape(inputs[0])
            return _op.full(_expr.const(0), shape, dtype="uint8")

        # negative scale case
        q_min = 0
        q_max = 255
        bias = _expr.const(q_max + q_min, dtype="int8")
        int8 = bias - _op.cast(inputs[0], "int8")
        return _op.cast(int8, "uint8")
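For a negative scalar the sign of every dequantized value flips, which in the quantized domain is a mirror around the range: q_out = (q_max + q_min) - q_in. The code above computes this via int8 arithmetic and a final cast back to uint8; a plain NumPy illustration of the same mirroring, done in int32 for clarity:

import numpy as np

q = np.array([0, 1, 128, 254, 255], dtype=np.uint8)
q_min, q_max = 0, 255
mirrored = ((q_max + q_min) - q.astype(np.int32)).astype(np.uint8)
print(mirrored)  # [255 254 127   1   0]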
Example #3
    def _calculate_qparam(inp):
        # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp
        # ChooseQuantizationParams function
        mn = _op.min(inp)
        mx = _op.max(inp)

        # Ensure that the interval contains 0
        mn = _op.minimum(mn, _op.const(0.0, dtype="float32"))
        mx = _op.maximum(mx, _op.const(0.0, dtype="float32"))

        qmax = 255

        # reduce_range became True in v1.6
        if is_version_greater_than("1.5.1"):
            qmax = 127

        scale = (mx - mn) / _expr.const(qmax, dtype="float32")

        zero_point_from_min = -(mn / scale)
        zero_point = _op.cast(_op.round(_op.clip(zero_point_from_min, 0.0, qmax)), "int32")

        return scale, zero_point
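An eager NumPy version of the same ChooseQuantizationParams logic on a made-up input (the real function builds Relay ops, so the parameters are computed at runtime):

import numpy as np

inp = np.array([-1.5, 0.2, 3.0], dtype=np.float32)
mn = min(inp.min(), 0.0)   # interval must contain 0
mx = max(inp.max(), 0.0)
qmax = 127                 # 255 before PyTorch 1.6 (reduce_range=False)

scale = (mx - mn) / qmax
zero_point = int(round(np.clip(-mn / scale, 0.0, qmax)))
print(scale, zero_point)   # ~0.0354, 42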
Example #4
def quantized_relu(data, input_zero_point):
    # refer to aten/src/ATen/native/quantized/cpu/qrelu.cpp
    zp = _op.cast(input_zero_point, dtype="uint8")
    return _op.tensor.maximum(data, zp)
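Clamping at the zero point in the quantized domain is ReLU in the real domain, since values at or below the zero point dequantize to values at or below 0. A NumPy illustration with made-up quantization parameters:

import numpy as np

scale, zp = 0.1, 128
q = np.array([100, 128, 200], dtype=np.uint8)   # dequantized: -2.8, 0.0, 7.2
relu_q = np.maximum(q, np.uint8(zp))
print((relu_q.astype(np.int32) - zp) * scale)   # [0.  0.  7.2]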
Example #5
def apply_with_upcast(data, func):
    inp = _op.cast(data, dtype="int32")
    out = func(inp)
    return _op.cast(out, "uint8")
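The int32 upcast matters because arithmetic on uint8 inputs (for example the summation inside an average pool) would wrap around; a toy NumPy illustration of the overflow the upcast avoids:

import numpy as np

a = np.array([200], dtype=np.uint8)
b = np.array([100], dtype=np.uint8)
print(a + b)                    # [44]  uint8 addition wraps around
print(a.astype(np.int32) + b)   # [300] int32 accumulation is exact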
Example #6
def get_reverse_mode_result(e, d, t):
    assert isinstance(t, TensorType)
    return op.cast(e * d, 'float32')

def quantized_adaptive_avg_2d(data, func_fp32):
    # this follows tflite impl
    inp = _op.cast(data, dtype="int32")
    out = func_fp32(inp)
    return _op.cast(out, "uint8")