def set_add_in_scale(qrec):
    scaled_idx = qrec.cache.get('scaled_idx')
    if scaled_idx is None:
        scaled_idx = (1 if qrec.in_qs[1].scale > qrec.in_qs[0].scale else 0)
        qrec.cache['scaled_idx'] = scaled_idx
    compute_in_out_scale(qrec, in_idx=0 if scaled_idx else 1)
    scale_in_mul_biases_q = qrec.cache.get('scale_in_mul_biases_q')
    if scale_in_mul_biases_q is None:
        scale_in_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_in_mul_biases_q'] = scale_in_mul_biases_q
    not_scaled_idx = 0 if scaled_idx else 1
    scale = qrec.in_qs[scaled_idx].scale / qrec.in_qs[not_scaled_idx].scale
    scale_in_mul_biases_q.scale = scale

    if qrec.in_qs[0].asymmetric:
        # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb =
        # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc =
        #   = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa))
        #                           |---------- bias ----------|
        add_bias = (qrec.out_qs[0].zero_point -
                    qrec.cache['scale_mul_biases_q'].scale *
                    (qrec.in_qs[not_scaled_idx].zero_point +
                     scale_in_mul_biases_q.scale *
                     qrec.in_qs[scaled_idx].zero_point))
    else:
        add_bias = 0
    qrec.cache['add_bias_offset'] = np.round(add_bias).astype(np.int16)
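
To make the cached quantities concrete, here is a minimal float-arithmetic sketch of an add kernel that consumes them. This is a hypothetical helper, not the real GAP kernel: the real kernel applies MultMulBiasScaleQType as an integer multiplier/shift pair rather than a float scale, and the int8 clip range is an assumption.

import numpy as np

# Hypothetical illustration only: float math instead of the integer
# multiplier/shift pairs the real kernel derives from MultMulBiasScaleQType.
def quantized_add_sketch(qrec, a_q, b_q):
    inps = [a_q.astype(np.int32), b_q.astype(np.int32)]
    scaled_idx = qrec.cache['scaled_idx']
    # bring the larger-scale input into the other input's scale domain
    inps[scaled_idx] = np.round(
        qrec.cache['scale_in_mul_biases_q'].scale * inps[scaled_idx])
    acc = inps[0] + inps[1]
    # rescale the sum to the output scale, then fold in the zero-point bias
    out = np.round(qrec.cache['scale_mul_biases_q'].scale * acc)
    out += qrec.cache['add_bias_offset']
    return np.clip(out, -128, 127).astype(np.int8)  # assumes int8 output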
Example #2
    def _quantize(cls, params, in_qs, stats, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # only attempt channel scaling if the second input is constant
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if in2_node:
            kwargs['graph_update']['requires_adjust'] = True
            in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                        quantized_dimension=0,
                                        dtype=np.int8,
                                        narrow_range=True,
                                        bits=8)
        else:
            in_q2 = in_qs[1].make_symmetric_signed()

        in_q1 = in_qs[0].make_symmetric_signed()

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # the output can only be forced to symmetric np.int8
            if o_q.dtype != np.int8 or o_q.asymmetric:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q)
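
The invariant behind _quantize is that the int32 accumulator of an int8 x int8 matmul carries scale in_q1.scale * in_q2.scale (which is why biases_q is given exactly that scale), so mul_biases_q.scale is the factor that converts accumulator counts into output counts. A tiny numeric sketch with made-up scales:

import numpy as np

s_in1, s_in2, s_out = 0.02, 0.01, 0.05   # hypothetical scales
requant = s_in1 * s_in2 / s_out          # what mul_biases_q.scale holds
acc = np.int32(1000)                     # an int32 accumulator count
out = np.round(acc * requant)            # 1000 * 0.004 -> 4 output counts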
Example #3
def compute_in_out_scale(qrec, in_idx=0, out_idx=0, extra_scale=1):
    if isinstance(in_idx, int):
        in_scale = qrec.in_qs[in_idx].scale
    else:
        in_scale = reduce(lambda x, y: x * y,
                          [qrec.in_qs[idx].scale for idx in in_idx])
    if isinstance(out_idx, int):
        out_scale = qrec.out_qs[out_idx].scale
    else:
        out_scale = reduce(lambda x, y: x * y,
                           [qrec.out_qs[idx].scale for idx in out_idx])
    scale_mul_biases_q = qrec.cache.get('scale_mul_biases_q')
    if scale_mul_biases_q is None:
        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q

    scale = in_scale * extra_scale / out_scale
    scale_mul_biases_q.scale = scale
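
compute_in_out_scale accepts either a single index or an iterable for in_idx/out_idx; in the iterable case the scales are multiplied together (reduce here is functools.reduce, which the original module is assumed to import). A short sketch with made-up scales showing the effective result of both call forms:

from functools import reduce

in_scales, out_scale = [0.02, 0.01], 0.05  # hypothetical scales
single = in_scales[0] / out_scale          # in_idx=0      -> 0.4
multi = reduce(lambda x, y: x * y,
               in_scales) / out_scale      # in_idx=(0, 1) -> 0.004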
Example #5
    def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # only attempt channel scaling if the second input is constant
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if not in2_node:
            raise ValueError(
                f"matmul {params.name} is not supported on NE16: "
                f"the second input must be constant")

        w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
        h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
        h2_padded = roundup(h2, input_bits == 16)
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.uint8,
                                    narrow_range=True,
                                    bit_pack=opts['weight_bits'],
                                    no_compression=True,
                                    bits=opts['weight_bits'],
                                    resize=((h2, w2), (h2_padded, w2)))

        in_q1 = QType.from_min_max_sq(in_qs[0].min_val,
                                      in_qs[0].max_val,
                                      dtype=input_dtype,
                                      asymmetric=True)
        in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                      opts['weight_bits'])

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            force_output_size = opts.get('force_output_size', 8)
            out_dtype = np.uint8 if force_output_size == 8 else np.uint16
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dont_copy_attr=['ne16'],
                                        asymmetric=True,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32,
                             scale=in_q1.scale * in_q2.scale,
                             ne16_biases=(input_bits != 16))
            # calculate bias offset - this will be added to the bias in the kernel
            # it is already in quantized form
            bias_offset = np.zeros((in2_node.dqvalue.shape[0], ),
                                   dtype=np.int32)
            if in_q1.zero_point != 0:
                # input zero-point correction: -sum((Wq - Zw) * Zin) per
                # output channel (weights are channel scaled)
                bias_offset -= np.sum(
                    np.multiply(in_q1.zero_point,
                                in2_node.value_as(in_q2).astype(np.int32) -
                                in_q2.zero_point,
                                dtype=np.int32),
                    dtype=np.int32,
                    axis=1)
            if o_q.zero_point != 0:
                # output zero-point correction: Zo * So / (Si * Sw) per
                # output channel (weights are channel scaled)
                scale = o_q.scale / (in_q1.scale * in_q2.scale)
                bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(
                    np.int32)
            if not np.all(bias_offset == 0):
                biases_q.offset = bias_offset
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
        o_q.attr.ne16 = True

        if input_bits == 16:
            prenorm = min(np.min(mul_biases_q.qnorms), 8)
        else:
            prenorm = 0
        mul_biases_q.pre_normalization = prenorm

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q,
                           ne16=True)
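
The two bias_offset corrections can be checked in isolation: folding the input zero-point into the bias contributes -sum((Wq - Zw) * Zin) per output channel, and folding the output zero-point contributes round(Zo * So / (Si * Sw)) in accumulator counts. A standalone numpy sketch with made-up values (not the NE16 kernel itself):

import numpy as np

w_q = np.array([[10, 200, 30], [40, 5, 60]], dtype=np.int32)  # 2 out channels
zw, zin, zo = 128, 7, 128      # hypothetical weight/input/output zero points
si, sw, so = 0.05, 0.002, 0.1  # hypothetical input/weight/output scales

bias_offset = np.zeros(w_q.shape[0], dtype=np.int32)
# input zero-point correction, per output channel
bias_offset -= np.sum((w_q - zw) * zin, axis=1, dtype=np.int32)
# output zero-point correction, expressed in accumulator counts
bias_offset += np.floor(zo * so / (si * sw) + 0.5).astype(np.int32)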