Example 1
def set_add_in_scale(qrec):
    scaled_idx = qrec.cache.get('scaled_idx')
    if scaled_idx is None:
        scaled_idx = (1 if qrec.in_qs[1].scale > qrec.in_qs[0].scale else 0)
        qrec.cache['scaled_idx'] = scaled_idx
    compute_in_out_scale(qrec, in_idx=0 if scaled_idx else 1)
    scale_in_mul_biases_q = qrec.cache.get('scale_in_mul_biases_q')
    if scale_in_mul_biases_q is None:
        scale_in_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_in_mul_biases_q'] = scale_in_mul_biases_q
    not_scaled_idx = 0 if scaled_idx else 1
    scale = qrec.in_qs[scaled_idx].scale / qrec.in_qs[not_scaled_idx].scale
    scale_in_mul_biases_q.scale = scale

    if qrec.in_qs[0].asymmetric:
        # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb =
        # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc =
        #   = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa))
        #                           |---------- bias ----------|
        add_bias = (qrec.out_qs[0].zero_point -
                    qrec.cache['scale_mul_biases_q'].scale *
                    (qrec.in_qs[not_scaled_idx].zero_point +
                     scale_in_mul_biases_q.scale *
                     qrec.in_qs[scaled_idx].zero_point))
    else:
        add_bias = 0
    qrec.cache['add_bias_offset'] = np.round(add_bias).astype(np.int16)
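
The comment block above derives the fused bias. As a quick sanity check, here is a standalone sketch (plain numpy, not the library code; all scales and zero points are made-up values) that verifies the identity numerically:

import numpy as np

# made-up per-tensor quantization parameters
sa, za = 0.05, 3      # scale / zero point of the non-scaled input A
sb, zb = 0.02, -7     # scale / zero point of the scaled input B
sc, zc = 0.06, 5      # scale / zero point of the output C

a = np.array([10, 20, 120], dtype=np.int32)
b = np.array([-5, 40, 90], dtype=np.int32)

# reference: dequantize, add, requantize
ref = ((a - za) * sa + (b - zb) * sb) / sc + zc
# fused form: C = Sa/Sc*(A + B*Sb/Sa) + bias, with the bias from the comment
bias = zc - (sa / sc) * (za + zb * (sb / sa))
fused = (sa / sc) * (a + b * (sb / sa)) + bias
assert np.allclose(ref, fused)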
Example 2
    def _quantize(cls, params, in_qs, stats, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # only attempt channel scaling if the second input is constant
        # if len(in_qs) > 2:
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if in2_node:
            kwargs['graph_update']['requires_adjust'] = True
            in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                        quantized_dimension=0,
                                        dtype=np.int8,
                                        narrow_range=True,
                                        bits=8)
        else:
            in_q2 = in_qs[1].make_symmetric_signed()

        in_q1 = in_qs[0].make_symmetric_signed()

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # can't be forced to something not np.int8
            if o_q.dtype != np.int8 or o_q.asymmetric:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q)
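
The returned record carries mul_biases_q.scale = S1*S2/So, the single factor that moves the int32 accumulator of the int8 product into the output scale. A minimal standalone sketch of that algebra (made-up scales, plain numpy, not the actual kernel):

import numpy as np

s1, s2, so = 0.02, 0.005, 0.04            # made-up input/weight/output scales
x = np.array([50, -30], dtype=np.int32)   # quantized input, scale s1
w = np.array([90, 70], dtype=np.int32)    # quantized weight, scale s2

acc = x * w                               # int32 accumulator, scale s1*s2
out = np.round(acc * (s1 * s2 / so))      # requantize with one multiplier
ref = np.round((x * s1) * (w * s2) / so)  # float reference
assert np.array_equal(out, ref)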
Example 3
def set_add_in_scale(qrec):
    scaled_idx = qrec.cache.get('scaled_idx')
    if scaled_idx is None:
        scaled_idx = (1 if qrec.in_qs[1].scale > qrec.in_qs[0].scale else 0)
        qrec.cache['scaled_idx'] = scaled_idx
    compute_in_out_scale(qrec, in_idx=0 if scaled_idx else 1)
    scale_in_mul_biases_q = qrec.cache.get('scale_in_mul_biases_q')
    if scale_in_mul_biases_q is None:
        scale_in_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_in_mul_biases_q'] = scale_in_mul_biases_q
    not_scaled_idx = 0 if scaled_idx else 1
    scale = qrec.in_qs[scaled_idx].scale / qrec.in_qs[not_scaled_idx].scale
    scale_in_mul_biases_q.scale = scale
Example 4
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.int8,
            narrow_range=opts['narrow_weights'])
        if fusion and fusion.fusion_type in [
                'conv_active_pool', 'conv_active'
        ]:
            stats = kwargs['all_stats'][NodeId(fusion,
                                               fusion.contained_nodes()[0])]

            if isinstance(
                    fusion.contained_nodes()[1],
                (SigmoidActivationParameters, TanHActivationParameters,
                 HSwishActivationParameters)):
                stats = kwargs['all_stats'][NodeId(
                    fusion,
                    fusion.contained_nodes()[0])]
            elif fusion and isinstance(fusion.contained_nodes()[1],
                                       HSigmoidActivationParameters):
                # Hard sigmoid implements a ReLU; make sure 6 is representable
                min_val, max_val = 0, 6
            else:
                # Take stats from activation after the convolution
                stats = kwargs['all_stats'][NodeId(
                    fusion,
                    fusion.contained_nodes()[1])]

        if min_val is None or max_val is None:
            min_val, max_val = stats['range_out'][0]['min'], stats[
                'range_out'][0]['max']

        if force_out_q:
            o_q = force_out_q
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype)
        biases_q = QType(dtype=np.int32,
                         scale=weights_q.scale * in_qs[0].scale)
        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_qs[0], weights_q, o_q, params)
        # returning the new weights and biases qs will force backprop
        # TODO - ACC_Q LOOKS WRONG AFTER THIS
        return MultScalableFilterQuantizationRecord(
            in_qs=[in_qs[0], weights_q, biases_q],
            out_qs=[o_q],
            acc_q=biases_q,
            calc_q=biases_q,
            mul_biases_q=mul_biases_q)
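
QType.from_array_sq above builds symmetric int8 weights with one scale per output channel (the quantized_dimension). A standalone sketch of that scheme, assuming narrow-range symmetric quantization (the helper below is illustrative, not the library implementation):

import numpy as np

def quantize_per_channel_sym(weights, narrow_range=True):
    # one scale per output channel (axis 0), symmetric around zero
    qmin = -127 if narrow_range else -128
    absmax = np.max(np.abs(weights.reshape(weights.shape[0], -1)), axis=1)
    scales = absmax / 127
    q = np.clip(np.round(weights / scales[:, None, None]), qmin, 127)
    return q.astype(np.int8), scales

w = np.random.default_rng(0).normal(size=(4, 3, 3)).astype(np.float32)
qw, scales = quantize_per_channel_sym(w)
# reconstruction error is bounded by half a quantization step per channel
err = np.abs(w - qw * scales[:, None, None])
assert np.all(err <= scales[:, None, None] / 2 + 1e-7)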
Example 5
def compute_in_out_scale(qrec, in_idx=0, out_idx=0, extra_scale=1):
    if isinstance(in_idx, int):
        in_scale = qrec.in_qs[in_idx].scale
    else:
        in_scale = reduce(lambda x, y: x * y,
                          [qrec.in_qs[idx].scale for idx in in_idx])
    if isinstance(out_idx, int):
        out_scale = qrec.out_qs[out_idx].scale
    else:
        out_scale = reduce(lambda x, y: x * y,
                           [qrec.out_qs[idx].scale for idx in out_idx])
    scale_mul_biases_q = qrec.cache.get('scale_mul_biases_q')
    if scale_mul_biases_q is None:
        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q

    scale = in_scale * extra_scale / out_scale
    scale_mul_biases_q.scale = scale
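
MultMulBiasScaleQType holds the float scale computed here; what makes it usable in an integer kernel is that the scale is later decomposed into an 8-bit multiplier plus a right shift. A standalone sketch of that decomposition (my assumption of the mechanism; the helper name is made up):

import numpy as np

def to_qbias_qnorm(scale, bits=8):
    # largest shift such that the rounded multiplier still fits in `bits` bits
    qmax = 2 ** bits - 1
    norm = 0
    while norm < 31 and round(scale * 2 ** (norm + 1)) <= qmax:
        norm += 1
    return int(round(scale * 2 ** norm)), norm

qbias, qnorm = to_qbias_qnorm(0.1317)       # e.g. (135, 10)
x = np.int32(1000)
print((x * qbias) >> qnorm, 1000 * 0.1317)  # 131 vs 131.7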
Example 6
def set_ssd_scales(qrec, params):
    offset_q = qrec.in_qs[0]
    anchors_q = qrec.in_qs[2]
    out_boxes_q = qrec.out_qs[0]
    for k in [
            'scale_x_q', 'scale_x_anc_q', 'scale_y_q', 'scale_y_anc_q',
            'scale_h_q', 'scale_w_q', 'scale_ao_q'
    ]:
        if k not in qrec.cache:
            qrec.cache[k] = MultMulBiasScaleQType(dtype=np.uint8)

    qrec.cache['scale_x_q'].scale = (offset_q.scale * anchors_q.scale) / \
        (out_boxes_q.scale * params.x_scale)
    qrec.cache['scale_x_anc_q'].scale = params.x_scale / offset_q.scale
    qrec.cache['scale_y_q'].scale = (offset_q.scale * anchors_q.scale) / \
        (out_boxes_q.scale * params.y_scale)
    qrec.cache['scale_y_anc_q'].scale = params.y_scale / offset_q.scale
    qrec.cache['scale_h_q'].scale = offset_q.scale / params.h_scale
    qrec.cache['scale_w_q'].scale = offset_q.scale / params.w_scale
    qrec.cache['scale_ao_q'].scale = anchors_q.scale * \
        2**(-15) / out_boxes_q.scale
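
Each cached qtype folds several float factors into a single multiplier; scale_x_q, for instance, combines dequantizing the offset and the anchor, dividing by the SSD x_scale and requantizing to the output scale. A standalone arithmetic check with made-up values:

offset_scale, anchor_scale, out_scale, x_scale = 0.004, 0.02, 0.05, 10.0
scale_x = (offset_scale * anchor_scale) / (out_scale * x_scale)

q_off, q_anc = 57, 103               # made-up quantized offset and anchor
one_mul = (q_off * q_anc) * scale_x  # the single multiply done in the kernel
stepwise = ((q_off * offset_scale / x_scale) *
            (q_anc * anchor_scale)) / out_scale
assert abs(one_mul - stepwise) < 1e-9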
Example 7
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None
        in_qs = deepcopy(in_qs)
        # qrecs = kwargs['qrecs']
        G = kwargs['G']

        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=out_dtype)

        # input_nodes = {GRUParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        #                for edge in G.in_edges(params.name)
        #                if isinstance(edge.from_node, ConstantInputParameters)}

        names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}

        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        # if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        #     LOG.info(
        #         "node %s has similar input and i_state scales --> "
        #         "will be generated the same_scale kernel with better performance", params.name)
        #     params.rnn_same_inout_scale = True
        #     G.node_options[NodeId(params)] = params.at_options

        if params.rnn_same_inout_scale:
            wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                               in_qs[names['r_2_z_w']].scale)
            wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                               in_qs[names['r_2_r_w']].scale)
            wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                               in_qs[names['r_2_h_w']].scale)
            i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
            in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            in_scale = state_scale = in_q.scale
        else:
            wWz_scale = in_qs[names['w_2_z_w']].scale
            wWr_scale = in_qs[names['w_2_r_w']].scale
            wWh_scale = in_qs[names['w_2_h_w']].scale
            rWz_scale = in_qs[names['r_2_z_w']].scale
            rWr_scale = in_qs[names['r_2_r_w']].scale
            rWh_scale = in_qs[names['r_2_h_w']].scale
            in_scale = in_qs[0].scale
            in_q = in_qs[0]
            state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            state_scale = state_q.scale
        i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) /
                                           (rWz_scale * state_scale))
        i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) /
                                           (rWr_scale * state_scale))
        i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) /
                                           (rWh_scale * state_scale))

        i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
        h_WR_2_int_q = MultMulBiasScaleQType(scale=(rWh_scale * state_scale) /
                                             i_qtype.scale)
        r_WR_2_int_q = MultMulBiasScaleQType(scale=(rWr_scale * state_scale) /
                                             i_qtype.scale)
        z_WR_2_int_q = MultMulBiasScaleQType(scale=(rWz_scale * state_scale) /
                                             i_qtype.scale)

        if not params.rnn_states_as_inputs:
            in_qs[names['h_state']].scale = state_q.scale
            # cls.rescale_constant(input_nodes['h_state'], state_q.scale, qrecs)
        in_qs[0].scale = in_scale
        o_q.scale = state_scale

        in_qs[names['z_b']].scale = in_scale * rWz_scale
        in_qs[names['z_b']].dtype = BIAS_DTYPE
        # cls.rescale_constant(input_nodes['z_b'], in_scale * rWz_scale, qrecs, dtype=BIAS_DTYPE)
        in_qs[names['r_b']].scale = in_scale * rWr_scale
        in_qs[names['r_b']].dtype = BIAS_DTYPE
        # cls.rescale_constant(input_nodes['r_b'], in_scale * rWr_scale, qrecs, dtype=BIAS_DTYPE)
        in_qs[names['w_h_b']].scale = in_scale * wWh_scale
        in_qs[names['w_h_b']].dtype = BIAS_DTYPE
        # cls.rescale_constant(input_nodes['w_h_b'], in_scale * wWh_scale, qrecs, dtype=BIAS_DTYPE)
        in_qs[names['r_h_b']].scale = in_scale * rWh_scale
        in_qs[names['r_h_b']].dtype = BIAS_DTYPE
        # cls.rescale_constant(input_nodes['r_h_b'], state_scale * rWh_scale, qrecs, dtype=BIAS_DTYPE)

        in_qs[names['w_2_z_w']].scale = wWz_scale
        in_qs[names['w_2_z_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['w_2_z_w'], wWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
        in_qs[names['w_2_r_w']].scale = wWr_scale
        in_qs[names['w_2_r_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['w_2_r_w'], wWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
        in_qs[names['w_2_h_w']].scale = wWh_scale
        in_qs[names['w_2_h_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['w_2_h_w'], wWh_scale, qrecs, dtype=WEIGHTS_DTYPE)
        in_qs[names['r_2_z_w']].scale = rWz_scale
        in_qs[names['r_2_z_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['r_2_z_w'], rWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
        in_qs[names['r_2_r_w']].scale = rWr_scale
        in_qs[names['r_2_r_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['r_2_r_w'], rWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
        in_qs[names['r_2_h_w']].scale = rWh_scale
        in_qs[names['r_2_h_w']].dtype = WEIGHTS_DTYPE
        # cls.rescale_constant(input_nodes['r_2_h_w'], rWh_scale, qrecs, dtype=WEIGHTS_DTYPE)

        return MultScalableGRUQuantizationRecord(in_qs=in_qs,
                                                 out_qs=[o_q],
                                                 i_2_z_WR_q=i_2_z_WR_q,
                                                 i_2_r_WR_q=i_2_r_WR_q,
                                                 i_2_h_WR_q=i_2_h_WR_q,
                                                 h_WR_2_int_q=h_WR_2_int_q,
                                                 r_WR_2_int_q=r_WR_2_int_q,
                                                 z_WR_2_int_q=z_WR_2_int_q,
                                                 i_qtype=i_qtype,
                                                 scales={
                                                     'w_2_z_w': wWz_scale,
                                                     'w_2_r_w': wWr_scale,
                                                     'w_2_h_w': wWh_scale,
                                                     'r_2_z_w': rWz_scale,
                                                     'r_2_r_w': rWr_scale,
                                                     'r_2_h_w': rWh_scale,
                                                     'in': [in_scale],
                                                     'state': state_scale,
                                                     'out': [state_scale]
                                                 })
Example 8
    def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # the NE16 matmul requires the second input to be constant
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if not in2_node:
            raise ValueError(
                f"matmul {params.name} is not supported by NE16: "
                f"the second input must be constant")

        w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
        h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
        h2_padded = roundup(h2, input_bits == 16)
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.uint8,
                                    narrow_range=True,
                                    bit_pack=opts['weight_bits'],
                                    no_compression=True,
                                    bits=opts['weight_bits'],
                                    resize=((h2, w2), (h2_padded, w2)))

        in_q1 = QType.from_min_max_sq(in_qs[0].min_val,
                                      in_qs[0].max_val,
                                      dtype=input_dtype,
                                      asymmetric=True)
        in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                      opts['weight_bits'])

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            force_output_size = opts.get('force_output_size', 8)
            out_dtype = np.uint8 if force_output_size == 8 else np.uint16
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dont_copy_attr=['ne16'],
                                        asymmetric=True,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32,
                             scale=in_q1.scale * in_q2.scale,
                             ne16_biases=(input_bits != 16))
            # calculate bias offset - this will be added to the bias in the kernel
            # it is already in quantized form
            bias_offset = np.zeros((in2_node.dqvalue.shape[0], ),
                                   dtype=np.int32)
            if in_q1.zero_point != 0:
                # input zero correction is sum(W * Zin) by out_c if weights are channel scaled
                bias_offset -= np.sum(
                    np.multiply(in_q1.zero_point,
                                in2_node.value_as(in_q2).astype(np.int32) -
                                in_q2.zero_point,
                                dtype=np.int32),
                    dtype=np.int32,
                    axis=1)
            if o_q.zero_point != 0:
                # output zero correction is So/(Si * Sw) * ZPo by out_c if weights are channel scaled
                scale = o_q.scale / (in_q1.scale * in_q2.scale)
                bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(
                    np.int32)
            if not np.all(bias_offset == 0):
                biases_q.offset = bias_offset
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
        o_q.attr.ne16 = True

        if input_bits == 16:
            prenorm = min(np.min(mul_biases_q.qnorms), 8)
        else:
            prenorm = 0
        mul_biases_q.pre_normalization = prenorm

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q,
                           ne16=True)
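
The input zero-point correction computed above folds the constant term sum_k Zin * (W[c, k] - Zw) into the bias of each output channel, so the kernel never has to subtract Zin at run time. A standalone sketch of that correction (made-up values, plain numpy):

import numpy as np

zin, zw = 12, 128                                        # uint8 zero points
wq = np.array([[130, 120], [140, 100]], dtype=np.int32)  # quantized weights

corr = -np.sum(zin * (wq - zw), dtype=np.int32, axis=1)
# feeding the constant input x = Zin must contribute zero to the accumulator
x = np.full(2, zin, dtype=np.int32)
assert np.all((wq - zw) @ x + corr == 0)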
Example 9
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        G = kwargs['G']
        opts = kwargs['opts']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')
        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=out_dtype)
        if force_out_qs and force_out_qs[0]:
            LOG.warning(
                'on node %s output is being forced from scale %s -> %s',
                params.name, o_q.scale, force_out_qs[0].scale)
            o_q = force_out_qs[0]

        names = {
            val: idx
            for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
        }
        cell_range = stats.get('range_cell')
        if cell_range is None:
            raise ValueError(
                f'cell range not present in stats for {params.name}')
        cell_stat = max(abs(cell_range[var]) for var in ['min', 'max'])
        if params.cell_clip and not params.quant_c_state_with_stat:
            cell_max = params.cell_clip
            ratio_c = cell_max / cell_stat
            if not (ratio_c > 0.9 and ratio_c < 1.1):
                LOG.warning(
                    f"C state is forced to a range [-{cell_max}:{cell_max}] different from the one "
                    f"calculated from the inference statistics [-{cell_stat}:{cell_stat}]; consider "
                    f"using nodeoption {params.name} QUANT_C_STATE_WITH_STAT 1 to force the "
                    "calculated range"
                )
        else:
            cell_max = cell_stat

        cell_int_bits = calc_bits(cell_max)

        in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
        LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
                  in_qs[names['c_state']].range)
        int2_scale = int3_scale = out_tanh_sig_scale = None
        if params.hard_act:
            # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
            # but also (internal_q * 2) + cell_bits = 32
            int_q = min((16 - cell_int_bits), 10)
            int2_scale = math.pow(2, -(int_q * 2))
            int3_scale = math.pow(2, -(int_q * 3))
        else:
            int_q = 12
            # outputs of LUT activations are always Q15
            out_tanh_sig_scale = math.pow(2, -15)
        int_scale = math.pow(2, -int_q)

        scale_pairs = {
            chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
            for chan in ['i', 'o', 'c', 'f']
        }
        for weight_name in [
                weight_name for scale_pair in scale_pairs.values()
                for weight_name in scale_pair
        ]:
            in_qs[names[weight_name]] = deepcopy(in_qs[names[weight_name]])
            in_qs[names[weight_name]].dtype = np.int8
            in_qs[names[weight_name]].bits = opts['weight_bits']

        w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                    for k, (namei, namer) in scale_pairs.items()]
        if (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
                all([(abs(1 - w_scale[0] / w_scale[1]) < 0.2) for w_scale in w_scales]):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "the same_scale kernel will be generated, with better performance",
                params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options

        if params.rnn_same_inout_scale:
            if not (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
               not all([(abs(1 - w_scale[0] / w_scale[1]) < 0.1) for w_scale in w_scales]):
                LOG.warning(
                    "node %s has different input and i_state scales consider using the "
                    "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                    params.name)
            # in and out and state are all in the same scale
            in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
            # i_state scale may be 1 since the value is 0
            # np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
            i_state_scale = in_scale = in_and_out_scale
            in_qs[0].scale = in_scale
            o_q.scale = in_scale
            scales = {
                k: np.maximum(in_qs[names[namei]].scale,
                              in_qs[names[namer]].scale)
                for k, (namei, namer) in scale_pairs.items()
            }
            for k, (namei, namer) in scale_pairs.items():
                in_qs[names[namei]].scale = scales[k]
                in_qs[names[namer]].scale = scales[k]
        else:
            in_scale = in_qs[0].scale
            i_state_scale = o_q.scale
            o_q.scale = i_state_scale

        if not params.rnn_states_as_inputs:
            in_qs[names['i_state']].scale = i_state_scale

        # compute scales for perceptrons
        r_pscales = {
            k: in_qs[names["r_2_%s_w" % k]].scale * i_state_scale
            for k in ['i', 'o', 'c', 'f']
        }
        scale_qtypes = {
            "r_2_%s_q" % k: MultMulBiasScaleQType(scale=r_pscale / int_scale)
            for k, r_pscale in r_pscales.items()
        }

        i_pscales = {
            k: in_qs[names["i_2_%s_w" % k]].scale * in_scale
            for k in ['i', 'o', 'c', 'f']
        }
        # if input and i_state have different scales -> scale the inputs before sum
        # otherwise do nothing and these scales will be ignored
        scale_qtypes.update({
            "i_2_%s_q" % k: MultMulBiasScaleQType(scale=i_pscale / r_pscale)
            for (k, i_pscale
                 ), r_pscale in zip(i_pscales.items(), r_pscales.values())
        })

        if params.hard_act:
            cell_in_scale = in_qs[names['c_state']].scale / int_scale
            cell_out_scale = int2_scale / in_qs[names['c_state']].scale
            state_out_scale = int3_scale / i_state_scale
        else:
            cell_in_scale = (in_qs[names['c_state']].scale *
                             out_tanh_sig_scale / int_scale)
            cell_out_scale = int_scale / in_qs[names['c_state']].scale
            state_out_scale = out_tanh_sig_scale / i_state_scale

        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
        # TODO - Check cell clip here
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=cell_out_scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
            scale=state_out_scale)
        # set internal scale
        scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
        # set biases to output of perceptron
        for gate in ['i', 'o', 'c', 'f']:
            in_qs[names[f"{gate}_b"]].scale = r_pscales[gate]
            in_qs[names[f"{gate}_b"]].dtype = np.int32
        if params.lstm_output_c_state:
            out_qs = [o_q, in_qs[names['c_state']]]
        else:
            out_qs = [o_q]
        return QRec.scaled(
            in_qs=in_qs,
            out_qs=out_qs,
            **scale_qtypes,
        )
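
In the hard-activation branch above, the internal fixed-point format is picked so that a triple product plus sign bits still fits in 32 bits while the cell state keeps enough integer headroom. A standalone sketch of that arithmetic (calc_bits below is an illustrative stand-in for the helper used in the snippet):

import math

def calc_bits(max_val):
    # integer bits (including sign) needed to represent max_val - an assumption
    return max(int(math.ceil(math.log2(max_val))), 0) + 1

cell_max = 6.0
cell_int_bits = calc_bits(cell_max)      # 4 integer bits cover a range of +/-6
int_q = min(16 - cell_int_bits, 10)      # capped at 10 so 3*int_q + 2 <= 32
int2_scale = math.pow(2, -(int_q * 2))   # scale of a product of two Q(int_q)
int3_scale = math.pow(2, -(int_q * 3))   # scale of a triple product
print(int_q, int2_scale, int3_scale)
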
    def _quantize_rnn(cls, params, in_qs, stats, input_bits, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        in_out_dtype = np.uint16 if input_bits == 16 else np.uint8
        if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
            return None
        in_qs = deepcopy(in_qs)
        if in_qs is None:
            return None
        in_q = in_qs[0]
        opts = kwargs['opts']
        # qrecs = kwargs['qrecs']
        G = kwargs['G']
        in_edges = G.indexed_in_edges(params.name)

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=in_out_dtype,
                                    narrow_range=opts['narrow_state'])

        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}

        in_qs[names['i_state']] = o_q

        woff = {}

        int_num_inp = roundup(params.n_inputs, input_bits == 16)

        in_q = limit_input_precision(
            params,
            input_bits,
            in_q,
            int_num_inp,
            opts['narrow_weights'],
            opts['weight_bits'])

        woff['i_2_i_w'] = calculatate_weight_q(
            in_qs,
            in_edges,
            names['i_2_i_w'],
            in_q.zero_point[0],
            (params.n_states, params.n_inputs),
            (params.n_states, int_num_inp),
            opts['weight_bits'],
            opts['narrow_weights'])

        int_num_states = roundup(params.n_states, input_bits == 16)

        o_q = limit_input_precision(
            params,
            input_bits,
            o_q,
            int_num_states,
            opts['narrow_weights'],
            opts['weight_bits'],
            extra_correction=-1 if opts.get('narrow_state') else 0)

        woff['r_2_i_w'] = calculatate_weight_q(
            in_qs,
            in_edges,
            names['r_2_i_w'],
            o_q.zero_point[0],
            (params.n_states, params.n_states),
            (params.n_states, int_num_states),
            opts['weight_bits'],
            opts['narrow_weights'])

        i_state_scale = in_qs[names['i_state']].scale
        # rescale input * weight result to state * weight result so that they can be accumulated
        inp_before_scale = in_q.scale * in_qs[names['i_2_i_w']].scale
        state_w_scale = i_state_scale * in_qs[names['r_2_i_w']].scale

        # In 8 bit kernel input is rescaled to state scale
        # In 16 bit kernel input is scaled to LUT act input scale to avoid overflow
        # Bias zero correction is rescaled to state * state_w in both cases
        rescale = inp_before_scale / state_w_scale

        if input_bits == 8:
            i_2_s_q = MultMulBiasScaleQType(
                scale=rescale)
            # in 8 bit mode biases are applied by NE16, so the zero-point correction
            # must be multiplied by the scale qbiases and have norm rounding added
            i_zp_b = woff['i_2_i_w'] * i_2_s_q.qbiases.astype(
                np.int32) + (1 << (i_2_s_q.qnorms.astype(np.int32) - 1))
            woff = woff['r_2_i_w']
        else:
            i_2_s_q = MultMulBiasScaleQType(
                scale=((in_q.scale * in_qs[names['i_2_i_w']].scale) /
                       math.pow(2, -12)))
            i_2_s_q.pre_normalization = min(
                opts['weight_bits'], np.min(i_2_s_q.qnorms))
            # in 16 bit mode biases are streamed in so zp corr already in right scale
            # and do not need norm rounding
            i_zp_b = woff['i_2_i_w']
            woff = woff['r_2_i_w']

        # hard activations are only implemented for 8 bit mode at present
        if input_bits == 8 and params.hard_act:
            act_input_scale = i_state_scale
            s_2_s_q = MultMulBiasScaleQType(
                scale=state_w_scale/i_state_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
            act_output_scale = i_state_scale
            act_qtype = QType(dtype=np.int8, scale=act_input_scale,
                              narrow_range=opts.get('narrow_state'))
        else:
            act_input_scale = math.pow(2, -12)
            act_output_scale = math.pow(2, -15)
            act_qtype = None

            s_2_s_q = MultMulBiasScaleQType(
                scale=state_w_scale/act_input_scale)
            if input_bits == 16:
                s_2_s_q.pre_normalization = min(
                    opts['weight_bits'], np.min(s_2_s_q.qnorms))
            s_2_o_q = MultMulBiasScaleQType(
                scale=act_output_scale/o_q.scale)

        if input_bits == 8:
            in_qs[names['i_b']].scale = state_w_scale / s_2_s_q.qbiases
            in_qs[names['i_b']].dtype = np.int32
            in_qs[names['i_b']].offset = woff * s_2_s_q.qbiases.astype(
                np.int32) + (1 << (s_2_s_q.qnorms.astype(np.int32) - 1))
            if i_zp_b is not None:
                in_qs[names['i_b']].attr.interleaved_values = [i_zp_b]
        else:
            in_qs[names['i_b']].scale = state_w_scale
            in_qs[names['i_b']].dtype = np.int32
            in_qs[names['i_b']].offset = woff
            # Interleave input zero offset bias with state bias at generation time
            in_qs[names['i_b']].attr.interleaved_values = [i_zp_b]

        return QRec.scaled(
            in_qs=in_qs,
            out_qs=[o_q],
            s_2_s_q=s_2_s_q,
            i_2_s_q=i_2_s_q,
            s_2_o_q=s_2_o_q,
            act_qtype=act_qtype,
            scales={
                'int_scale': act_output_scale,
                'out_scale': o_q.scale,
                'act_input_scale': act_input_scale,
                'inp_after_scale': i_state_scale * in_qs[names['r_2_i_w']].scale,
                'inp_before_scale': inp_before_scale
            },
            ne16=True
        )
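
The "norm rounding" terms above (the 1 << (qnorm - 1) additions) make the later arithmetic right shift round to nearest instead of truncating. A standalone sketch of the trick with made-up numbers:

import numpy as np

qbias, qnorm = 45, 7
corr = np.int32(-1234)                       # some precomputed correction

biased = corr * qbias + (1 << (qnorm - 1))   # pre-scale plus half a step
assert (biased >> qnorm) == round(corr * qbias / 2 ** qnorm)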
Example 11
    def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
        if not cls.cache_values(node, qrec):
            return False
        in_q = qrec.in_qs[0]
        out_q = qrec.out_qs[0]
        comment = f'in q: {in_q} out_q: {out_q}'
        if qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP_ZEROPOINT':
            bits = 8 if in_q.dtype in [np.int8, np.uint8] else 16
            if in_q.signed:
                offset = ((int(math.pow(2, bits)) + in_q.zero_point[0] -
                           out_q.zero_point[0]) %
                          int(math.pow(2, bits))).astype(out_q.dtype)
            else:
                offset = (int(math.pow(2, bits)) - in_q.zero_point[0] +
                          out_q.zero_point[0]).astype(out_q.dtype)
            contents = np.array(list(offset.tobytes()) + ([0] * 7),
                                dtype=np.uint8)
        elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP':
            # no info needed
            return True
        elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP_SCALE':
            scale = in_q.scale / out_q.scale
            in_abs_zp = in_q.zero_point.astype(np.int32)
            out_abs_zp = out_q.zero_point.astype(np.int32)
            if out_q.bits > in_q.bits:
                zero_adjust = (np.round(-in_abs_zp * scale) +
                               out_abs_zp).astype(np.int32)
            else:
                zero_adjust = (-in_abs_zp +
                               np.round(out_abs_zp * 1 / scale)).astype(
                                   np.int32)

            zero_adjust = list(zero_adjust.tobytes())

            if len(scale) > 1:
                raise NotImplementedError(
                    'multiscale conversion not supported')
            scale = scale[0]
            if in_q.dtype_bits == 8 and out_q.dtype_bits == 16:
                # scale Q16 * Q8 OK
                scale_adjust = MultMulBiasScaleQType(scale=scale,
                                                     dtype=np.int16,
                                                     available_bits=16)
            else:
                scale_adjust = MultMulBiasScaleQType(scale=scale,
                                                     dtype=np.int8,
                                                     available_bits=8)
            qbias = list(scale_adjust.qbiases.tobytes())
            qbias = qbias + [0] * (2 - len(qbias))
            qnorm = list(scale_adjust.qnorms.tobytes())
            contents = np.array(zero_adjust + qbias + qnorm + [0],
                                dtype=np.int8)
        elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FL_FP':
            qbias = list((1 / out_q.scale).astype(np.float32).tobytes())
            zero_adjust = list((out_q.zero_point.astype(np.int32) *
                                out_q.scale).astype(np.float32).tobytes())
            contents = np.array(zero_adjust + qbias, dtype=np.int8)
        elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FL':
            qbias = list((in_q.scale).astype(np.float32).tobytes())
            zero_adjust = list((-in_q.zero_point.astype(np.int32)).astype(
                np.float32).tobytes())
            contents = np.array(zero_adjust + qbias, dtype=np.int8)
        else:
            raise ValueError(f"strange dtype change in {pnode.name}")
        cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
        const_info = ConstantInfo(file_name,
                                  QType.Pow2(bits=8, q=0, signed=True),
                                  contents=contents)

        gen.globals.append(
            GlobalArgInfo("int8",
                          cname,
                          gen.opts['default_global_home_location'],
                          gen.opts['default_global_exec_location'],
                          const_info=const_info,
                          comment=comment))
        return True
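
For the KOP_CONVERT_FP_FP_SCALE case, the packed infos implement, in float terms, out = (in - Zin) * Sin / Sout + Zout: the scale ratio becomes the qbias/qnorm pair and the zero-point terms fold into zero_adjust. A standalone sketch of that conversion (made-up values, not generator code):

import numpy as np

s_in, z_in = 0.1, 3
s_out, z_out = 0.025, -8

x = np.array([7, 60, -20], dtype=np.int32)
out = np.round((x - z_in) * (s_in / s_out)) + z_out
print(out)  # [8. 220. -100.], the same values re-expressed in the output scale
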
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
            return None
        in_qs = deepcopy(in_qs)
        in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0, dtype=np.int8)
        if in_qs is None:
            return None
        opts = kwargs['opts']
        # qrecs = kwargs['qrecs']
        G = kwargs['G']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')
        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=np.int8)

        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "the same_scale kernel will be generated, with better performance",
                params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options

        edges = G.indexed_in_edges(params.name)
        w_q = in_qs[names['i_2_i_w']]
        in_qs[names['i_2_i_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8, bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            dont_generate_value=True)

        w_q = in_qs[names['r_2_i_w']]
        in_qs[names['r_2_i_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8, bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            concatenated_nodes=[edges[names['i_2_i_w']].from_node.name])

        w_scales = np.maximum(
            in_qs[names['i_2_i_w']].scale, in_qs[names['r_2_i_w']].scale)
        if params.rnn_same_inout_scale:
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            if not params.rnn_states_as_inputs:
                in_qs[names['i_state']].scale = in_and_state_scale
            i_state_scale = in_and_state_scale
            i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        else:
            i_state_scale = in_qs[names['i_state']].scale
            i_2_a_q = MultMulBiasScaleQType(
                scale=in_qs[0].scale/i_state_scale)

        in_qs[names['i_2_i_w']].scale = w_scales
        in_qs[names['r_2_i_w']].scale = w_scales
        state_w_scale = i_state_scale * w_scales
        in_qs[names['i_b']].scale = state_w_scale
        in_qs[names['i_b']].dtype = np.int32

        if params.hard_act:
            s_2_s_q = MultMulBiasScaleQType(
                scale=state_w_scale/i_state_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
            act_output_scale = math.pow(2, -7)
        else:
            act_input_scale = math.pow(2, -12)
            act_output_scale = math.pow(2, -15)
            s_2_s_q = MultMulBiasScaleQType(
                scale=state_w_scale/act_input_scale)
            s_2_o_q = MultMulBiasScaleQType(
                scale=act_output_scale/o_q.scale)
        return QRec.scaled(
            in_qs=in_qs,
            out_qs=[o_q],
            s_2_s_q=s_2_s_q,
            i_2_a_q=i_2_a_q,
            s_2_o_q=s_2_o_q,
            scales={
                'int_scale': act_output_scale,
                'out_scale': o_q.scale
            }
        )
Example 13
    def _quantize(cls, params, in_qs, stats, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        if cls.can_ne16(params, opts, fusion):
            LOG.info('selecting USQ8 NE16 kernel filter quantizer')
            return cls.quantize_ne16(params, in_qs, stats, **kwargs)
        LOG.info('selecting SQ8 software kernel filter quantizer')
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        G = kwargs['G']
        in_q = in_qs[0]

        # check input quantization and int8 type
        # if not padded we can scale asymmetric
        if in_q.dtype == np.uint8:
            # handle NE16
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            # allow asymmetric if not padded
            if isinstance(params,
                          Conv2DParameters) and params.padding.has_padding:
                in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                             stats['range_in'][0]['max'],
                                             dtype=np.int8,
                                             forced=True)
            else:
                in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                             stats['range_in'][0]['max'],
                                             dtype=np.int8,
                                             zero_point=in_q.zero_point - 128)
        elif (isinstance(params, Conv2DParameters) and not in_q.is_symmetric
              and params.padding.has_padding):
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8)
        # if not forced we can try asymmetric
        elif (opts['allow_asymmetric'] and isinstance(params, Conv2DParameters)
              and not in_q.forced and in_q.is_symmetric
              and not params.padding.has_padding):
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         asymmetric=True)

        if opts['weight_bits'] != 8:
            LOG.warning(
                'sub byte weights quantization requested but NE16 kernel not selected'
            )
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.int8,
            narrow_range=opts['narrow_weights'],
            bits=opts['weight_bits'])

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # can't be forced to something not np.int8
            if o_q.dtype != np.int8:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.is_asymmetric else "symmetric")
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype,
                                        asymmetric=opts['allow_asymmetric'])
        biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_q.scale)
        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # returning the new weights and biases qs will force backprop

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMult.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)

        if not (opts['allow_asymmetric'] or force_out_q
                or biases_q.offset is None):
            raise ValueError(
                f'bias offset is set but asymmetric is disallowed in {params.name}'
            )

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        if isinstance(params, Conv2DParameters) and params.padding.has_padding:
            in_q.set_forced(flags=['zero_point'])

        cls.check_order(params, AT_SW_KER_IN_ORDER, AT_SW_KER_OUT_ORDER)
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q)

    def new_load_filter_parameters(cls,
                                   G,
                                   params,
                                   filter_shape,
                                   filter_scale_axis,
                                   input_tensor,
                                   weights_node,
                                   bias_node,
                                   output_tensor,
                                   opts,
                                   dw_to_pw=False):
        weights_node.meta['filter_params'] = True
        bias_node.meta['filter_params'] = True
        # if quantization is not loaded then the constants will already be dequantized
        if dw_to_pw:
            # Conv has been converted from depthwise to pointwise so reorder the weights tensor
            weights_node.value = np.transpose(weights_node.value,
                                              cls.TF_LITE_DW_FILTER_TRANSPOSE)
            weights_node.dims = Dim.unnamed(weights_node.value.shape)
        if not opts.get('load_quantization'):
            return
        wqtype = weights_node.qtype
        if wqtype is None:
            LOG.warning('quantization is missing on node %s', params.name)
            return
        # scale weights as requested. change asymmetric and/or unsigned weights to signed symmetric
        if wqtype.asymmetric or not wqtype.signed:
            if opts.get('rescale_perchannel'):
                wqtype = cls.get_weights_qtype_by_channel(
                    filter_shape, filter_scale_axis, weights_node)
            else:
                wqtype = cls.get_weights_qtype_by_tensor(weights_node)
        else:
            if opts.get('rescale_perchannel'):
                if len(wqtype.scale) != filter_shape[filter_scale_axis]:
                    wqtype = cls.get_weights_qtype_by_channel(
                        filter_shape, filter_scale_axis, weights_node)
            else:
                if len(wqtype.scale) > 1:
                    wqtype = cls.get_weights_qtype_by_tensor(weights_node)

        iqtype = input_tensor.qtype
        # correct input qtype to symmetric tensor scaled
        if iqtype.asymmetric or not iqtype.signed or len(iqtype.scale) > 1:
            iqtype = QType.from_min_max_sq(min_val=iqtype.min_val,
                                           max_val=iqtype.max_val)
        else:
            iqtype = deepcopy(iqtype)

        oqtype = output_tensor.qtype
        # correct output qtype to symmetric tensor scaled
        if oqtype.asymmetric or not oqtype.signed or len(oqtype.scale) > 1:
            oqtype = QType.from_min_max_sq(min_val=oqtype.min_val,
                                           max_val=oqtype.max_val)
        else:
            oqtype = deepcopy(oqtype)

        # dqbias = bias_node.dqvalue
        bias_scale = (iqtype.scale * wqtype.scale).astype(np.float32)
        bqtype = QType(dtype=np.int32, scale=bias_scale)
        # NOTE: In some tensorflow graphs the biases are hugely negative or hugely
        # positive. I've never seen this without a relun after and the weights on
        # these channels were 0. Actually they should be pruned.
        # don't overwrite the quantized values since we may move around quantization later
        # bias_node.value = bqtype.quantize(dqbias)
        # bias_node.qtype = bqtype
        if dw_to_pw and wqtype.quantized_dimension:
            wqtype.quantized_dimension = 0

        mulbiases_q = MultMulBiasScaleQType.from_filter(
            iqtype, wqtype, oqtype, params)
        qrec = QRec.scaled(in_qs=[iqtype, wqtype, bqtype],
                           out_qs=[oqtype],
                           calc_q=bqtype,
                           acc_q=bqtype,
                           mul_biases_q=mulbiases_q)
        # now set the quantization records on the node and its constants
        G.quantization[NodeId(params)] = qrec
        G.quantization[NodeId(weights_node)] = QRec.scaled(
            out_qs=[deepcopy(wqtype)])
        G.quantization[NodeId(bias_node)] = QRec.scaled(
            out_qs=[deepcopy(bqtype)])

    def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        LOG.info('selecting USQ8 NE16 kernel filter quantizer')
        force_out_qs, _ = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        weights_q = QType.from_array_sq(arr=weights_node.dqvalue,
                                        quantized_dimension=cls.get_quantized_dimension(
                                            params, opts),
                                        dtype=np.uint8,
                                        narrow_range=True,
                                        bit_pack=opts['weight_bits'],
                                        no_compression=True,
                                        bits=opts['weight_bits'])

        in_q = in_qs[0]
        in_q = limit_input_precision(
            params, input_bits, in_q, params.filter.sz, opts['narrow_weights'], opts['weight_bits'])

        # input dtype is either uint8 or int8
        if in_q.dtype != input_dtype:
            if in_q.forced_dtype:
                return None
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'], stats['range_in'][0]['max'],
                                         dtype=input_dtype,
                                         asymmetric=False)

        min_val, max_val = cls.get_min_max(
            fusion, stats, kwargs['all_stats'], params)

        if force_out_q:
            o_q = deepcopy(force_out_q)
            o_q.dont_copy_attr = ['ne16']
            LOG.warning('node %s output forced to range %s/%s - actual range %s/%s',
                        params.name, o_q.min, o_q.max, min_val, max_val)
        else:
            force_output_size = opts.get('force_output_size', 8)
            output_dtype = np.uint8 if force_output_size == 8 else np.uint16
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=output_dtype,
                                        dont_copy_attr=['ne16'],
                                        asymmetric=True)
        o_q.attr.ne16 = True
        biases_q = QType(
            dtype=np.int32, scale=weights_q.scale * in_q.scale,
            ne16_biases=(input_bits != 16))

        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMultNE16Base.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)
        # returning the new weights and biases qs will force backprop

        cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)

        if input_bits == 16:
            prenorm = min(np.min(mul_biases_q.qnorms), 8)
        else:
            prenorm = 0
        mul_biases_q.pre_normalization = prenorm

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q,
                           ne16=True)

    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None
        in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        opts = kwargs['opts']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')
        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=out_dtype)

        names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
        edges = kwargs['G'].indexed_in_edges(params.name)

        for gate in ['r', 'z', 'h']:
            w_q = in_qs[names[f'w_2_{gate}_w']]
            in_qs[names[f'w_2_{gate}_w']] = QType.from_min_max_sq(
                w_q.min_val,
                w_q.max_val,
                dtype=np.int8,
                bits=opts['weight_bits'],
                narrow_range=opts.get('narrow_weights', True),
                dont_generate_value=True)
            w_q = in_qs[names[f'r_2_{gate}_w']]
            in_qs[names[f'r_2_{gate}_w']] = QType.from_min_max_sq(
                w_q.min_val,
                w_q.max_val,
                dtype=np.int8,
                bits=opts['weight_bits'],
                narrow_range=opts.get('narrow_weights', True),
                concatenated_nodes=[
                    edges[names[f'w_2_{gate}_w']].from_node.name
                ])

        if params.rnn_same_inout_scale:
            wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                               in_qs[names['r_2_z_w']].scale)
            wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                               in_qs[names['r_2_r_w']].scale)
            wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                               in_qs[names['r_2_h_w']].scale)
            i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
            in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            in_scale = state_scale = in_q.scale
        else:
            wWz_scale = in_qs[names['w_2_z_w']].scale
            wWr_scale = in_qs[names['w_2_r_w']].scale
            wWh_scale = in_qs[names['w_2_h_w']].scale
            rWz_scale = in_qs[names['r_2_z_w']].scale
            rWr_scale = in_qs[names['r_2_r_w']].scale
            rWh_scale = in_qs[names['r_2_h_w']].scale
            in_scale = in_qs[0].scale
            in_q = in_qs[0]
            state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            state_scale = state_q.scale
        i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) /
                                           (rWz_scale * state_scale))
        i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) /
                                           (rWr_scale * state_scale))
        i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) /
                                           (rWh_scale * state_scale))

        i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
        h_WR_2_int_q = MultMulBiasScaleQType(scale=(rWh_scale * state_scale) /
                                             i_qtype.scale)
        r_WR_2_int_q = MultMulBiasScaleQType(scale=(rWr_scale * state_scale) /
                                             i_qtype.scale)
        z_WR_2_int_q = MultMulBiasScaleQType(scale=(rWz_scale * state_scale) /
                                             i_qtype.scale)

        if not params.rnn_states_as_inputs:
            in_qs[names['h_state']].scale = state_q.scale
        in_qs[0].scale = in_scale
        o_q.scale = state_scale

        in_qs[names['z_b']].scale = in_scale * rWz_scale
        in_qs[names['z_b']].dtype = BIAS_DTYPE
        in_qs[names['r_b']].scale = in_scale * rWr_scale
        in_qs[names['r_b']].dtype = BIAS_DTYPE
        in_qs[names['w_h_b']].scale = in_scale * wWh_scale
        in_qs[names['w_h_b']].dtype = BIAS_DTYPE
        in_qs[names['r_h_b']].scale = in_scale * rWh_scale
        in_qs[names['r_h_b']].dtype = BIAS_DTYPE

        in_qs[names['w_2_z_w']].scale = wWz_scale
        in_qs[names['w_2_r_w']].scale = wWr_scale
        in_qs[names['w_2_h_w']].scale = wWh_scale
        in_qs[names['r_2_z_w']].scale = rWz_scale
        in_qs[names['r_2_r_w']].scale = rWr_scale
        in_qs[names['r_2_h_w']].scale = rWh_scale

        return QRec.scaled(in_qs=in_qs,
                           out_qs=[o_q],
                           i_2_z_WR_q=i_2_z_WR_q,
                           i_2_r_WR_q=i_2_r_WR_q,
                           i_2_h_WR_q=i_2_h_WR_q,
                           h_WR_2_int_q=h_WR_2_int_q,
                           r_WR_2_int_q=r_WR_2_int_q,
                           z_WR_2_int_q=z_WR_2_int_q,
                           i_qtype=i_qtype,
                           scales={
                               'w_2_z_w': wWz_scale,
                               'w_2_r_w': wWr_scale,
                               'w_2_h_w': wWh_scale,
                               'r_2_z_w': rWz_scale,
                               'r_2_r_w': rWr_scale,
                               'r_2_h_w': rWh_scale,
                               'in': [in_scale],
                               'state': state_scale,
                               'out': [state_scale]
                           })
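
The i_2_*_WR_q ratio scales above can be sanity-checked in floating point: multiplying an input-branch product by (wW * in) / (rW * state) expresses it in the state-branch scale, so both contributions can share one accumulator. A small sketch with assumed scales:

wWz_scale, in_scale = 0.004, 0.02        # hypothetical input-branch scales
rWz_scale, state_scale = 0.006, 2 ** -7  # hypothetical state-branch scales (Q7 state)

ratio = (wWz_scale * in_scale) / (rWz_scale * state_scale)
x_q, w_q = 57, -23                       # hypothetical quantized input and weight
real = (x_q * w_q) * wWz_scale * in_scale      # real value of the input product
rescaled = (x_q * w_q) * ratio                 # same product in state-branch units
assert abs(rescaled * rWz_scale * state_scale - real) < 1e-12
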
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None
        in_qs = deepcopy(in_qs)
        in_qs = cls.force_symmetric_and_dtype(in_qs, dtype=np.int16, idx=0)
        if in_qs is None:
            return None
        opts = kwargs['opts']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
        edges = kwargs['G'].indexed_in_edges(params.name)

        for gate in ['r', 'z', 'h']:
            w_q = in_qs[names[f'w_2_{gate}_w']]
            in_qs[names[f'w_2_{gate}_w']] = QType.from_min_max_sq(
                w_q.min_val,
                w_q.max_val,
                dtype=np.int8,
                bits=opts['weight_bits'],
                narrow_range=opts.get('narrow_weights', True),
                dont_generate_value=True)
            w_q = in_qs[names[f'r_2_{gate}_w']]
            in_qs[names[f'r_2_{gate}_w']] = QType.from_min_max_sq(
                w_q.min_val,
                w_q.max_val,
                dtype=np.int8,
                bits=opts['weight_bits'],
                narrow_range=opts.get('narrow_weights', True),
                concatenated_nodes=[
                    edges[names[f'w_2_{gate}_w']].from_node.name
                ])

        wWz_scale = in_qs[names['w_2_z_w']].scale
        wWr_scale = in_qs[names['w_2_r_w']].scale
        wWh_scale = in_qs[names['w_2_h_w']].scale
        rWz_scale = in_qs[names['r_2_z_w']].scale
        rWr_scale = in_qs[names['r_2_r_w']].scale
        rWh_scale = in_qs[names['r_2_h_w']].scale
        in_scale = in_qs[0].scale
        state_q_bits = 14 if opts.get('narrow_state', False) else 15
        state_q = QType(bits=16, q=state_q_bits, signed=True, dtype=np.int16)
        state_scale = state_q.scale
        i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
        int_scale = i_qtype.scale
        act_qtype = QType(bits=32, q=15, signed=True, dtype=np.int32)

        input_z_w_internal = MultMulBiasScaleQType(
            scale=(wWz_scale * in_scale) / int_scale)
        input_r_w_internal = MultMulBiasScaleQType(
            scale=(wWr_scale * in_scale) / int_scale)
        input_h_w_internal = MultMulBiasScaleQType(
            scale=(wWh_scale * in_scale) / int_scale)

        state_h_w_internal = MultMulBiasScaleQType(
            scale=(rWh_scale * state_scale) / int_scale)
        state_r_w_internal = MultMulBiasScaleQType(
            scale=(rWr_scale * state_scale) / int_scale)
        state_z_w_internal = MultMulBiasScaleQType(
            scale=(rWz_scale * state_scale) / int_scale)

        in_qs[names['h_state']] = state_q
        o_q = state_q

        in_qs[names['z_b']].scale = int_scale
        in_qs[names['z_b']].dtype = BIAS_DTYPE
        in_qs[names['r_b']].scale = int_scale
        in_qs[names['r_b']].dtype = BIAS_DTYPE
        in_qs[names['w_h_b']].scale = in_scale * wWh_scale
        in_qs[names['w_h_b']].dtype = BIAS_DTYPE
        in_qs[names['r_h_b']].scale = state_scale * rWh_scale
        in_qs[names['r_h_b']].dtype = BIAS_DTYPE

        return QRec.scaled(in_qs=in_qs,
                           out_qs=[o_q],
                           input_z_w_internal=input_z_w_internal,
                           input_r_w_internal=input_r_w_internal,
                           input_h_w_internal=input_h_w_internal,
                           state_h_w_internal=state_h_w_internal,
                           state_r_w_internal=state_r_w_internal,
                           state_z_w_internal=state_z_w_internal,
                           i_qtype=i_qtype,
                           act_qtype=act_qtype,
                           scales={
                               'w_2_z_w': wWz_scale,
                               'w_2_r_w': wWr_scale,
                               'w_2_h_w': wWh_scale,
                               'r_2_z_w': rWz_scale,
                               'r_2_r_w': rWr_scale,
                               'r_2_h_w': rWh_scale,
                               'in': [in_scale],
                               'state': state_scale,
                               'out': [state_scale],
                               'act': math.pow(2, -15)
                           })
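
In the 16-bit variant above, the z and r gate biases are stored directly at the Q12 internal scale (2**-12), the same scale the activation LUT consumes, so they can be added straight into the gate sums. A worked example with made-up bias values:

import math
import numpy as np

int_scale = math.pow(2, -12)              # Q12, the LUT activation input scale
bias_f = np.array([0.125, -0.33, 1.5])    # hypothetical float gate biases
bias_q = np.round(bias_f / int_scale).astype(np.int32)
print(bias_q)                             # [  512 -1352  6144]
print(bias_q * int_scale)                 # ~ [0.125, -0.33, 1.5]
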
Example no. 18
0
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0, dtype=np.int16)
        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        G = kwargs['G']
        opts = kwargs.get('opts', {})

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        names = {
            val: idx
            for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
        }

        o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
            min_val=stats['range_out'][0]['min'],
            max_val=stats['range_out'][0]['max'],
            dtype=np.int16)
        if force_out_q:
            if force_out_q.zero_point != 0:
                return None
            LOG.warning(
                'on node %s output is being forced from scale %s -> %s',
                params.name, o_q.scale, force_out_qs[0].scale)
            o_q = force_out_qs[0]

        cell_range = stats.get('range_cell')
        if cell_range is None:
            raise ValueError(
                f'cell range not present in stats for {params.name}')

        # cell range is clamped to a minimum of 1.0
        cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])
        if params.cell_clip and not params.quant_c_state_with_stat:
            cell_max = params.cell_clip
            ratio_c = cell_max / cell_stat
            if not (ratio_c > 0.9 and ratio_c < 1.1):
                msg = (
                    f"C state is forced to a range [-{cell_max}:{cell_max}] different from the one calculated "
                    f"from the inference statistics [-{cell_stat}:{cell_stat}]; consider using nodeoption {params.name} "
                    "QUANT_C_STATE_WITH_STAT 1 to force it to the calculated one"
                )
                LOG.warning('%s', msg)
        else:
            cell_max = cell_stat

        # this limit is driven by the c_in * f + c * i calculation
        # c * i will be in Q24 and we want c_in * f to be scaled to the same
        # abs(f) will be <=1 so the cell int bits cannot exceed 31 - 1 (overflow) - 24 = 6
        cell_limit = pow(2, 6)
        if cell_max > cell_limit:
            LOG.warning('Cell state exceeds %s and will be clipped',
                        cell_limit)
            cell_max = cell_limit

        cell_int_bits = calc_bits(cell_max)
        in_qs[names['c_state']] = QType.from_min_max_sq(-cell_max,
                                                        cell_max,
                                                        dtype=np.int16)

        LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
                  in_qs[names['c_state']].range)

        # set weight qtypes
        edges = kwargs['G'].indexed_in_edges(params.name)
        scale_pairs = {
            chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
            for chan in ['i', 'o', 'c', 'f']
        }
        for scale_pair in scale_pairs.values():
            in_q = in_qs[names[scale_pair[0]]]
            in_qs[names[scale_pair[0]]] = QType.from_min_max_sq(
                in_q.min_val,
                in_q.max_val,
                dtype=np.int8,
                narrow_range=opts.get('narrow_weights'),
                dont_generate_value=True)
            in_qs[names[scale_pair[0]]].bits = opts['weight_bits']
            in_q = in_qs[names[scale_pair[1]]]
            in_qs[names[scale_pair[1]]] = QType.from_min_max_sq(
                in_q.min_val,
                in_q.max_val,
                dtype=np.int8,
                narrow_range=opts.get('narrow_weights'),
                concatenated_nodes=[
                    edges[names[scale_pair[0]]].from_node.name
                ])
            in_qs[names[scale_pair[1]]].bits = opts['weight_bits']

        # get weight scales
        w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                    for k, (namei, namer) in scale_pairs.items()]

        gate_sum_max = [(get_max(stats[f'range_{gate}_gate_i']),
                         get_max(stats[f'range_{gate}_gate_r']))
                        for gate in ['i', 'o', 'c', 'f']]

        gate_sum_max_bits = [
            (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
             np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
            for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
        ]

        for gate, (max_i, max_r) in zip(['i', 'o', 'c', 'f'],
                                        gate_sum_max_bits):
            if max_i > 30:
                LOG.warning(
                    'max bits in accumulation input %s gate %s - there may be errors',
                    max_i, gate)
            if max_r > 30:
                LOG.warning(
                    'max bits in accumulation state %s gate %s - there may be errors',
                    max_r, gate)

        # LUT activations Q12 -> Q15
        act_in_q = 12
        act_out_q = 15
        int_scale = math.pow(2, -act_in_q)
        out_tanh_sig_scale = math.pow(2, -act_out_q)

        scale_qtypes = {}
        r_pscales = {}
        i_pscales = {}
        scale_qtypes['r_pscales'] = r_pscales
        scale_qtypes['i_pscales'] = i_pscales
        for gate, w_scale, max_bits in zip(['i', 'o', 'c', 'f'], w_scales,
                                           gate_sum_max_bits):
            weight_scale_ratio = w_scale[0] / w_scale[1]
            # TODO - decide whether to scale the weights equally
            in_qs[names[f"{gate}_b"]] = QType(scale=int_scale, dtype=np.int32)
            i_pscales[gate] = w_scale[0] * in_qs[0].scale
            scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / int_scale)
            qscale.pre_normalization = int(max(8 - (31 - max_bits[0]), 0))
            r_pscales[gate] = w_scale[1] * o_q.scale
            scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=r_pscales[gate] / int_scale)
            qscale.pre_normalization = int(max(8 - (31 - max_bits[1]), 0))

        r_pscales['state_out_scale'] = o_q.scale
        r_pscales['int_scale'] = int_scale

        # ct = c_in * f + c * i
        # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
        # scale(c_in * f) = Q15 * Q15 prenorm 8 and scale -> Q12
        # ((c_in * f) + (c * i)) in Q12
        # scale -> cell_out
        # tan(ct) -> Q15
        # o * tan(ct) -> Q30
        # prenorm and scale

        # cell in to Q12
        cell_in_scale = (in_qs[names['c_state']].scale * out_tanh_sig_scale /
                         int_scale)
        # cell_out from Q12
        cell_out_scale = int_scale / in_qs[names['c_state']].scale
        # state out from Q30
        state_out_scale = math.pow(2, -(2 * act_out_q)) / o_q.scale

        r_pscales['act_out_scale'] = out_tanh_sig_scale
        r_pscales['c_before_scale'] = int_scale

        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
        # for 16 bit pre-normalize the scales to give us room
        scale_qtypes['cell_in_q'].pre_normalization = 8
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=cell_out_scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
            scale=state_out_scale)
        scale_qtypes['state_out_q'].pre_normalization = 8
        scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)
        if params.lstm_output_c_state:
            out_qs = [o_q, in_qs[names['c_state']]]
        else:
            out_qs = [o_q]

        return QRec.scaled(
            in_qs=in_qs,
            out_qs=out_qs,
            **scale_qtypes,
        )
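
The Q-format pipeline in the comments above (c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12) is easy to verify numerically. A tiny sketch with made-up operands:

def to_q(x, q):
    """Quantize a float to a Qq fixed-point integer."""
    return int(round(x * (1 << q)))

a_q15 = to_q(0.75, 15)        # 24576
b_q15 = to_q(-0.5, 15)        # -16384
prod_q30 = a_q15 * b_q15      # Q15 * Q15 -> Q30
prod_q12 = prod_q30 >> 18     # norm(18): Q30 -> Q12
print(prod_q12 / (1 << 12))   # -0.375, as expected for 0.75 * -0.5
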
Example no. 19
0
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None
        in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        opts = kwargs['opts']
        # qrecs = kwargs['qrecs']
        G = kwargs['G']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')
        o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                    max_val=stats['range_out'][0]['max'],
                                    dtype=out_dtype)
        # input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        #                for edge in G.in_edges(params.name)
        #                if isinstance(edge.from_node, ConstantInputParameters)}
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "the same_scale kernel will be generated, which performs better",
                params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options

        for weight_name in ['i_2_i_w', 'r_2_i_w']:
            in_qs[names[weight_name]] = deepcopy(in_qs[names[weight_name]])
            in_qs[names[weight_name]].dtype = np.int8
            in_qs[names[weight_name]].bits = opts['weight_bits']

        w_scales = np.maximum(in_qs[names['i_2_i_w']].scale,
                              in_qs[names['r_2_i_w']].scale)
        if params.rnn_same_inout_scale:
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            if not params.rnn_states_as_inputs:
                in_qs[names['i_state']].scale = in_and_state_scale
                # cls.rescale_constant(input_nodes['i_state'], in_and_state_scale, qrecs)
            i_state_scale = in_and_state_scale
            i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        else:
            i_state_scale = in_qs[names['i_state']].scale
            i_2_a_q = MultMulBiasScaleQType(scale=in_qs[0].scale /
                                            i_state_scale)

        in_qs[names['i_2_i_w']].scale = w_scales
        # cls.rescale_constant(input_nodes['i_2_i_w'], w_scales, qrecs)
        in_qs[names['r_2_i_w']].scale = w_scales
        # cls.rescale_constant(input_nodes['r_2_i_w'], w_scales, qrecs)
        state_w_scale = i_state_scale * w_scales
        in_qs[names['i_b']].scale = state_w_scale
        in_qs[names['i_b']].dtype = np.int32
        # cls.rescale_constant(input_nodes['i_b'], state_w_scale, qrecs, dtype=np.int32)
        if params.hard_act:
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale /
                                            i_state_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        else:
            act_input_scale = math.pow(2, -12)
            act_output_scale = math.pow(2, -15)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale /
                                            act_input_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale / o_q.scale)
        return QRec.scaled(
            in_qs=in_qs,
            out_qs=[o_q],
            s_2_s_q=s_2_s_q,
            i_2_a_q=i_2_a_q,
            s_2_o_q=s_2_o_q,
        )
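
In the non-hard-act path above, s_2_s_q moves the int32 gate accumulator (whose scale is state_scale * w_scale) onto the Q12 activation input scale before the LUT. A rough sketch with assumed scales:

import math

i_state_scale = 0.015                   # hypothetical state scale
w_scale = 0.004                         # hypothetical weight scale
act_input_scale = math.pow(2, -12)      # Q12 LUT input scale

state_w_scale = i_state_scale * w_scale # scale of the int32 gate accumulator
s_2_s = state_w_scale / act_input_scale # the ratio s_2_s_q encodes
acc = 9000                              # hypothetical int32 gate sum
act_in = int(round(acc * s_2_s))        # value handed to the activation LUT
print(act_in * act_input_scale, acc * state_w_scale)   # both ~0.54
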
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
            return None
        in_qs = deepcopy(in_qs)
        in_qs = cls.force_symmetric_and_dtype(in_qs, dtype=np.int16, idx=0)
        if in_qs is None:
            return None
        opts = kwargs['opts']
        # qrecs = kwargs['qrecs']
        G = kwargs['G']

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        o_q = QType(q=15, dtype=np.int16)
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        in_qs[names['i_state']] = o_q

        if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "the same_scale kernel will be generated, which performs better", params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options

        edges = G.indexed_in_edges(params.name)
        w_q = in_qs[names['i_2_i_w']]
        in_qs[names['i_2_i_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8, bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            dont_generate_value=True)

        w_q = in_qs[names['r_2_i_w']]
        in_qs[names['r_2_i_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8, bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            concatenated_nodes=[edges[names['i_2_i_w']].from_node.name])

        act_input_scale = math.pow(2, -12)

        i_2_a_q = MultMulBiasScaleQType(
            scale=in_qs[0].scale * in_qs[names['i_2_i_w']].scale/act_input_scale)

        in_qs[names['i_b']].scale = o_q.scale * in_qs[names['r_2_i_w']].scale
        in_qs[names['i_b']].dtype = np.int32
        # cls.rescale_constant(input_nodes['i_b'], state_w_scale, qrecs, dtype=np.int32)
        act_output_scale = math.pow(2, -15)
        s_2_s_q = MultMulBiasScaleQType(
            scale=o_q.scale * in_qs[names['r_2_i_w']].scale/act_input_scale)
        return QRec.scaled(
            in_qs=in_qs,
            out_qs=[o_q],
            s_2_s_q=s_2_s_q,
            i_2_a_q=i_2_a_q,
            scales={
                'int_scale': act_output_scale,
                'out_scale': o_q.scale
            }
        )
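
The 16-bit variant pins the state to Q15 (o_q = QType(q=15, dtype=np.int16)), whose scale of 2**-15 covers [-1, 1) and so matches a tanh-bounded RNN state. A round-trip check with a made-up value:

import numpy as np

scale = 2.0 ** -15                  # Q15 scale of the int16 state
x = 0.61803                         # hypothetical state value in [-1, 1)
x_q = np.int16(round(x / scale))    # 20252
print(x_q, float(x_q) * scale)      # round-trips to ~0.61804
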
    def _quantize_gru(cls, params, in_qs, stats, input_bits, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None

        opts = kwargs.get('opts', {})

        if input_bits == 16:
            in_out_dtype = np.uint16
        else:
            in_out_dtype = np.uint8

        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        G = kwargs['G']

        in_q = in_qs[0]

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        in_edges = G.indexed_in_edges(params.name)

        names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}

        # output/state is always Q15 or Q7 symmetric
        o_q = in_qs[names['h_state']] = QType.from_min_max_sq(
            min_val=-1,
            max_val=1,
            dtype=in_out_dtype,
            narrow_range=opts['narrow_state'])

        # set weight qtypes
        int_num_inp = roundup(params.n_inputs, input_bits == 16)
        int_num_states = roundup(params.n_states, input_bits == 16)
        woffs = {}

        in_q = limit_input_precision(params, input_bits, in_q, int_num_inp,
                                     opts['narrow_weights'],
                                     opts['weight_bits'])

        # o_q = limit_input_precision(
        #     params,
        #     input_bits,
        #     o_q,
        #     int_num_states,
        #     opts['narrow_weights'],
        #     opts['weight_bits'],
        #     extra_correction=-1 if opts.get('narrow_state') else 0)

        for gate in ['z', 'r', 'h']:
            i_idx = names[f'w_2_{gate}_w']
            r_idx = names[f'r_2_{gate}_w']

            woffs[gate] = woff_gate = [None, None]
            woff_gate[0] = calculatate_weight_q(
                in_qs, in_edges, i_idx, in_q.zero_point[0],
                (params.n_states, params.n_inputs),
                (params.n_states, int_num_inp), opts['weight_bits'],
                opts.get('narrow_weights'))

            woff_gate[1] = calculatate_weight_q(
                in_qs, in_edges, r_idx, o_q.zero_point[0],
                (params.n_states, params.n_states),
                (params.n_states, int_num_states), opts['weight_bits'],
                opts.get('narrow_weights'))

        # get weight scales
        scale_pairs = {
            chan: ('w_2_%s_w' % chan, 'r_2_%s_w' % chan)
            for chan in ['z', 'r', 'h']
        }
        w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                    for k, (namei, namer) in scale_pairs.items()]

        gate_sum_max = [(get_max_or_one(stats[f'range_{gate}_gate_inp']),
                         get_max_or_one(stats[f'range_{gate}_gate_state']))
                        for gate in ['z', 'r', 'h']]

        gate_sum_max_bits = [
            (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
             np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
            for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
        ]

        for gate, (max_i, max_r) in zip(['z', 'r', 'h'], gate_sum_max_bits):
            if np.max(max_i) > 30:
                LOG.warning(
                    'max bits in accumulation input %s gate %s - there may be errors',
                    max_i, gate)
            if np.max(max_r) > 30:
                LOG.warning(
                    'max bits in accumulation state %s gate %s - there may be errors',
                    max_r, gate)

        # LUT activations Q12 -> Q15
        act_in_q = 12
        act_out_q = 15
        int_scale = math.pow(2, -act_in_q)
        out_tanh_sig_scale = math.pow(2, -act_out_q)

        scale_qtypes = {}
        r_pscales = {}
        i_pscales = {}
        scale_qtypes['r_pscales'] = r_pscales
        scale_qtypes['i_pscales'] = i_pscales
        for gate, w_scale, max_bits in zip(['z', 'r', 'h'], w_scales,
                                           gate_sum_max_bits):
            weight_scale_ratio = w_scale[0] / w_scale[1]
            # TODO - decide whether to scale the weights equally

            i_pscales[gate] = w_scale[0] * in_q.scale
            r_pscales[gate] = w_scale[1] * o_q.scale
            # h gate input is added manually to state in Q12
            if input_bits == 16 or gate == 'h':
                scale_qtypes[f"w_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                    scale=i_pscales[gate] / int_scale)
            else:
                scale_qtypes[f"w_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                    scale=i_pscales[gate] / r_pscales[gate])
            if input_bits == 16:
                i_zp_b = woffs[gate][0]
                if gate == "h":
                    in_qs[names['w_h_b']] = QType(
                        dtype=np.int32,
                        scale=i_pscales[gate],
                        offset=i_zp_b,
                    )
            else:
                i_zp_b = woffs[gate][0] * qscale.qbiases.astype(np.int32) + (
                    1 << (qscale.qnorms.astype(np.int32) - 1))
                if gate == "h":
                    in_qs[names['w_h_b']] = QType(
                        dtype=np.int32,
                        scale=i_pscales[gate] / qscale.qbiases,
                        offset=i_zp_b,
                    )

            scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=r_pscales[gate] / int_scale)

            if gate == 'h':
                bias_name = 'r_h_b'
                interleaved_values = None
            else:
                bias_name = f'{gate}_b'
                interleaved_values = [i_zp_b]
            if input_bits == 16:
                r_zp_b = woffs[gate][1]
                in_qs[names[bias_name]] = QType(
                    dtype=np.int32,
                    scale=r_pscales[gate],
                    offset=r_zp_b,
                    interleaved_values=interleaved_values)
            else:
                r_zp_b = woffs[gate][1] * qscale.qbiases.astype(np.int32) + (
                    1 << (qscale.qnorms.astype(np.int32) - 1))
                in_qs[names[bias_name]] = QType(
                    dtype=np.int32,
                    scale=r_pscales[gate] / qscale.qbiases,
                    offset=r_zp_b,
                    interleaved_values=interleaved_values)

        # NOTE - for 16 bit, pre-normalize the scales to give us room, making sure no residual norm goes negative
        if input_bits == 16:
            gate_prenorm = min(
                np.min([
                    np.min(scale_qtypes[f"{inp}_2_{gate}_q"].qnorms)
                    for gate in ['z', 'r', 'h'] for inp in ['w', 'r']
                ]), 8)
            for gate in ['z', 'r', 'h']:
                for inp in ['w', 'r']:
                    scale_qtypes[
                        f"{inp}_2_{gate}_q"].pre_normalization = gate_prenorm
        else:
            gate_prenorm = 0

        scales = {
            'i': i_pscales,
            'r': r_pscales,
            'state': o_q.scale,
            'in': in_q.scale,
            'act_in': int_scale,
            'act_out': out_tanh_sig_scale,
            'act_in_q': act_in_q,
            'act_out_q': act_out_q
        }
        scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)

        return QRec.scaled(
            in_qs=in_qs,
            out_qs=[o_q],
            ne16=True,
            gate_prenorm=gate_prenorm,
            scales=scales,
            **scale_qtypes,
        )
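
For the 8-bit case above, the weight-offset correction woffs[gate] is pre-multiplied by the integer scale and the rounding constant (1 << (qnorm - 1)) of the following right shift is folded in, so the kernel is left with a single per-channel add. A sketch with hypothetical numbers:

import numpy as np

woff = np.int32(-1234)       # hypothetical weight zero-point correction
qbias = np.int32(87)         # hypothetical integer channel multiplier
qnorm = np.int32(13)         # hypothetical right-shift amount

folded = woff * qbias + (np.int32(1) << (qnorm - 1))
acc = np.int32(56789)        # hypothetical raw gate accumulator
out = (acc * qbias + folded) >> qnorm                          # kernel: mul, add, shift
ref = ((acc + woff) * qbias + (1 << int(qnorm - 1))) >> qnorm  # correct first, then round-shift
assert out == ref
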
Example no. 22
0
    def quantize_ne16(cls, params, in_qs, stats, **kwargs):
        opts = kwargs['opts']
        force_out_qs, _ = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        fusion = kwargs.get('fusion', None)
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        # note that weights are signed since the zero point of weights is
        # calculated by NE16. The zero point needs to be removed during
        # code gen
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.uint8,
            ne16_order=True,
            narrow_range=True,
            bits=opts['weight_bits'])

        in_q = in_qs[0]
        # check input quantization and scale asymmetric uint8
        if in_q.dtype != np.uint8:
            # NOTE - a forced input dtype is ignored here, which is not very clean
            # if in_q.forced_dtype:
            #     return None
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.uint8,
                                         asymmetric=True)

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # can't be forced to something not np.uint8
            if o_q.dtype != np.uint8:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s',
                params.name, o_q.min, o_q.max, min_val, max_val)
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=np.uint8,
                                        asymmetric=True)
        biases_q = QType(dtype=np.int32,
                         scale=weights_q.scale * in_q.scale,
                         ne16_biases=True)

        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMult.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)
        cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
        # returning the new weights and biases qs will force backprop

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q,
                           ne16=True)
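
calculate_bias_offset is not shown in this listing; a common formulation (sketched here with made-up shapes, not necessarily what FilterMult implements) folds the asymmetric input zero point into the bias, since sum((x - zx) * w) = sum(x * w) - zx * sum(w):

import numpy as np

in_zp = np.int32(128)                      # asymmetric uint8 input zero point
w = np.random.randint(-127, 128, size=(4, 9), dtype=np.int8)   # made-up int8 weights
x = np.random.randint(0, 256, size=9).astype(np.int32)         # made-up uint8 input

bias_offset = -in_zp * w.astype(np.int32).sum(axis=1)     # one constant per out channel
lhs = ((x - in_zp) * w.astype(np.int32)).sum(axis=1)      # zero-point-corrected MAC
rhs = (x * w.astype(np.int32)).sum(axis=1) + bias_offset  # raw MAC plus folded offset
assert np.array_equal(lhs, rhs)
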
    def _quantize_lstm(cls, params, in_qs, stats, input_bits, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        if force_out_qs and any(force_out_q is not None
                                for force_out_q in force_out_qs):
            return None

        opts = kwargs.get('opts', {})

        if input_bits == 16:
            in_out_dtype = np.uint16
        else:
            in_out_dtype = np.uint8

        if in_qs is None:
            return None
        in_qs = deepcopy(in_qs)
        G = kwargs['G']

        in_q = in_qs[0]

        cls.check_valid_ranges(params, stats, idx=0, dirs='out')

        in_edges = G.indexed_in_edges(params.name)

        names = {
            val: idx
            for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
        }

        o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
            min_val=stats['range_out'][0]['min'],
            max_val=stats['range_out'][0]['max'],
            dtype=in_out_dtype,
            narrow_range=opts['narrow_state'])

        cell_range = stats.get('range_cell')
        if cell_range is None:
            raise ValueError(
                f'cell range not present in stats for {params.name}')

        # cell range is clamped to a minimum of 1.0
        cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])

        if params.cell_clip and not params.quant_c_state_with_stat:
            cell_max = params.cell_clip
            ratio_c = cell_max / cell_stat
            if not (ratio_c > 0.9 and ratio_c < 1.1):
                msg = (
                    f"C state is forced to a range [-{cell_max}:{cell_max}] different from the one calculated "
                    f"from the inference statistics [-{cell_stat}:{cell_stat}]; consider using nodeoption {params.name} "
                    "QUANT_C_STATE_WITH_STAT 1 to force it to the calculated one"
                )
                LOG.warning('%s', msg)
        else:
            cell_max = cell_stat

        # this limit is driven by the c_in * f + c * i calculation
        # c * i will be in Q24 and we want c_in * f to be scaled to the same
        # abs(f) will be <=1 so the cell int bits cannot exceed 31 - 1 (overflow) - 24 = 6
        cell_limit = pow(2, 6)
        if cell_max > cell_limit:
            LOG.warning('Cell state exceeds %s and will be clipped',
                        cell_limit)
            cell_max = cell_limit

        cell_int_bits = calc_bits(cell_max)
        # cell stays signed since it is used in a Hadamard product with the
        # int32 streamout in NE16
        in_qs[names['c_state']] = QType.from_min_max_sq(
            -cell_max,
            cell_max,
            dtype=np.int16 if input_bits == 16 else np.int8)

        LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
                  in_qs[names['c_state']].range)

        # set weight qtypes
        int_num_inp = roundup(params.n_inputs, input_bits == 16)
        int_num_states = roundup(params.n_states, input_bits == 16)
        woffs = {}

        in_q = limit_input_precision(params, input_bits, in_q, int_num_inp,
                                     opts['narrow_weights'],
                                     opts['weight_bits'])

        o_q = limit_input_precision(
            params,
            input_bits,
            o_q,
            int_num_states,
            opts['narrow_weights'],
            opts['weight_bits'],
            extra_correction=-1 if opts.get('narrow_state') else 0)

        for gate in ['i', 'o', 'c', 'f']:
            i_idx = names[f'i_2_{gate}_w']
            r_idx = names[f'r_2_{gate}_w']

            woffs[gate] = woff_gate = [None, None]
            woff_gate[0] = calculatate_weight_q(
                in_qs, in_edges, i_idx, in_q.zero_point[0],
                (params.n_states, params.n_inputs),
                (params.n_states, int_num_inp), opts['weight_bits'],
                opts.get('narrow_weights'))

            woff_gate[1] = calculatate_weight_q(
                in_qs, in_edges, r_idx, o_q.zero_point[0],
                (params.n_states, params.n_states),
                (params.n_states, int_num_states), opts['weight_bits'],
                opts.get('narrow_weights'))

        # get weight scales
        scale_pairs = {
            chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
            for chan in ['i', 'o', 'c', 'f']
        }
        w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                    for k, (namei, namer) in scale_pairs.items()]

        gate_sum_max = [(get_max_or_one(stats[f'range_{gate}_gate_i']),
                         get_max_or_one(stats[f'range_{gate}_gate_r']))
                        for gate in ['i', 'o', 'c', 'f']]

        gate_sum_max_bits = [
            (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
             np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
            for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
        ]

        for gate, (max_i, max_r) in zip(['i', 'o', 'c', 'f'],
                                        gate_sum_max_bits):
            if np.max(max_i) > 30:
                LOG.warning(
                    'max bits in accumulation input %s gate %s - there may be errors',
                    max_i, gate)
            if np.max(max_r) > 30:
                LOG.warning(
                    'max bits in accumulation state %s gate %s - there may be errors',
                    max_r, gate)

        # LUT activations Q12 -> Q15
        act_in_q = 12
        act_out_q = 15
        int_scale = math.pow(2, -act_in_q)
        out_tanh_sig_scale = math.pow(2, -act_out_q)

        scale_qtypes = {}
        r_pscales = {}
        i_pscales = {}
        scale_qtypes['r_pscales'] = r_pscales
        scale_qtypes['i_pscales'] = i_pscales
        for gate, w_scale, max_bits in zip(['i', 'o', 'c', 'f'], w_scales,
                                           gate_sum_max_bits):
            weight_scale_ratio = w_scale[0] / w_scale[1]
            # TODO - decide whether to scale the weights equally

            i_pscales[gate] = w_scale[0] * in_q.scale
            r_pscales[gate] = w_scale[1] * o_q.scale
            if input_bits == 16:
                scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                    scale=i_pscales[gate] / int_scale)
            else:
                scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                    scale=i_pscales[gate] / r_pscales[gate])
            if input_bits == 16:
                i_zp_b = woffs[gate][0]
            else:
                i_zp_b = woffs[gate][0] * qscale.qbiases.astype(np.int32) + (
                    1 << (qscale.qnorms.astype(np.int32) - 1))

            scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=r_pscales[gate] / int_scale)
            if input_bits == 16:
                r_zp_b = woffs[gate][1]
                in_qs[names[f'{gate}_b']] = QType(dtype=np.int32,
                                                  scale=r_pscales[gate],
                                                  offset=r_zp_b,
                                                  interleaved_values=[i_zp_b])
            else:
                r_zp_b = woffs[gate][1] * qscale.qbiases.astype(np.int32) + (
                    1 << (qscale.qnorms.astype(np.int32) - 1))
                in_qs[names[f'{gate}_b']] = QType(dtype=np.int32,
                                                  scale=r_pscales[gate] /
                                                  qscale.qbiases,
                                                  offset=r_zp_b,
                                                  interleaved_values=[i_zp_b])

        # NOTE - for 16 bit, pre-normalize the scales to give us room, making sure no residual norm goes negative
        if input_bits == 16:
            gate_prenorm = min(
                np.min([
                    np.min(scale_qtypes[f"{inp}_2_{gate}_q"].qnorms)
                    for gate in ['i', 'o', 'c', 'f'] for inp in ['i', 'r']
                ]), 8)
            for gate in ['i', 'o', 'c', 'f']:
                for inp in ['i', 'r']:
                    scale_qtypes[
                        f"{inp}_2_{gate}_q"].pre_normalization = gate_prenorm
        else:
            gate_prenorm = 0

        r_pscales['state_out_scale'] = o_q.scale
        r_pscales['int_scale'] = int_scale

        # ct = c_in * f + c * i
        # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
        # scale(c_in * f) = Qcell * Q15 (prenorm if 16bit) and scale -> Q12
        # ((c_in * f) + (c * i)) in Q12
        # scale -> cell_out
        # tan(ct) -> Q15
        # o * tan(ct) -> Q30
        # prenorm and scale

        # scale result of c_state_1 * f_gate -> Q15: f_gate is Q15 and the
        # target is also Q15, so the two factors cancel and only the cell
        # scale remains
        cell_in_scale = (in_qs[names['c_state']].scale * out_tanh_sig_scale /
                         out_tanh_sig_scale)

        # cell_out from Q15 -> Q7/Q15 scaled
        cell_out_scale = out_tanh_sig_scale / in_qs[names['c_state']].scale

        state_out_scale = out_tanh_sig_scale / o_q.scale

        r_pscales['act_out_scale'] = out_tanh_sig_scale
        r_pscales['c_before_scale'] = int_scale

        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
        # NOTE - for 16 bit pre-normalize the scales to give us room
        if input_bits == 16:
            scale_qtypes['cell_in_q'].pre_normalization = 8
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=cell_out_scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
            scale=state_out_scale)
        scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)
        if params.lstm_output_c_state:
            out_qs = [o_q, in_qs[names['c_state']]]
        else:
            out_qs = [o_q]

        return QRec.scaled(
            in_qs=in_qs,
            out_qs=out_qs,
            ne16=True,
            gate_prenorm=gate_prenorm,
            **scale_qtypes,
        )
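
The 16-bit gate pre-normalization above picks the smallest qnorm across every input and recurrent scaler of every gate, capped at 8, so the shared pre-shift never leaves a scaler with a negative residual norm. A compact sketch with assumed norms:

import numpy as np

qnorms = {                                  # hypothetical per-scaler norms
    'i_2_i_q': np.array([12, 10]), 'r_2_i_q': np.array([11, 13]),
    'i_2_o_q': np.array([9, 12]),  'r_2_o_q': np.array([14, 10]),
}
gate_prenorm = min(min(int(q.min()) for q in qnorms.values()), 8)
print(gate_prenorm)                         # 8: capped, since the smallest norm is 9
for q in qnorms.values():
    assert (q - gate_prenorm >= 0).all()    # no residual shift goes negative
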