Example #1
    @classmethod
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.int8,
            narrow_range=opts['narrow_weights'])
        if fusion and fusion.fusion_type in [
                'conv_active_pool', 'conv_active'
        ]:
            activation = fusion.contained_nodes()[1]
            if isinstance(activation,
                          (SigmoidActivationParameters,
                           TanHActivationParameters,
                           HSwishActivationParameters)):
                # use stats from the convolution output for these activations
                stats = kwargs['all_stats'][NodeId(
                    fusion, fusion.contained_nodes()[0])]
            elif isinstance(activation, HSigmoidActivationParameters):
                # hard sigmoid implements a RELU; make sure 6 is representable
                min_val, max_val = 0, 6
            else:
                # take stats from the activation after the convolution
                stats = kwargs['all_stats'][NodeId(
                    fusion, fusion.contained_nodes()[1])]

        if min_val is None or max_val is None:
            min_val, max_val = (stats['range_out'][0]['min'],
                                stats['range_out'][0]['max'])

        if force_out_q:
            o_q = force_out_q
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype)
        biases_q = QType(dtype=np.int32,
                         scale=weights_q.scale * in_qs[0].scale)
        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_qs[0], weights_q, o_q, params)
        # returning the new weights and biases qs will force backprop
        # TODO - ACC_Q LOOKS WRONG AFTER THIS
        return MultScalableFilterQuantizationRecord(
            in_qs=[in_qs[0], weights_q, biases_q],
            out_qs=[o_q],
            acc_q=biases_q,
            calc_q=biases_q,
            mul_biases_q=mul_biases_q)
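
A minimal numpy sketch of the per-channel symmetric scaling that QType.from_array_sq presumably performs for the int8 weights above; the helper name and the exact narrow-range convention are assumptions, not the library's API:

import numpy as np

def per_channel_symmetric_q(weights, quantized_dimension=0, narrow_range=True, bits=8):
    # reduce over every axis except the per-channel (quantized) dimension
    axes = tuple(i for i in range(weights.ndim) if i != quantized_dimension)
    max_abs = np.abs(weights).max(axis=axes, keepdims=True)
    qmax = (1 << (bits - 1)) - 1                 # 127 for int8
    qmin = -qmax if narrow_range else -qmax - 1  # narrow range drops -128
    scale = max_abs / qmax                       # symmetric: zero point is 0
    q = np.clip(np.round(weights / scale), qmin, qmax).astype(np.int8)
    return q, np.squeeze(scale)

w = np.random.uniform(-0.6, 0.6, (4, 3, 3, 3)).astype(np.float32)  # out_c first
qw, scales = per_channel_symmetric_q(w)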
Example #2
    @classmethod
    def _quantize(cls, params, in_qs, stats, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # only attempt channel scaling if the second input is constant
        # if len(in_qs) > 2:
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if in2_node:
            kwargs['graph_update']['requires_adjust'] = True
            in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                        quantized_dimension=0,
                                        dtype=np.int8,
                                        narrow_range=True,
                                        bits=8)
        else:
            in_q2 = in_qs[1].make_symmetric_signed()

        in_q1 = in_qs[0].make_symmetric_signed()

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # output can only be forced to symmetric np.int8
            if o_q.dtype != np.int8 or o_q.asymmetric:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q)
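
To see why mul_biases_q.scale is set to in_q1.scale * in_q2.scale / o_q.scale above, here is a small numeric check of requantizing an int8 elementwise product with that single scale (scales and values are made up for illustration):

import numpy as np

s1, s2, so = 0.02, 0.05, 0.01           # assumed tensor scales
q1 = np.array([10, -20, 50], np.int8)   # quantized inputs
q2 = np.array([30, 12, -4], np.int8)
acc = q1.astype(np.int32) * q2.astype(np.int32)  # 32-bit accumulator
out = np.clip(np.round(acc * (s1 * s2 / so)), -128, 127).astype(np.int8)
# matches the float reference (q1*s1) * (q2*s2) / so
assert np.allclose(out, np.round((q1 * s1) * (q2 * s2) / so))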
Example #3
    @classmethod
    def _quantize(cls, params, in_qs, stats, **kwargs):
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        # if forced, use the forced quantization
        if force_out_q:
            o_q = deepcopy(force_out_q)
        # if the value is already quantized then keep the same quantization
        elif params.qtype:
            o_q = deepcopy(params.qtype)
        # otherwise derive quantization from the constant's value
        else:
            o_q = QType.from_array_sq(params.value, dtype=out_dtype)
        o_q.is_constant = True
        return QRec.scaled(out_qs=[o_q])
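
A compact restatement of the precedence implemented above, with plain Python stand-ins for the QType/QRec machinery (the helper and argument names here are hypothetical, for illustration only):

from copy import deepcopy

def pick_constant_qtype(force_out_q, existing_qtype, derive_from_value):
    # precedence: an externally forced qtype wins, then a qtype already
    # assigned to the constant, and only then derive one from the values
    if force_out_q:
        return deepcopy(force_out_q)
    if existing_qtype:
        return deepcopy(existing_qtype)
    return derive_from_value()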
Example #4
def calculatate_weight_q(in_qs,
                         in_edges,
                         w_idx,
                         in_zero_point,
                         real_dim,
                         padded_dim,
                         qw,
                         narrow):
    # calculates weight qtype and zero offset bias correction

    wnode = in_edges[w_idx].from_node
    extra_attrs = {'bit_pack': qw} if qw < 8 else {}
    in_qs[w_idx] = QType.from_array_sq(
        wnode.dqvalue,
        dtype=np.uint8,
        bits=qw,
        narrow_range=narrow,
        quantized_dimension=0,
        resize=(
            real_dim,
            padded_dim
        ),
        ne16_decode={
            'type': 'RNN',
            'Ko': real_dim[0],
            'KiReal': real_dim[1],
            'Ki': padded_dim[1],
            'Qw': qw
        },
        no_compression=True,
        **extra_attrs)
    w_q = in_qs[w_idx]

    # since NE16 adds the weight zero offset, use the signed value
    weight_val = wnode.value_as(w_q).astype(np.int32) - w_q.zero_point

    # return zero offset
    return np.sum(
        -in_zero_point.astype(np.int32) * weight_val,
        axis=1, dtype=np.int32)
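
A numeric check of the correction returned above, with assumed values: expanding the integer matmul sum_k (x_q[k] - Zx) * (w_q[o, k] - Zw) yields the constant term sum_k -Zx * (w_q[o, k] - Zw), which is exactly what gets folded into the bias:

import numpy as np

Zx = np.int32(3)                                      # assumed input zero point
Zw = np.int32(128)                                    # assumed weight zero point
w_q = np.array([[7, 250, 1], [128, 0, 9]], np.int32)  # assumed uint8 weight values
weight_val = w_q - Zw                                 # signed weights, as in the code above
offset = np.sum(-Zx * weight_val, axis=1, dtype=np.int32)  # one correction per out channel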
Example #5
    @classmethod
    def quantize_ne16(cls, params, in_qs, stats, **kwargs):
        opts = kwargs['opts']
        force_out_qs, _ = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        fusion = kwargs.get('fusion', None)
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        # note that the effective weights are signed: NE16 applies the weight
        # zero point, which must be removed during code generation
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.uint8,
            ne16_order=True,
            narrow_range=True,
            bits=opts['weight_bits'])

        in_q = in_qs[0]
        # check input quantization and scale asymmetric uint8
        if in_q.dtype != np.uint8:
            # a forced dtype is ignored here, which is not very clean
            # if in_q.forced_dtype:
            #     return None
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.uint8,
                                         asymmetric=True)

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # can't be forced to something not np.uint8
            if o_q.dtype != np.uint8:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s',
                params.name, o_q.min, o_q.max, min_val, max_val)
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=np.uint8,
                                        asymmetric=True)
        biases_q = QType(dtype=np.int32,
                         scale=weights_q.scale * in_q.scale,
                         ne16_biases=True)

        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMult.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)
        cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
        # returning the new weights and biases qs will force backprop

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q,
                           ne16=True)
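
Why biases_q is scaled by weights_q.scale * in_q.scale: the int32 accumulator of the integer filter holds values in units of s_w * s_in, so a bias quantized at that same scale can be added to it directly. A short numeric illustration with made-up scales:

import numpy as np

s_in, s_w = 0.05, 0.002
acc = np.int32(1234)               # accumulator: real value = acc * s_in * s_w
bias_real = 0.0617
bias_q = np.int32(round(bias_real / (s_in * s_w)))  # = 617
acc_with_bias = acc + bias_q       # still in units of s_in * s_w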
Example #6
    @classmethod
    def _quantize(cls, params, in_qs, stats, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        if cls.can_ne16(params, opts, fusion):
            LOG.info('selecting USQ8 NE16 kernel filter quantizer')
            return cls.quantize_ne16(params, in_qs, stats, **kwargs)
        LOG.info('selecting SQ8 software kernel filter quantizer')
        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        G = kwargs['G']
        in_q = in_qs[0]

        # check input quantization and int8 type
        # if not padded we can scale asymmetric
        if in_q.dtype == np.uint8:
            # handle NE16
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            # allow asymmetric if not padded
            if isinstance(params,
                          Conv2DParameters) and params.padding.has_padding:
                in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                             stats['range_in'][0]['max'],
                                             dtype=np.int8,
                                             forced=True)
            else:
                in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                             stats['range_in'][0]['max'],
                                             dtype=np.int8,
                                             zero_point=in_q.zero_point - 128)
        elif (isinstance(params, Conv2DParameters) and not in_q.is_symmetric
              and params.padding.has_padding):
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8)
        # if not forced we can try asymmetric
        elif (opts['allow_asymmetric'] and isinstance(params, Conv2DParameters)
              and not in_q.forced and in_q.is_symmetric
              and not params.padding.has_padding):
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         asymmetric=True)

        if opts['weight_bits'] != 8:
            LOG.warning(
                'sub-byte weight quantization requested but NE16 kernel not selected'
            )
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        weights_q = QType.from_array_sq(
            arr=weights_node.dqvalue,
            quantized_dimension=cls.get_quantized_dimension(params, opts),
            dtype=np.int8,
            narrow_range=opts['narrow_weights'],
            bits=opts['weight_bits'])

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            # can't be forced to something not np.int8
            if o_q.dtype != np.int8:
                return None
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.is_asymmetric else "symmetric")
        else:
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=out_dtype,
                                        asymmetric=opts['allow_asymmetric'])
        biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_q.scale)
        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # returning the new weights and biases qs will force backprop

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMult.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)

        if not (opts['allow_asymmetric'] or force_out_q
                or biases_q.offset is None):
            raise ValueError(
                f'bias offset is set but asymmetric is disallowed in {params.name}'
            )

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        if isinstance(params, Conv2DParameters) and params.padding.has_padding:
            in_q.set_forced(flags=['zero_point'])

        cls.check_order(params, AT_SW_KER_IN_ORDER, AT_SW_KER_OUT_ORDER)
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q)
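
The zero_point=in_q.zero_point - 128 branch above rewraps a uint8 asymmetric qtype as int8 without changing the represented reals; a quick numeric check under assumed values:

import numpy as np

scale, zp_u8 = 0.1, np.int32(140)          # assumed uint8 scale / zero point
x_u8 = np.array([0, 140, 255], np.uint8)
real = scale * (x_u8.astype(np.int32) - zp_u8)
x_i8 = (x_u8.astype(np.int32) - 128).astype(np.int8)  # shift values by -128
zp_i8 = zp_u8 - 128                                    # shift zero point by -128
assert np.allclose(real, scale * (x_i8.astype(np.int32) - zp_i8))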
Example #7
    @classmethod
    def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8

        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]

        G = kwargs['G']
        # only attempt channel scaling if the second input is constant
        in2_node, in_qs = cls.move_constant(G, fusion if fusion else params,
                                            in_qs)
        if not in2_node:
            raise ValueError(
                f"matmul {params.name} is not supported by NE16")

        w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
        h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
        h2_padded = roundup(h2, input_bits == 16)
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.uint8,
                                    narrow_range=True,
                                    bit_pack=opts['weight_bits'],
                                    no_compression=True,
                                    bits=opts['weight_bits'],
                                    resize=((h2, w2), (h2_padded, w2)))

        in_q1 = QType.from_min_max_sq(in_qs[0].min_val,
                                      in_qs[0].max_val,
                                      dtype=input_dtype,
                                      asymmetric=True)
        in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                      opts['weight_bits'])

        min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'],
                                           params)

        if force_out_q:
            o_q = force_out_q
            LOG.warning(
                'node %s output forced to range %s/%s - actual range %s/%s %s',
                params.name, o_q.min, o_q.max, min_val, max_val,
                "asymmetric" if o_q.asymmetric else "symmetric")
        else:
            force_output_size = opts.get('force_output_size', 8)
            out_dtype = np.uint8 if force_output_size == 8 else np.uint16
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dont_copy_attr=['ne16'],
                                        asymmetric=True,
                                        dtype=out_dtype)
        if len(in_qs) == 3:
            biases_q = QType(dtype=np.int32,
                             scale=in_q1.scale * in_q2.scale,
                             ne16_biases=(input_bits != 16))
            # calculate bias offset - this will be added to the bias in the kernel
            # it is already in quantized form
            bias_offset = np.zeros((in2_node.dqvalue.shape[0], ),
                                   dtype=np.int32)
            if in_q1.zero_point != 0:
                # input zero correction is sum(W * Zin) per out_c if weights are channel scaled
                bias_offset -= np.sum(
                    np.multiply(in_q1.zero_point,
                                in2_node.value_as(in_q2).astype(np.int32) -
                                in_q2.zero_point,
                                dtype=np.int32),
                    dtype=np.int32,
                    axis=1)
            if o_q.zero_point != 0:
                # output zero correction is So/(Si * Sw) * ZPo per out_c if weights are channel scaled
                scale = o_q.scale / (in_q1.scale * in_q2.scale)
                bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(
                    np.int32)
            if not np.all(bias_offset == 0):
                biases_q.offset = bias_offset
            out_in_qs = [in_q1, in_q2, biases_q]
        else:
            out_in_qs = [in_q1, in_q2]

        mul_biases_q = MultMulBiasScaleQType()
        mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
        o_q.attr.ne16 = True

        if input_bits == 16:
            prenorm = min(np.min(np.min(mul_biases_q.qnorms)), 8)
        else:
            prenorm = 0
        mul_biases_q.pre_normalization = prenorm

        return QRec.scaled(in_qs=out_in_qs,
                           out_qs=[o_q],
                           mul_biases_q=mul_biases_q,
                           ne16=True)
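
A numeric check of the two corrections folded into bias_offset above, for a single output channel under assumed values: the input zero point contributes -sum_k Zx * (w_q[k] - Zw), and the output zero point Zo contributes Zo * So / (Si * Sw) in accumulator units so that requantization lands on Zo for a zero-valued output:

import numpy as np

Si, Sw, So = 0.02, 0.004, 0.1
Zx, Zw, Zo = 5, 128, 132
w_q = np.array([130, 120, 128], np.int32)  # assumed uint8 weights, one channel
offset = -np.sum(Zx * (w_q - Zw), dtype=np.int32)
offset += np.int32(np.floor(Zo * So / (Si * Sw) + 0.5))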
Example #8
    @classmethod
    def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
        # copy in_qs because we may modify it
        in_qs = in_qs.copy()
        input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
        opts = kwargs['opts']
        fusion = kwargs.get('fusion', None)
        LOG.info('selecting USQ8 NE16 kernel filter quantizer')
        force_out_qs, _ = cls.get_mult_opts(**kwargs)
        force_out_q = force_out_qs and force_out_qs[0]
        G = kwargs['G']
        weights_node = cls.get_weights_node(G, fusion if fusion else params)
        min_val, max_val = None, None
        weights_q = QType.from_array_sq(arr=weights_node.dqvalue,
                                        quantized_dimension=cls.get_quantized_dimension(
                                            params, opts),
                                        dtype=np.uint8,
                                        narrow_range=True,
                                        bit_pack=opts['weight_bits'],
                                        no_compression=True,
                                        bits=opts['weight_bits'])

        in_q = in_qs[0]
        in_q = limit_input_precision(
            params, input_bits, in_q, params.filter.sz, opts['narrow_weights'], opts['weight_bits'])

        # input dtype is either uint8 or int8
        if in_q.dtype != input_dtype:
            if in_q.forced_dtype:
                return None
            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'], stats['range_in'][0]['max'],
                                         dtype=input_dtype,
                                         asymmetric=False)

        min_val, max_val = cls.get_min_max(
            fusion, stats, kwargs['all_stats'], params)

        if force_out_q:
            o_q = deepcopy(force_out_q)
            o_q.dont_copy_attr = ['ne16']
            LOG.warning('node %s output forced to range %s/%s - actual range %s/%s',
                        params.name, o_q.min, o_q.max, min_val, max_val)
        else:
            force_output_size = opts.get('force_output_size', 8)
            output_dtype = np.uint8 if force_output_size == 8 else np.uint16
            o_q = QType.from_min_max_sq(min_val=min_val,
                                        max_val=max_val,
                                        dtype=output_dtype,
                                        dont_copy_attr=['ne16'],
                                        asymmetric=True)
        o_q.attr.ne16 = True
        biases_q = QType(
            dtype=np.int32, scale=weights_q.scale * in_q.scale,
            ne16_biases=(input_bits != 16))

        mul_biases_q = MultMulBiasScaleQType.from_filter(
            in_q, weights_q, o_q, params)

        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        biases_q.offset = FilterMultNE16Base.calculate_bias_offset(
            params, in_q, weights_node, weights_q, o_q)
        # returning the new weights and biases qs will force backprop

        cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)

        if input_bits == 16:
            prenorm = min(np.min(np.min(mul_biases_q.qnorms)), 8)
        else:
            prenorm = 0
        mul_biases_q.pre_normalization = prenorm

        # o_q.set_forced(flags=['dtype'])
        # in_q.set_forced(flags=['dtype'])
        return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                           out_qs=[o_q],
                           acc_q=biases_q,
                           calc_q=biases_q,
                           mul_biases_q=mul_biases_q,
                           ne16=True)
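
For the asymmetric uint8 outputs chosen above, a minimal sketch of what QType.from_min_max_sq(..., asymmetric=True) presumably computes from the observed range (the rounding conventions here are assumptions, not the library's exact behaviour):

import numpy as np

min_val, max_val = -1.2, 3.8                     # assumed observed range
scale = (max_val - min_val) / 255.0              # uint8 grid
zero_point = int(np.clip(round(-min_val / scale), 0, 255))

def quantize(x):
    return np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)

assert quantize(np.array([0.0])) == zero_point   # zero maps exactly to the zero point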