Example 1: loading filter weights, biases and quantization parameters from a TFLite file
    def load_filter_parameters(cls, G, params, input_tensors, output_tensors, opts, converted_to_conv=False):
        if opts.get('load_tensors') or opts.get('load_quantization'):
            params.weights = input_tensors[1].value
            if converted_to_conv:
                params.weights = params.weights.transpose(cls.TF_LITE_DW_FILTER_TRANSPOSE)
            if params.has_bias:
                params.biases = input_tensors[2].value

        if opts.get('load_quantization'):
            if input_tensors[0].qtype is None:
                raise NoQuantizationError("quantization not present in tflite file")
            weights_scales, biases_scales, w_mins, w_maxes = cls.fix_weights_and_biases(
                params, input_tensors, opts)
            biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=biases_scales.flatten())
            weights_q = SymmetricMultQType(
                dtype=np.int8, narrow_range=True, scale=weights_scales.flatten(), min_val=w_mins, max_val=w_maxes)
            in_q = input_tensors[0].qtype
            out_q = output_tensors[0].qtype
            mulbiases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, out_q, params)
            qrec = MultScalableFilterQuantizationRecord(in_qs=[in_q],
                                                        out_qs=[out_q],
                                                        mul_biases_q=mulbiases_q,
                                                        weights_q=weights_q,
                                                        biases_q=biases_q)
            G.quantization[NodeId(params)] = qrec
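For reference, the per-channel scales fed into SymmetricMultQType by fix_weights_and_biases boil down to a symmetric min/max reduction over the non-quantized axes. A minimal sketch of that computation, assuming narrow-range int8; the helper below is illustrative, not the library's actual code:

    import numpy as np

    def symmetric_scales(weights, axis=0):
        # one scale per channel on the quantized dimension; zero point fixed at 0
        reduce_axes = tuple(i for i in range(weights.ndim) if i != axis)
        w_mins = weights.min(axis=reduce_axes)
        w_maxes = weights.max(axis=reduce_axes)
        abs_max = np.maximum(np.abs(w_mins), np.abs(w_maxes))
        return abs_max / 127.0, w_mins, w_maxes  # narrow-range int8 maps to [-127, 127]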
Example 2: building per-node symmetric quantization records in calculate_q
    def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
        del G
        if out_dtype is None:
            out_dtype = dtype
        if isinstance(node, (PoolingParameters, OutputParameters)):
            o_q = in_qs[0]
        elif isinstance(node, SoftMaxParameters):
            o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
        else:
            o_q = SymmetricMultQType.from_min_max(min_val=astats['min'],
                                                  max_val=astats['max'],
                                                  dtype=out_dtype)

        if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
            qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])

        elif isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters, GlobalPoolParameters)):
            qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])

        elif isinstance(node, ConstantInputParameters):
            qrec = MultConstantQuantizationRecord(out_qs=[o_q],
                                                  constants_are_quantized=False)

        elif isinstance(node, (FcParameters, Conv2DParameters)):
            weights_q = SymmetricMultQType.from_array(arr=node.weights,
                                                      quantized_dimension=self.get_quantized_dimension(node),
                                                      dtype=dtype, narrow_range=self._narrow_weights)
            if node.has_bias:
                biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
            else:
                biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=np.array([1], dtype=np.int32))
            mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
            qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                        out_qs=[o_q],
                                                        weights_q=weights_q,
                                                        biases_q=biases_q,
                                                        mul_biases_q=mul_biases_q,
                                                        constants_are_quantized=False)
            LOG.debug("filter %s qrec %s", node.name, qrec)
        else:
            qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
        return qrec
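The bias quantization in the filter branch follows from integer matmul arithmetic: the int32 accumulator holds sums of weight x input products, so biases must be stored at scale weights_q.scale * in_qs[0].scale to be added to it directly. A numeric sketch with made-up scales:

    import numpy as np

    in_scale = 0.02                          # assumed input scale
    w_scales = np.array([0.004, 0.0035])     # assumed per-channel weight scales
    bias_scales = w_scales * in_scale        # accumulator scale, per channel

    biases = np.array([0.13, -0.07])
    q_biases = np.round(biases / bias_scales).astype(np.int32)  # added to the accumulator as-is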
Example 3: calculate_q extended to expression fusions, splits, RNN and LSTM nodes
    def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
        if out_dtype is None:
            out_dtype = dtype
        if isinstance(node, (PoolingParameters, OutputParameters, SplitParameters)):
            o_q = in_qs[0]
        elif isinstance(node, SoftMaxParameters):
            o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
        else:
            o_q = SymmetricMultQType.from_min_max(min_val=astats['range_out'][0]['min'],
                                                  max_val=astats['range_out'][0]['max'],
                                                  dtype=out_dtype)

        if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
            qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
        elif isinstance(node, ExpressionFusionParameters):
            o_qs = [SymmetricMultQType.from_min_max(min_val=orange['min'],
                                                    max_val=orange['max'],
                                                    dtype=out_dtype)
                    for orange in astats['range_out']]
            fusion_inputs = sorted([n for n in node.subgraph.inputs()
                                    if isinstance(n, FusionInputParameters)],
                                   key=lambda x: x.idx)
            fusion_outputs = sorted([n for n in node.subgraph.outputs()
                                     if isinstance(n, FusionOutputParameters)],
                                    key=lambda x: x.idx)

            node_scale_map = {fnode: in_qs[idx].scale
                              for idx, fnode in enumerate(fusion_inputs)}
            for idx, fnode in enumerate(fusion_outputs):
                node_scale_map[fnode] = o_qs[idx].scale
            inp, outp, expr = node.decompose(node_scale_map=node_scale_map)

            qrec = MultExpressionQuantizationRecord(in_qs=in_qs,
                                                    out_qs=o_qs,
                                                    inputs=inp,
                                                    output_exprs=outp,
                                                    intermediate_exprs=expr)
        elif isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters, GlobalPoolParameters)):
            qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])

        elif isinstance(node, SplitParameters):
            qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]*node.num_splits)

        elif isinstance(node, ConstantInputParameters):
            if node.value_quantization:
                qrec = MultConstantQuantizationRecord(out_qs=[node.value_quantization],
                                                      constants_are_quantized=True)
            else:
                qrec = MultConstantQuantizationRecord(out_qs=[o_q],
                                                      constants_are_quantized=False)

        elif isinstance(node, (FcParameters, Conv2DParameters)):
            weights_q = SymmetricMultQType.from_array(arr=node.weights,
                                                      quantized_dimension=self.get_quantized_dimension(
                                                          node),
                                                      dtype=dtype, narrow_range=self._narrow_weights)
            if node.has_bias:
                biases_q = SymmetricMultBiasesQType(
                    dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
            else:
                biases_q = SymmetricMultBiasesQType(
                    dtype=np.int32, scale=np.array([1], dtype=np.int32))
            mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
            qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                        out_qs=[o_q],
                                                        weights_q=weights_q,
                                                        biases_q=biases_q,
                                                        mul_biases_q=mul_biases_q,
                                                        constants_are_quantized=False)
            LOG.debug("filter %s qrec %s", node.name, qrec)
        elif isinstance(node, RNNParameters):
            input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                           for edge in G.in_edges(node.name)
                           if isinstance(edge.from_node, ConstantInputParameters)}
            names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
            # quantization_mode: extended, autotiler
            # state_width: 16bit or 8bit
            opts = self.get_options(node)
            if opts['mode'] == "extended":
                in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
                state_w_scale = in_qs[names['r_2_i_w']].scale
                i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale/state_w_scale)
                s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
                s_2_o_q = MultMulBiasScaleQType(scale=1/o_q.scale)
                self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
                qrec = MultScalableRnnQuantizationRecord(
                    in_qs=in_qs,
                    out_qs=[o_q],
                    i_2_a_q=i_2_a_q,
                    s_2_s_q=s_2_s_q,
                    s_2_o_q=s_2_o_q
                )
            elif opts['mode'] == 'autotiler':
                in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
                in_and_state_w_scale = np.maximum(
                    in_qs[names['i_2_i_w']].scale, in_qs[names['r_2_i_w']].scale)
                in_qs[0].scale = in_and_state_scale
                o_q.scale = in_and_state_scale
                self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
                self.rescale_constant(input_nodes['i_2_i_w'], in_and_state_w_scale)
                self.rescale_constant(input_nodes['r_2_i_w'], in_and_state_w_scale)
                state_w_scale = in_and_state_scale * in_and_state_w_scale
                self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
                s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale/in_and_state_scale)
                qrec = MultScalableRnnQuantizationRecord(
                    in_qs=in_qs,
                    out_qs=[o_q],
                    s_2_s_q=s_2_s_q,
                )
        elif isinstance(node, LSTMParameters):
            input_nodes = {LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                           for edge in G.in_edges(node.name)
                           if isinstance(edge.from_node, ConstantInputParameters)}
            names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
            if node.cell_clip:
                cell_max = node.cell_clip
            else:
                cell_max = max(abs(astats['range_cell'][var]) for var in ['min', 'max'])

            cell_int_bits = calc_bits(cell_max)

            in_qs[names['c_state']].recalculate_scale(-cell_max,
                                                      cell_max)
            LOG.debug("cell bits %d max %d cell range %d",
                      cell_int_bits,
                      cell_max,
                      in_qs[names['c_state']].range)
            # worst case is (internal_q * 3) + 2 <= 32 (1 bit for the integer part
            # and 1 for the sign), i.e. internal_q <= 10,
            # but also (internal_q * 2) + cell_bits <= 32
            int_q = min((32-cell_int_bits)//2, 10)
            # in and out and state are all in the same scale
            in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_and_state_scale = np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
            scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                           for chan in ['i', 'o', 'c', 'f']}
            scales = {k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                      for k, (namei, namer) in scale_pairs.items()}
            for k, (namei, namer) in scale_pairs.items():
                self.rescale_constant(input_nodes[namei], scales[k])
                self.rescale_constant(input_nodes[namer], scales[k])
            int_scale = pow(2, -int_q)
            int2_scale = pow(2, -(int_q*2))
            int3_scale = pow(2, -(int_q*3))
            # compute scales for perceptrons
            pscales = {k: scales[k] * in_and_state_scale for k in ['i', 'o', 'c', 'f']}
            scale_qtypes = {"r_2_%s_q" % k: MultMulBiasScaleQType(
                scale=pscale/int_scale) for k, pscale in pscales.items()}
            scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(
                scale=in_qs[names['c_state']].scale/int_scale)
            # TODO - Check cell clip here
            scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
                scale=int2_scale/in_qs[names['c_state']].scale)
            scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=int3_scale/in_and_state_scale)
            # set internal scale
            scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
            # set biases to output of perceptron
            for k in ['i', 'o', 'c', 'f']:
                self.rescale_constant(input_nodes["%s_b" % k], pscales[k], dtype=np.int32)
            qrec = MultScalableLstmQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                **scale_qtypes,
            )
        else:
            qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
        return qrec
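The int_q bound in the LSTM branch is a bit-budget computation: three Q(int_q) factors plus headroom must fit the 32-bit accumulator (3*int_q + 2 <= 32, so int_q <= 10), and the cell update needs 2*int_q + cell_int_bits <= 32. A worked sketch, with calc_bits approximated as the integer bits needed for the magnitude (an assumption about the real helper):

    import math

    def calc_bits(max_val):
        # assumed behaviour: integer bits (incl. sign) for magnitudes up to max_val
        return max(int(math.ceil(math.log2(max_val))), 0) + 1

    cell_max = 8.0                              # e.g. from cell_clip or observed stats
    cell_int_bits = calc_bits(cell_max)         # -> 4
    int_q = min((32 - cell_int_bits) // 2, 10)  # -> min(14, 10) = 10
    int_scale = pow(2, -int_q)                  # internal fixed-point scale, 2**-10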
Example 4: _quantize for GRU nodes
    def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
        qrecs = kwargs['qrecs']
        G = kwargs['G']

        o_q = SymmetricMultQType.from_min_max(
            min_val=stats['range_out'][0]['min'],
            max_val=stats['range_out'][0]['max'],
            dtype=out_dtype)

        input_nodes = {
            GRUParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
            for edge in G.in_edges(params.name)
            if isinstance(edge.from_node, ConstantInputParameters)
        }

        names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}

        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        # if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        #     LOG.info(
        #         "node %s has similar input and i_state scales --> "
        #         "will be generated the same_scale kernel with better performance", params.name)
        #     params.rnn_same_inout_scale = True
        #     G.node_options[NodeId(params)] = params.at_options

        if params.rnn_same_inout_scale:
            wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                               in_qs[names['r_2_z_w']].scale)
            wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                               in_qs[names['r_2_r_w']].scale)
            wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                               in_qs[names['r_2_h_w']].scale)
            i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
            in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            in_scale = state_scale = in_q.scale
        else:
            wWz_scale = in_qs[names['w_2_z_w']].scale
            wWr_scale = in_qs[names['w_2_r_w']].scale
            wWh_scale = in_qs[names['w_2_h_w']].scale
            rWz_scale = in_qs[names['r_2_z_w']].scale
            rWr_scale = in_qs[names['r_2_r_w']].scale
            rWh_scale = in_qs[names['r_2_h_w']].scale
            in_scale = in_qs[0].scale
            in_q = in_qs[0]
            state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
            state_scale = state_q.scale
            i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) /
                                               (rWz_scale * state_scale))
            i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) /
                                               (rWr_scale * state_scale))
            i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) /
                                               (rWh_scale * state_scale))

        if params.hard_act:
            i_qtype = QType(bits=32, q=15, signed=True, dtype=np.int32)
            h_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWh_scale * state_scale) / i_qtype.scale)
            r_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWr_scale * state_scale) / i_qtype.scale)
            z_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWz_scale * state_scale) / i_qtype.scale)
        else:
            i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
            h_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWh_scale * state_scale) / (math.pow(2, -12)))
            r_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWr_scale * state_scale) / (math.pow(2, -12)))
            z_WR_2_int_q = MultMulBiasScaleQType(
                scale=(rWz_scale * state_scale) / (math.pow(2, -12)))

        cls.rescale_constant(input_nodes['h_state'], state_q.scale, qrecs)
        in_qs[0].scale = in_scale
        o_q.scale = state_scale

        cls.rescale_constant(input_nodes['w_z_b'],
                             in_scale * wWz_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)
        cls.rescale_constant(input_nodes['w_r_b'],
                             in_scale * wWr_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)
        cls.rescale_constant(input_nodes['w_h_b'],
                             in_scale * wWh_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)
        cls.rescale_constant(input_nodes['r_z_b'],
                             state_scale * rWz_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)
        cls.rescale_constant(input_nodes['r_r_b'],
                             state_scale * rWr_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)
        cls.rescale_constant(input_nodes['r_h_b'],
                             state_scale * rWh_scale,
                             qrecs,
                             dtype=BIAS_DTYPE)

        cls.rescale_constant(input_nodes['w_2_z_w'],
                             wWz_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)
        cls.rescale_constant(input_nodes['w_2_r_w'],
                             wWr_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)
        cls.rescale_constant(input_nodes['w_2_h_w'],
                             wWh_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)
        cls.rescale_constant(input_nodes['r_2_z_w'],
                             rWz_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)
        cls.rescale_constant(input_nodes['r_2_r_w'],
                             rWr_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)
        cls.rescale_constant(input_nodes['r_2_h_w'],
                             rWh_scale,
                             qrecs,
                             dtype=WEIGHTS_DTYPE)

        return MultScalableGRUQuantizationRecord(in_qs=in_qs,
                                                 out_qs=[o_q],
                                                 i_2_z_WR_q=i_2_z_WR_q,
                                                 i_2_r_WR_q=i_2_r_WR_q,
                                                 i_2_h_WR_q=i_2_h_WR_q,
                                                 h_WR_2_int_q=h_WR_2_int_q,
                                                 r_WR_2_int_q=r_WR_2_int_q,
                                                 z_WR_2_int_q=z_WR_2_int_q,
                                                 i_qtype=i_qtype)
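Each MultMulBiasScaleQType(scale=...) above carries a positive float rescale factor that must eventually run as integer arithmetic. A common realization (an assumption here, not necessarily this library's exact layout) splits the factor into a uint8 multiplier and a right shift:

    import math

    def decompose_scale(scale, mult_bits=8):
        # scale ~= qmult * 2**-qshift, with qmult < 2**mult_bits
        qshift = mult_bits - int(math.floor(math.log2(scale))) - 1
        qmult = int(round(scale * 2 ** qshift))
        return qmult, qshift

    def rescale(acc, qmult, qshift):
        # integer-only requantization of an int32 accumulator value
        return (acc * qmult) >> qshift

    qmult, qshift = decompose_scale(0.00372)   # -> (244, 16)
    print(rescale(100000, qmult, qshift))      # 372 ~= 100000 * 0.00372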
Example 5: _quantize for LSTM nodes
    def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
        qrecs = kwargs['qrecs']
        G = kwargs['G']

        o_q = SymmetricMultQType.from_min_max(
            min_val=stats['range_out'][0]['min'],
            max_val=stats['range_out'][0]['max'],
            dtype=out_dtype)
        input_nodes = {
            LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
            for edge in G.in_edges(params.name)
            if isinstance(edge.from_node, ConstantInputParameters)
        }
        names = {
            val: idx
            for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
        }
        if params.cell_clip:
            cell_max = params.cell_clip
        else:
            cell_max = max(
                abs(stats['range_cell'][var]) for var in ['min', 'max'])

        cell_int_bits = calc_bits(cell_max)

        in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
        LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
                  in_qs[names['c_state']].range)
        int2_scale = int3_scale = out_tanh_sig_scale = None
        if params.hard_act:
            # worst case is (internal_q * 3) + 2 <= 32 (1 bit for the integer part
            # and 1 for the sign), i.e. internal_q <= 10,
            # but also (internal_q * 2) + cell_bits <= 32
            int_q = min((16 - cell_int_bits), 10)
            int2_scale = math.pow(2, -(int_q * 2))
            int3_scale = math.pow(2, -(int_q * 3))
        else:
            int_q = 12
            out_tanh_sig_scale = math.pow(
                2, -15)  # output of LUT activations are always Q15
        int_scale = math.pow(2, -int_q)

        if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "will be generated the same_scale kernel with better performances",
                params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options

        if params.rnn_same_inout_scale:
            if not np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
                LOG.warning(
                    "node %s has different input and i_state scales consider using the "
                    "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                    params.name)
            # in and out and state are all in the same scale
            in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
            i_state_scale = in_scale = np.maximum(
                in_and_out_scale, in_qs[names['i_state']].scale)
            in_qs[0].scale = in_scale
            o_q.scale = in_scale
            cls.rescale_constant(input_nodes['i_state'], i_state_scale, qrecs)
        else:
            in_scale = in_qs[0].scale
            i_state_scale = np.maximum(o_q.scale,
                                       in_qs[names['i_state']].scale)
            o_q.scale = i_state_scale
        scale_pairs = {
            chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
            for chan in ['i', 'o', 'c', 'f']
        }
        scales = {
            k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
            for k, (namei, namer) in scale_pairs.items()
        }
        for k, (namei, namer) in scale_pairs.items():
            cls.rescale_constant(input_nodes[namei], scales[k], qrecs)
            cls.rescale_constant(input_nodes[namer], scales[k], qrecs)
        # compute scales for perceptrons
        pscales = {k: scales[k] * i_state_scale for k in ['i', 'o', 'c', 'f']}
        scale_qtypes = {
            "r_2_%s_q" % k: MultMulBiasScaleQType(scale=pscale / int_scale)
            for k, pscale in pscales.items()
        }

        # if input and i_state have different scales -> scale the inputs before sum
        # otherwise do nothing and these scales will be ignored
        scale_qtypes.update({
            "i_2_%s_q" % k:
            MultMulBiasScaleQType(scale=in_scale / i_state_scale)
            for k in ['i', 'o', 'c', 'f']
        })

        if params.hard_act:
            cell_in_scale = in_qs[names['c_state']].scale / int_scale
            cell_out_scale = int2_scale / in_qs[names['c_state']].scale
            state_out_scale = int3_scale / i_state_scale
        else:
            cell_in_scale = in_qs[
                names['c_state']].scale * out_tanh_sig_scale / int_scale
            cell_out_scale = int_scale / in_qs[names['c_state']].scale
            state_out_scale = out_tanh_sig_scale / i_state_scale

        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
        # TODO - Check cell clip here
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=cell_out_scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
            scale=state_out_scale)
        # set internal scale
        scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
        # set biases to output of perceptron
        for gate in ['i', 'o', 'c', 'f']:
            cls.rescale_constant(input_nodes["%s_b" % gate],
                                 pscales[gate],
                                 qrecs,
                                 dtype=np.int32)
        return MultScalableLstmQuantizationRecord(
            in_qs=in_qs,
            out_qs=[o_q],
            **scale_qtypes,
        )
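In the soft-activation (not params.hard_act) branch above, the LUT activations consume Q12 inputs and emit Q15 outputs; the cell scale chain just moves values between those conventions and the cell-state scale. The arithmetic, spelled out with assumed example scales:

    act_in_scale = pow(2, -12)    # Q12 expected at the activation LUT input
    act_out_scale = pow(2, -15)   # Q15 produced by the activation LUT
    c_state_scale = 0.001         # assumed cell-state scale
    i_state_scale = 0.008         # assumed i_state scale

    # gate * cell product, renormalized to the Q12 LUT input
    cell_in_scale = c_state_scale * act_out_scale / act_in_scale
    # cell update written back: Q12 value mapped onto the cell-state scale
    cell_out_scale = act_in_scale / c_state_scale
    # hidden state: Q15 tanh/sigmoid product rescaled to the i_state scale
    state_out_scale = act_out_scale / i_state_scale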
Example 6: lazily-created scale_ao_q property
    @property
    def scale_ao_q(self):
        mul_biases_q = self._info.get('scale_ao_q')
        if mul_biases_q is None:
            mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
            self.scale_ao_q = mul_biases_q  # stored via the setter below
        return mul_biases_q

    @scale_ao_q.setter
    def scale_ao_q(self, val):
        self._info['scale_ao_q'] = val
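For illustration, the caching behaviour can be checked in isolation with a stub standing in for MultMulBiasScaleQType (everything below is a sketch, not library code):

    class _StubQType:
        pass

    class _Record:
        def __init__(self):
            self._info = {}

        @property
        def scale_ao_q(self):
            val = self._info.get('scale_ao_q')
            if val is None:
                val = _StubQType()
                self.scale_ao_q = val  # routes through the setter
            return val

        @scale_ao_q.setter
        def scale_ao_q(self, val):
            self._info['scale_ao_q'] = val

    r = _Record()
    assert r.scale_ao_q is r.scale_ao_q  # created once, then cached in _info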
Example 7: _quantize for RNN nodes (extended and autotiler modes)
    def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
        opts = kwargs['opts']
        qrecs = kwargs['qrecs']
        G = kwargs['G']

        o_q = SymmetricMultQType.from_min_max(
            min_val=stats['range_out'][0]['min'],
            max_val=stats['range_out'][0]['max'],
            dtype=out_dtype)
        input_nodes = {
            RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
            for edge in G.in_edges(params.name)
            if isinstance(edge.from_node, ConstantInputParameters)
        }
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        mode = cls.get_options(params, opts)['mode']
        if mode == "extended":
            in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
            state_w_scale = in_qs[names['r_2_i_w']].scale
            i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale / state_w_scale)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1 / o_q.scale)
            cls.rescale_constant(input_nodes['i_b'],
                                 state_w_scale,
                                 qrecs,
                                 dtype=np.int32)
            return MultScalableRnnQuantizationRecord(in_qs=in_qs,
                                                     out_qs=[o_q],
                                                     i_2_a_q=i_2_a_q,
                                                     s_2_s_q=s_2_s_q,
                                                     s_2_o_q=s_2_o_q)
        elif mode == 'autotiler':
            if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
                LOG.info(
                    "node %s has similar input and i_state scales --> "
                    "will be generated the same_scale kernel with better performances",
                    params.name)
                params.rnn_same_inout_scale = True
                G.node_options[NodeId(params)] = params.at_options

            w_scales = np.maximum(in_qs[names['i_2_i_w']].scale,
                                  in_qs[names['r_2_i_w']].scale)
            if params.rnn_same_inout_scale:
                in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
                in_qs[0].scale = in_and_state_scale
                o_q.scale = in_and_state_scale
                cls.rescale_constant(input_nodes['i_state'],
                                     in_and_state_scale, qrecs)
                i_state_scale = in_and_state_scale
                i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
            else:
                i_state_scale = in_qs[names['i_state']].scale
                i_2_a_q = MultMulBiasScaleQType(scale=in_qs[0].scale /
                                                i_state_scale)

            cls.rescale_constant(input_nodes['i_2_i_w'], w_scales, qrecs)
            cls.rescale_constant(input_nodes['r_2_i_w'], w_scales, qrecs)
            state_w_scale = i_state_scale * w_scales
            cls.rescale_constant(input_nodes['i_b'],
                                 state_w_scale,
                                 qrecs,
                                 dtype=np.int32)
            if params.hard_act:
                s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale /
                                                i_state_scale)
                s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
            else:
                act_input_scale = math.pow(2, -12)
                act_output_scale = math.pow(2, -15)
                s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale /
                                                act_input_scale)
                s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale /
                                                o_q.scale)
            return MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                s_2_s_q=s_2_s_q,
                i_2_a_q=i_2_a_q,
                s_2_o_q=s_2_o_q,
            )
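As a closing note, every MultMulBiasScaleQType(scale=a / b) in these examples performs the same basic move: it takes integers quantized at scale a to integers quantized at scale b. A numeric sketch of that identity (scales made up):

    import numpy as np

    def requantize(q_vals, from_scale, to_scale):
        # multiplying by from_scale/to_scale moves values between quantization grids
        return np.round(q_vals * (from_scale / to_scale)).astype(np.int32)

    real = np.array([0.8, -0.32])
    q_in = np.round(real / 0.004).astype(np.int32)  # quantized at scale 0.004
    q_out = requantize(q_in, 0.004, 0.015)          # re-expressed at scale 0.015
    assert np.allclose(q_out * 0.015, real, atol=0.015)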