def load_filter_parameters(cls, G, params, input_tensors, output_tensors, opts, converted_to_conv=False):
    """Load a filter node's weights/biases and (optionally) its quantization from tflite tensors.

    Mutates `params` (weights/biases) and, when quantization is loaded,
    registers a MultScalableFilterQuantizationRecord in `G.quantization`.

    Args:
        G: the graph; its `quantization` mapping is written under NodeId(params).
        params: the filter node parameters being populated.
        input_tensors: node input tensors; assumes [1] is the filter and
            [2] the bias — TODO confirm against the tflite operator layout.
        output_tensors: node output tensors; [0].qtype becomes the output qtype.
        opts: option dict; honours 'load_tensors' and 'load_quantization'.
        converted_to_conv: when True the weights were originally a depthwise
            filter and are transposed with cls.TF_LITE_DW_FILTER_TRANSPOSE.

    Raises:
        NoQuantizationError: if quantization loading is requested but the
            input tensor carries no qtype.
    """
    if opts.get('load_tensors') or opts.get('load_quantization'):
        params.weights = input_tensors[1].value
        if converted_to_conv:
            # reorder depthwise filter axes to the conv layout expected downstream
            params.weights = params.weights.transpose(cls.TF_LITE_DW_FILTER_TRANSPOSE)
        if params.has_bias:
            params.biases = input_tensors[2].value
        if opts.get('load_quantization'):
            if input_tensors[0].qtype is None:
                raise NoQuantizationError("quantization not present in tflite file")
            # per-channel scales and weight min/max come from the helper;
            # it may also adjust params' tensors (presumably symmetrizing) — see its definition
            weights_scales, biases_scales, w_mins, w_maxes = cls.fix_weights_and_biases(
                params, input_tensors, opts)
            biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=biases_scales.flatten())
            weights_q = SymmetricMultQType(
                dtype=np.int8, narrow_range=True, scale=weights_scales.flatten(),
                min_val=w_mins, max_val=w_maxes)
            in_q = input_tensors[0].qtype
            out_q = output_tensors[0].qtype
            # combined multiplicative bias scale derived from in/weights/out scales
            mulbiases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, out_q, params)
            qrec = MultScalableFilterQuantizationRecord(in_qs=[in_q],
                                                        out_qs=[out_q],
                                                        mul_biases_q=mulbiases_q,
                                                        weights_q=weights_q,
                                                        biases_q=biases_q)
            G.quantization[NodeId(params)] = qrec
def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
    """Build the multiplicative-scaling quantization record for *node*.

    The output qtype is chosen from the node kind (pass-through, fixed
    softmax Q15, or derived from activation statistics), then the matching
    record class is selected and returned.

    Args:
        G: unused; kept for interface compatibility.
        node: the graph node being quantized.
        astats: activation statistics with 'min'/'max' keys.
        in_qs: input quantization types.
        dtype: weights dtype.
        out_dtype: output dtype; defaults to `dtype`.

    Returns:
        A quantization record appropriate for the node type.
    """
    del G
    if out_dtype is None:
        out_dtype = dtype

    # --- choose the output qtype first ---
    if isinstance(node, (PoolingParameters, OutputParameters)):
        out_q = in_qs[0]
    elif isinstance(node, SoftMaxParameters):
        # softmax output is fixed-point Q15 in int16
        out_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    else:
        out_q = SymmetricMultQType.from_min_max(min_val=astats['min'],
                                                max_val=astats['max'],
                                                dtype=out_dtype)

    # --- then dispatch on node kind to the record class ---
    if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
        return MultAddQuantizationRecord(in_qs=in_qs, out_qs=[out_q])

    if isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters,
                         GlobalPoolParameters)):
        return MultQuantizationRecord(in_qs=in_qs, out_qs=[out_q])

    if isinstance(node, ConstantInputParameters):
        return MultConstantQuantizationRecord(out_qs=[out_q],
                                              constants_are_quantized=False)

    if isinstance(node, (FcParameters, Conv2DParameters)):
        # filters carry weight, bias and mul-bias qtypes
        wqtype = SymmetricMultQType.from_array(
            arr=node.weights,
            quantized_dimension=self.get_quantized_dimension(node),
            dtype=dtype,
            narrow_range=self._narrow_weights)
        if node.has_bias:
            bias_scale = wqtype.scale * in_qs[0].scale
        else:
            # no bias: unit scale placeholder
            bias_scale = np.array([1], dtype=np.int32)
        bqtype = SymmetricMultBiasesQType(dtype=np.int32, scale=bias_scale)
        mbqtype = MultMulBiasScaleQType.from_filter(in_qs[0], wqtype, out_q, node)
        record = MultScalableFilterQuantizationRecord(
            in_qs=[in_qs[0]],
            out_qs=[out_q],
            weights_q=wqtype,
            biases_q=bqtype,
            mul_biases_q=mbqtype,
            constants_are_quantized=False)
        LOG.debug("filter %s qrec %s", node.name, record)
        return record

    # default: plain multiplicative record
    return MultQuantizationRecord(in_qs=in_qs, out_qs=[out_q])
def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
    """Build the multiplicative-scaling quantization record for *node*.

    First selects the output qtype (pass-through, fixed softmax Q15, or
    derived from `astats['range_out']`), then constructs the record class
    matching the node kind. RNN/LSTM branches also rescale constant input
    nodes in place (weights, biases, states) via `self.rescale_constant`.

    NOTE(review): this method mutates `in_qs` entries and `o_q` scales for
    recurrent nodes — callers must not assume `in_qs` is left untouched.

    Args:
        G: the graph; used to find constant input edges of RNN/LSTM nodes.
        node: the graph node being quantized.
        astats: activation statistics ('range_out', and 'range_cell' for LSTM).
        in_qs: input quantization types (mutated for RNN/LSTM nodes).
        dtype: weights dtype.
        out_dtype: output dtype; defaults to `dtype`.

    Returns:
        A quantization record appropriate for the node type.
    """
    if out_dtype is None:
        out_dtype = dtype
    if isinstance(node, (PoolingParameters, OutputParameters, SplitParameters)):
        o_q = in_qs[0]
    elif isinstance(node, SoftMaxParameters):
        # softmax output is fixed-point Q15 in int16
        o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    else:
        o_q = SymmetricMultQType.from_min_max(min_val=astats['range_out'][0]['min'],
                                              max_val=astats['range_out'][0]['max'],
                                              dtype=out_dtype)
    if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
        qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, ExpressionFusionParameters):
        # one output qtype per recorded output range
        o_qs = [SymmetricMultQType.from_min_max(min_val=orange['min'],
                                                max_val=orange['max'],
                                                dtype=out_dtype)
                for orange in astats['range_out']]
        # order fusion inputs/outputs by their declared index
        fusion_inputs = sorted([n for n in node.subgraph.inputs()
                                if isinstance(n, FusionInputParameters)],
                               key=lambda x: x.idx)
        fusion_outputs = sorted([n for n in node.subgraph.outputs()
                                 if isinstance(n, FusionOutputParameters)],
                                key=lambda x: x.idx)
        # map each fusion boundary node to its scale for decomposition
        node_scale_map = {fnode: in_qs[idx].scale
                          for idx, fnode in enumerate(fusion_inputs)}
        for idx, fnode in enumerate(fusion_outputs):
            node_scale_map[fnode] = o_qs[idx].scale
        inp, outp, expr = node.decompose(node_scale_map=node_scale_map)
        qrec = MultExpressionQuantizationRecord(in_qs=in_qs,
                                                out_qs=o_qs,
                                                inputs=inp,
                                                output_exprs=outp,
                                                intermediate_exprs=expr)
    elif isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters,
                           GlobalPoolParameters)):
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, SplitParameters):
        # every split output carries the (pass-through) input qtype
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]*node.num_splits)
    elif isinstance(node, ConstantInputParameters):
        if node.value_quantization:
            qrec = MultConstantQuantizationRecord(out_qs=[node.value_quantization],
                                                  constants_are_quantized=True)
        else:
            qrec = MultConstantQuantizationRecord(out_qs=[o_q],
                                                  constants_are_quantized=False)
    elif isinstance(node, (FcParameters, Conv2DParameters)):
        weights_q = SymmetricMultQType.from_array(arr=node.weights,
                                                  quantized_dimension=self.get_quantized_dimension(
                                                      node),
                                                  dtype=dtype, narrow_range=self._narrow_weights)
        if node.has_bias:
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
        else:
            # no bias: unit scale placeholder
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=np.array([1], dtype=np.int32))
        mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
        qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                    out_qs=[o_q],
                                                    weights_q=weights_q,
                                                    biases_q=biases_q,
                                                    mul_biases_q=mul_biases_q,
                                                    constants_are_quantized=False)
        LOG.debug("filter %s qrec %s", node.name, qrec)
    elif isinstance(node, RNNParameters):
        # map constant input nodes (weights/biases/state) by their input name
        input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        opts = self.get_options(node)
        if opts['mode'] == "extended":
            in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
            state_w_scale = in_qs[names['r_2_i_w']].scale
            # input-to-activation, state-to-state and state-to-output rescalers
            i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale/state_w_scale)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1/o_q.scale)
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                i_2_a_q=i_2_a_q,
                s_2_s_q=s_2_s_q,
                s_2_o_q=s_2_o_q
            )
        elif opts['mode'] == 'autotiler':
            # autotiler kernel requires input and state to share one scale
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_and_state_w_scale = np.maximum(
                in_qs[names['i_2_i_w']].scale, in_qs[names['r_2_i_w']].scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
            self.rescale_constant(input_nodes['i_2_i_w'], in_and_state_w_scale)
            self.rescale_constant(input_nodes['r_2_i_w'], in_and_state_w_scale)
            state_w_scale = in_and_state_scale * in_and_state_w_scale
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale/in_and_state_scale)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                s_2_s_q=s_2_s_q,
            )
        # NOTE(review): an unrecognised mode falls through with qrec unbound
        # and would raise UnboundLocalError at the final return — presumably
        # modes are validated upstream; confirm.
    elif isinstance(node, LSTMParameters):
        input_nodes = {LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
        # cell range: explicit clip wins over observed statistics
        if node.cell_clip:
            cell_max = node.cell_clip
        else:
            cell_max = max(abs(astats['range_cell'][var]) for var in ['min', 'max'])
        cell_int_bits = calc_bits(cell_max)
        in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
        LOG.debug("cell bits %d max %d cell range %d",
                  cell_int_bits, cell_max, in_qs[names['c_state']].range)
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((32-cell_int_bits)//2, 10)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        in_and_state_scale = np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
        in_qs[0].scale = in_and_state_scale
        o_q.scale = in_and_state_scale
        self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
        # per-gate (input-weight, recurrent-weight) tensor name pairs
        scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                       for chan in ['i', 'o', 'c', 'f']}
        # both weights of a gate share the larger of the two scales
        scales = {k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                  for k, (namei, namer) in scale_pairs.items()}
        for k, (namei, namer) in scale_pairs.items():
            self.rescale_constant(input_nodes[namei], scales[k])
            self.rescale_constant(input_nodes[namer], scales[k])
        int_scale = pow(2, -int_q)
        int2_scale = pow(2, -(int_q*2))
        int3_scale = pow(2, -(int_q*3))
        # compute scales for perceptrons
        pscales = {k: scales[k] * in_and_state_scale for k in ['i', 'o', 'c', 'f']}
        scale_qtypes = {"r_2_%s_q" % k: MultMulBiasScaleQType(
            scale=pscale/int_scale) for k, pscale in pscales.items()}
        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(
            scale=in_qs[names['c_state']].scale/int_scale)
        # TODO - Check cell clip here
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=int2_scale/in_qs[names['c_state']].scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=int3_scale/in_and_state_scale)
        # set internal scale
        scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
        # set biases to output of perceptron
        for k in ['i', 'o', 'c', 'f']:
            self.rescale_constant(input_nodes["%s_b" % k], pscales[k], dtype=np.int32)
        qrec = MultScalableLstmQuantizationRecord(
            in_qs=in_qs,
            out_qs=[o_q],
            **scale_qtypes,
        )
    else:
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    return qrec
def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
    """Build the GRU quantization record and rescale its constant inputs.

    Chooses per-gate weight scales depending on whether input and state
    share a scale (`params.rnn_same_inout_scale`), sets the internal
    accumulator qtype (Q15 for hard activations, Q12 for LUT activations),
    and rescales all constant weight/bias/state nodes in place.

    NOTE(review): mutates `in_qs[0].scale` and the constant input nodes via
    `cls.rescale_constant` — this is not a pure function.

    Args:
        params: the GRU node parameters.
        in_qs: input qtypes, indexed via GRUParameters.INPUT_NAMES.
        out_dtype: dtype for the output qtype.
        stats: activation statistics with 'range_out'.
        **kwargs: must carry 'qrecs' (records map) and 'G' (the graph).

    Returns:
        MultScalableGRUQuantizationRecord for the node.
    """
    qrecs = kwargs['qrecs']
    G = kwargs['G']
    o_q = SymmetricMultQType.from_min_max(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=out_dtype)
    # map constant input nodes (weights/biases/state) by their input name
    input_nodes = {
        GRUParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        for edge in G.in_edges(params.name)
        if isinstance(edge.from_node, ConstantInputParameters)
    }
    names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
    # quantization_mode: extended, autotiler
    # state_width: 16bit or 8bit
    # if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
    #     LOG.info(
    #         "node %s has similar input and i_state scales --> "
    #         "will be generated the same_scale kernel with better performance", params.name)
    #     params.rnn_same_inout_scale = True
    #     G.node_options[NodeId(params)] = params.at_options
    if params.rnn_same_inout_scale:
        # input and recurrent weights of each gate share the larger scale;
        # input and state are both forced to Q7 int8
        wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                           in_qs[names['r_2_z_w']].scale)
        wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                           in_qs[names['r_2_r_w']].scale)
        wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                           in_qs[names['r_2_h_w']].scale)
        # no input->recurrent rescale needed when scales already match
        i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
        in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        in_scale = state_scale = in_q.scale
    else:
        # keep each weight tensor's own scale; state is fixed Q7 int8
        wWz_scale = in_qs[names['w_2_z_w']].scale
        wWr_scale = in_qs[names['w_2_r_w']].scale
        wWh_scale = in_qs[names['w_2_h_w']].scale
        rWz_scale = in_qs[names['r_2_z_w']].scale
        rWr_scale = in_qs[names['r_2_r_w']].scale
        rWh_scale = in_qs[names['r_2_h_w']].scale
        in_scale = in_qs[0].scale
        in_q = in_qs[0]
        state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        state_scale = state_q.scale
        # rescale input-side perceptron output into the recurrent-side scale
        i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) / (rWz_scale * state_scale))
        i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) / (rWr_scale * state_scale))
        i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) / (rWh_scale * state_scale))
    if params.hard_act:
        # hard activations use a Q15 internal accumulator
        i_qtype = QType(bits=32, q=15, signed=True, dtype=np.int32)
        h_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWh_scale * state_scale) / i_qtype.scale)
        r_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWr_scale * state_scale) / i_qtype.scale)
        z_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWz_scale * state_scale) / i_qtype.scale)
    else:
        # LUT activations use a Q12 internal accumulator
        i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
        h_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWh_scale * state_scale) / (math.pow(2, -12)))
        r_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWr_scale * state_scale) / (math.pow(2, -12)))
        z_WR_2_int_q = MultMulBiasScaleQType(
            scale=(rWz_scale * state_scale) / (math.pow(2, -12)))
    cls.rescale_constant(input_nodes['h_state'], state_q.scale, qrecs)
    in_qs[0].scale = in_scale
    o_q.scale = state_scale
    # biases are rescaled to the corresponding perceptron output scale
    cls.rescale_constant(input_nodes['w_z_b'], in_scale * wWz_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['w_r_b'], in_scale * wWr_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['w_h_b'], in_scale * wWh_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['r_z_b'], state_scale * rWz_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['r_r_b'], state_scale * rWr_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['r_h_b'], state_scale * rWh_scale, qrecs, dtype=BIAS_DTYPE)
    cls.rescale_constant(input_nodes['w_2_z_w'], wWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
    cls.rescale_constant(input_nodes['w_2_r_w'], wWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
    cls.rescale_constant(input_nodes['w_2_h_w'], wWh_scale, qrecs, dtype=WEIGHTS_DTYPE)
    cls.rescale_constant(input_nodes['r_2_z_w'], rWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
    cls.rescale_constant(input_nodes['r_2_r_w'], rWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
    cls.rescale_constant(input_nodes['r_2_h_w'], rWh_scale, qrecs, dtype=WEIGHTS_DTYPE)
    return MultScalableGRUQuantizationRecord(in_qs=in_qs,
                                             out_qs=[o_q],
                                             i_2_z_WR_q=i_2_z_WR_q,
                                             i_2_r_WR_q=i_2_r_WR_q,
                                             i_2_h_WR_q=i_2_h_WR_q,
                                             h_WR_2_int_q=h_WR_2_int_q,
                                             r_WR_2_int_q=r_WR_2_int_q,
                                             z_WR_2_int_q=z_WR_2_int_q,
                                             i_qtype=i_qtype)
def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
    """Build the LSTM quantization record and rescale its constant inputs.

    Derives the cell-state scale from cell clip/statistics, picks the
    internal accumulator Q (bounded by cell bits for hard activations,
    fixed Q12 for LUT activations), harmonizes input/state scales when
    possible, and rescales all constant gate weight/bias nodes in place.

    NOTE(review): mutates `in_qs` scales, `params.rnn_same_inout_scale` and
    `G.node_options` — this is not a pure function.

    Args:
        params: the LSTM node parameters.
        in_qs: input qtypes, indexed via LSTMParameters.INPUT_NAMES.
        out_dtype: dtype for the output qtype.
        stats: activation statistics with 'range_out' and 'range_cell'.
        **kwargs: must carry 'qrecs' (records map) and 'G' (the graph).

    Returns:
        MultScalableLstmQuantizationRecord for the node.
    """
    qrecs = kwargs['qrecs']
    G = kwargs['G']
    o_q = SymmetricMultQType.from_min_max(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=out_dtype)
    # map constant input nodes (weights/biases/states) by their input name
    input_nodes = {
        LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        for edge in G.in_edges(params.name)
        if isinstance(edge.from_node, ConstantInputParameters)
    }
    names = {
        val: idx
        for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
    }
    # cell range: explicit clip wins over observed statistics
    if params.cell_clip:
        cell_max = params.cell_clip
    else:
        cell_max = max(
            abs(stats['range_cell'][var]) for var in ['min', 'max'])
    cell_int_bits = calc_bits(cell_max)
    in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)
    int2_scale = int3_scale = out_tanh_sig_scale = None
    if params.hard_act:
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((16 - cell_int_bits), 10)
        int2_scale = math.pow(2, -(int_q * 2))
        int3_scale = math.pow(2, -(int_q * 3))
    else:
        int_q = 12
        out_tanh_sig_scale = math.pow(
            2, -15)  # output of LUT activations are always Q15
    int_scale = math.pow(2, -int_q)
    if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performances",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options
    if params.rnn_same_inout_scale:
        if not np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.warning(
                "node %s has different input and i_state scales consider using the "
                "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                params.name)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        i_state_scale = in_scale = np.maximum(
            in_and_out_scale, in_qs[names['i_state']].scale)
        in_qs[0].scale = in_scale
        o_q.scale = in_scale
        cls.rescale_constant(input_nodes['i_state'], i_state_scale, qrecs)
    else:
        in_scale = in_qs[0].scale
        i_state_scale = np.maximum(o_q.scale, in_qs[names['i_state']].scale)
        o_q.scale = i_state_scale
    # per-gate (input-weight, recurrent-weight) tensor name pairs
    scale_pairs = {
        chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
        for chan in ['i', 'o', 'c', 'f']
    }
    # both weights of a gate share the larger of the two scales
    scales = {
        k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
        for k, (namei, namer) in scale_pairs.items()
    }
    for k, (namei, namer) in scale_pairs.items():
        cls.rescale_constant(input_nodes[namei], scales[k], qrecs)
        cls.rescale_constant(input_nodes[namer], scales[k], qrecs)
    # compute scales for perceptrons
    pscales = {k: scales[k] * i_state_scale for k in ['i', 'o', 'c', 'f']}
    scale_qtypes = {
        "r_2_%s_q" % k: MultMulBiasScaleQType(scale=pscale / int_scale)
        for k, pscale in pscales.items()
    }
    # if input and i_state have different scales -> scale the inputs before sum
    # otherwise do nothing and these scales will be ignored
    scale_qtypes.update({
        "i_2_%s_q" % k: MultMulBiasScaleQType(scale=in_scale / i_state_scale)
        for k in ['i', 'o', 'c', 'f']
    })
    if params.hard_act:
        cell_in_scale = in_qs[names['c_state']].scale / int_scale
        cell_out_scale = int2_scale / in_qs[names['c_state']].scale
        state_out_scale = int3_scale / i_state_scale
    else:
        cell_in_scale = in_qs[
            names['c_state']].scale * out_tanh_sig_scale / int_scale
        cell_out_scale = int_scale / in_qs[names['c_state']].scale
        state_out_scale = out_tanh_sig_scale / i_state_scale
    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # TODO - Check cell clip here
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
        scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
        scale=state_out_scale)
    # set internal scale
    scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
    # set biases to output of perceptron
    for gate in ['i', 'o', 'c', 'f']:
        cls.rescale_constant(input_nodes["%s_b" % gate],
                             pscales[gate],
                             qrecs,
                             dtype=np.int32)
    return MultScalableLstmQuantizationRecord(
        in_qs=in_qs,
        out_qs=[o_q],
        **scale_qtypes,
    )
def scale_ao_q(self):
    """Return the 'scale_ao_q' mul-bias qtype, creating a default lazily.

    When no value is stored in self._info yet, a uint8
    MultMulBiasScaleQType is built and cached through the
    `scale_ao_q` attribute (presumably a property setter writing
    back into self._info — confirm against the class definition).
    """
    cached = self._info.get('scale_ao_q')
    if cached is not None:
        return cached
    default_q = MultMulBiasScaleQType(dtype=np.uint8)
    self.scale_ao_q = default_q
    return default_q
def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
    """Build the RNN quantization record and rescale its constant inputs.

    Two modes (from node options): 'extended' keeps independent scales and
    emits input/state/output rescalers; 'autotiler' harmonizes input and
    state scales (and weight scales) to fit the autotiler kernel.

    NOTE(review): mutates `in_qs[0].scale`, `params.rnn_same_inout_scale`,
    `G.node_options` and the constant input nodes — not a pure function.

    Args:
        params: the RNN node parameters.
        in_qs: input qtypes, indexed via RNNParameters.INPUT_NAMES.
        out_dtype: dtype for the output qtype.
        stats: activation statistics with 'range_out'.
        **kwargs: must carry 'opts', 'qrecs' and 'G'.

    Returns:
        MultScalableRnnQuantizationRecord for the node.
        NOTE(review): an unrecognised mode returns None implicitly — presumably
        modes are validated upstream; confirm.
    """
    opts = kwargs['opts']
    qrecs = kwargs['qrecs']
    G = kwargs['G']
    o_q = SymmetricMultQType.from_min_max(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=out_dtype)
    # map constant input nodes (weights/biases/state) by their input name
    input_nodes = {
        RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        for edge in G.in_edges(params.name)
        if isinstance(edge.from_node, ConstantInputParameters)
    }
    names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
    # quantization_mode: extended, autotiler
    # state_width: 16bit or 8bit
    mode = cls.get_options(params, opts)['mode']
    if mode == "extended":
        in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
        state_w_scale = in_qs[names['r_2_i_w']].scale
        # input-to-activation, state-to-state and state-to-output rescalers
        i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale / state_w_scale)
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=1 / o_q.scale)
        cls.rescale_constant(input_nodes['i_b'],
                             state_w_scale,
                             qrecs,
                             dtype=np.int32)
        return MultScalableRnnQuantizationRecord(in_qs=in_qs,
                                                 out_qs=[o_q],
                                                 i_2_a_q=i_2_a_q,
                                                 s_2_s_q=s_2_s_q,
                                                 s_2_o_q=s_2_o_q)
    elif mode == 'autotiler':
        if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.info(
                "node %s has similar input and i_state scales --> "
                "will be generated the same_scale kernel with better performances",
                params.name)
            params.rnn_same_inout_scale = True
            G.node_options[NodeId(params)] = params.at_options
        # input and recurrent weights share the larger scale
        w_scales = np.maximum(in_qs[names['i_2_i_w']].scale,
                              in_qs[names['r_2_i_w']].scale)
        if params.rnn_same_inout_scale:
            # force input, output and state onto one common scale
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            cls.rescale_constant(input_nodes['i_state'],
                                 in_and_state_scale, qrecs)
            i_state_scale = in_and_state_scale
            i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        else:
            i_state_scale = in_qs[names['i_state']].scale
            i_2_a_q = MultMulBiasScaleQType(scale=in_qs[0].scale / i_state_scale)
        cls.rescale_constant(input_nodes['i_2_i_w'],
                             w_scales, qrecs)
        cls.rescale_constant(input_nodes['r_2_i_w'],
                             w_scales, qrecs)
        state_w_scale = i_state_scale * w_scales
        cls.rescale_constant(input_nodes['i_b'], state_w_scale,
                             qrecs, dtype=np.int32)
        if params.hard_act:
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / i_state_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        else:
            # LUT activations: Q12 input, Q15 output
            act_input_scale = math.pow(2, -12)
            act_output_scale = math.pow(2, -15)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / act_input_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale / o_q.scale)
        return MultScalableRnnQuantizationRecord(
            in_qs=in_qs,
            out_qs=[o_q],
            s_2_s_q=s_2_s_q,
            i_2_a_q=i_2_a_q,
            s_2_o_q=s_2_o_q,
        )