def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Elementwise piecewise op (looked up from PIECEWISE_OPS) on POW2
    symmetric-quantized inputs, reducing the 32-bit result to the output Q.

    params: node parameters; its class selects the op.
    in_tensors: raw input tensors (quantized ints).
    qrec: POW2 quantization record providing in/out QTypes.
    Returns: list with the single output tensor in the output QType.
    """
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")
    func = PIECEWISE_OPS[params.__class__]
    op = func['op']
    if func['is_mult']:
        # Multiply: widen to int32; the raw product is in Q(in0.q + in1.q).
        i1 = in_tensors[0].astype(np.int32)
        i2 = in_tensors[1].astype(np.int32)
        res = op(i1, i2, np.int32)
        q_calc = QType.Pow2(
            bits=32, q=qrec.in_qs[0].q+qrec.in_qs[1].q, signed=True)
        # Reduce from the calculation Q down to the output QType.
        res = qrec.out_qs[0].reduce_from(res, q_calc)
    else:
        # Add/sub: shift the higher-Q input down so both operands share
        # the smaller of the two input Qs before applying the op.
        off_in = abs(qrec.in_qs[0].q - qrec.in_qs[1].q)
        if qrec.in_qs[0].q > qrec.in_qs[1].q:
            i1 = at_norm(in_tensors[0].astype(np.int32), off_in)
            i2 = in_tensors[1].astype(np.int32)
        else:
            i1 = in_tensors[0].astype(np.int32)
            i2 = at_norm(in_tensors[1].astype(np.int32), off_in)
        res = op(i1, i2, None)
        q_calc = QType.Pow2(bits=32,
                            q=min(qrec.in_qs[0].q, qrec.in_qs[1].q),
                            signed=True)
        res = qrec.out_qs[0].reduce_from(res, q_calc)
    return qrec.get_outputs(params, [res], ktype="symmetric")
def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Quantize/convert a tensor between the input and output QTypes.

    Fast path: when scales/zero-points are power-of-two compatible and only
    the bit width differs, convert with shifts; otherwise fall back to a
    full quantize_from. With no qrec the tensor passes through unchanged.
    """
    qname = kwargs['qname']
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype=qname)
    if qrec:
        in_q = qrec.in_qs[0]
        out_q = qrec.out_qs[0]
        float_conversion = in_q.is_floating or out_q.is_floating
        bit_conversion = in_q.bits != out_q.bits
        if not float_conversion:
            same_sign = in_q.signed == out_q.signed
            if in_q.bits > out_q.bits:
                # Narrowing: scales match when the input scale, widened by
                # the dropped bits, equals the output scale.
                bit_diff = in_q.bits - out_q.bits
                same_scale = np.allclose(in_q.scale * np.power(2, bit_diff),
                                         out_q.scale, atol=0.0001)
                same_zeropoint = np.all(
                    in_q.zero_point >> bit_diff == out_q.zero_point)
            elif out_q.bits > in_q.bits:
                # Widening: symmetric check in the other direction.
                bit_diff = out_q.bits - in_q.bits
                same_scale = np.allclose(out_q.scale * np.power(2, bit_diff),
                                         in_q.scale, atol=0.0001)
                same_zeropoint = np.all(
                    in_q.zero_point == out_q.zero_point >> bit_diff)
            else:
                same_scale = np.allclose(out_q.scale, in_q.scale, atol=0.0001)
                same_zeropoint = np.all(
                    in_q.zero_point == out_q.zero_point)
            if same_scale and same_sign and bit_conversion and same_zeropoint:
                # Pure bit-width change: shift (and clip when narrowing).
                if in_q.bits > out_q.bits:
                    if in_q.signed:
                        out_tensor = out_q.clip(
                            at_norm(in_tensors[0].astype(np.int32),
                                    in_q.bits - out_q.bits))
                    else:
                        out_tensor = out_q.clip(
                            at_norm(in_tensors[0].astype(np.uint32),
                                    in_q.bits - out_q.bits))
                else:
                    out_tensor = in_tensors[0].astype(
                        out_q.dtype) << (out_q.bits - in_q.bits)
                return qrec.get_outputs(params, [out_tensor], ktype=qname)
        # in all other conversions should be numerically equivalent to this (within 1 bit)
        out_tensor = qrec.out_qs[0].quantize_from(in_tensors[0], qrec.in_qs[0])
    else:
        # No quantization record: identity.
        out_tensor = in_tensors[0]
    return qrec.get_outputs(params, [out_tensor], ktype=qname)
def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Tanh via lookup table on a POW2 symmetric-quantized tensor.

    The input is rescaled to Q12 before the LUT; the LUT output appears to
    be Q15 (it is right-shifted by 15 - out_q.q) — TODO confirm against
    tanh_lut's contract.
    """
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    in_q = qrec.in_qs[0]
    out_q = qrec.out_qs[0]
    in_tensor = in_tensor.astype(np.int32)
    # Align the input to Q12: shift up when below, normalize down when above.
    if in_q.q < 12:
        in_tensor <<= 12 - in_q.q
    elif in_q.q > 12:
        in_tensor = at_norm(in_tensor, in_q.q - 12)
    out_tensor = tanh_lut(in_tensor)
    # Reduce the LUT result to the output Q when it is below 15.
    if out_q.q < 15:
        out_tensor = at_norm(out_tensor, 15 - out_q.q)
    return qrec.get_outputs(params, [out_tensor], ktype="symmetric")
def av_global_pool(params, in_tensors, qrec: QuantizationRecordBase, details=None):
    """Global average pool over H and W for POW2 symmetric quantization.

    Dispatches to the scaled (mult) implementation for MultQuantizationRecord.
    The per-channel mean is computed as sum * ((1 << norm) // (h*w)) >> norm,
    where norm is chosen per channel from the sum's magnitude (gap_clb) to
    maximise precision without overflowing 32 bits.
    """
    if isinstance(qrec, MultQuantizationRecord):
        return av_global_pool_mult(params, in_tensors, qrec, details=details)
    # Prepare the quantization levels
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    in_dims = params.in_dims[0]
    out_dims = params.out_dims[0]
    sum_by_chan = np.sum(in_tensor, dtype=np.int32,
                         axis=(in_dims.get_order_idx('w'),
                               in_dims.get_order_idx('h')))
    # 31 - count_leading_bits(sum) leaves headroom for the multiply below.
    norm = (np.array([31], dtype=np.int32) - gap_clb(sum_by_chan)).astype(
        np.int32)
    inv_wh = (1 << norm) // (in_dims.h * in_dims.w)
    out_tensor = at_norm((inv_wh * sum_by_chan), norm)
    return qrec.get_outputs(
        params,
        [qrec.out_qs[0].clip(out_tensor).reshape(out_dims.shape)],
        ktype="symmetric")
def apply_multiplicative_bias(self, params: Conv2DParameters,
                              input_tensor: np.ndarray, axis: int,
                              ktype: str = None):
    """Apply the per-output-channel multiplicative bias along `axis`.

    For "symmetric" the biases are quantized and the product is renormalized
    by mul_biases_q.q; for "float32" the float biases are applied directly.
    Raises NotImplementedError for any other ktype. Note: operates in place
    on input_tensor via *=.
    """
    if ktype == "symmetric":
        if params.has_mul_bias:
            mul_biases = self.quantize_as(params.mul_biases, 'mul_biases_q')
            # Broadcast shape: out_c on the channel axis, 1 elsewhere (3D).
            shape = [
                params.filter.out_c if idx == axis else 1 for idx in range(3)
            ]
            input_tensor *= mul_biases.reshape(shape)
            input_tensor = at_norm(input_tensor, self.mul_biases_q.q)
        return input_tensor
    if ktype == "float32":
        if params.has_mul_bias:
            shape = [
                params.filter.out_c if idx == axis else 1 for idx in range(3)
            ]
            input_tensor *= params.mul_biases.reshape(shape)
        return input_tensor
    raise NotImplementedError()
def postprocess(img_in, h, w, c, **kwargs):
    """Post-process a loaded image: optional transpose to channel-first,
    shift/divide/offset scaling, or a user-supplied norm function.

    img_in: image array; h, w, c: height, width, channel count.
    kwargs: transpose, divisor, offset, shift, norm_func (lambda argument
    string, e.g. "x: x/128").
    Returns the processed float array (or shifted int array path result).
    """
    if kwargs.get('transpose'):
        if c == 1:
            img_in = img_in.transpose((1, 0)).reshape((c, h, w))
        else:
            img_in = img_in.transpose((2, 0, 1)).copy()
    elif c == 1:
        img_in = img_in.reshape((c, w, h))
    divisor = kwargs.get('divisor') or 1
    offset = kwargs.get('offset') or 0
    shift = kwargs.get('shift') or 0
    if shift:
        # Negative shift means normalize (right shift); positive, left shift.
        if shift < 0:
            img_in = at_norm(img_in, int(-shift))
        else:
            img_in = img_in << int(shift)
        img_in = np.array(img_in)
    norm_func = kwargs.get('norm_func')
    if norm_func:
        # BUG FIX: dict.update() returns None, so the original
        # `g_env = {}.update(np.__dict__)` passed None as eval's globals and
        # numpy names were never actually visible to the expression.
        g_env = dict(np.__dict__)
        # SECURITY NOTE: eval of a user-supplied expression — only safe with
        # trusted input.
        # pylint: disable=eval-used
        compiled_norm_func = eval('lambda ' + norm_func, g_env)
        img_in = compiled_norm_func(img_in)
        # np.float (deprecated in NumPy 1.20, removed in 1.24) is just the
        # builtin float.
        img_in = np.array(img_in, dtype=float)
    else:
        img_in = (img_in.astype(float) / divisor) + offset
    return img_in
def apply_multiplicative_bias(qrec, params: FilterParameters,
                              input_tensor: np.ndarray, axis: int,
                              ktype: str = None):
    """Apply per-channel multiplicative bias for float, scaled, or POW2
    symmetric kernels, selected by ktype and the qrec's own ktype.

    NOTE(review): when ktype matches neither branch the tensor is still
    cast to int32 on return — confirm this is intended for the 'float' ktype
    fall-through (the 'float' branch returns early, so it is unaffected).
    """
    if ktype == 'float':
        # Float path: apply the raw float biases.
        if hasattr(params, 'has_mul_bias') and params.has_mul_bias:
            shape = [
                params.filter.out_c if idx == axis else 1 for idx in range(3)
            ]
            input_tensor *= params.mul_biases.reshape(shape)
        return input_tensor
    if ktype == 'symmetric' and qrec.ktype.startswith('scaled'):
        # Scaled path: the cached QType knows how to apply its own scales.
        mul_biases_q = qrec.cache.get('mul_biases_q')
        if isinstance(mul_biases_q, MultMulBiasScaleQType):
            input_tensor = mul_biases_q.apply_scales(input_tensor, axis)
    elif ktype == 'symmetric' and qrec.ktype.startswith('symmetric'):
        # POW2 path: quantize the biases, multiply, then renormalize.
        if params.has_mul_bias:
            mul_biases_q = qrec.cache.get('mul_biases_q')
            mul_biases = mul_biases_q.quantize(params.mul_biases)
            shape = [
                params.filter.out_c if idx == axis else 1 for idx in range(3)
            ]
            input_tensor *= mul_biases.reshape(shape)
            input_tensor = at_norm(input_tensor, mul_biases_q.q)
    return input_tensor.astype(np.int32)
def apply_scales(self, arr: np.ndarray, axis: int = None):
    """Apply this QType's scale multipliers to arr.

    Performs the optional pre-normalization first, then delegates to the
    module-level apply_scales when a scale is configured; otherwise the
    (possibly pre-normalized) array is returned untouched.
    """
    if self.pre_normalization > 0:
        arr = at_norm(arr, self.pre_normalization)
    if self.has_scale:
        arr = apply_scales(self.qbiases, self.qnorms, arr,
                           axis=axis, calc_dtype=self._calc_dtype)
    return arr
def apply_scales(self, arr: np.ndarray, axis: int = None):
    """Apply the per-channel integer scale (qbiases) and right-shift
    normalization (qnorms) to arr, after any configured pre-normalization.

    With axis=None a single scalar scale is required; otherwise the scales
    are broadcast along the given axis.
    """
    if self.pre_normalization > 0:
        arr = at_norm(arr, self.pre_normalization)
    if not self.has_scale:
        return arr
    if axis is not None:
        # Shape the scale vectors for broadcasting along `axis`.
        bshape = [
            len(self.qbiases) if dim == axis else 1
            for dim in range(len(arr.shape))
        ]
        scales = self.qbiases.reshape(bshape)
        norms = self.qnorms.reshape(bshape)
    else:
        scales, norms = self.qbiases, self.qnorms
        assert len(scales) == 1 and len(
            norms) == 1, "no axis set. should have single scale"
    scaled = np.multiply(arr, scales, dtype=np.int32)
    return at_norm(scaled, norms)
def execute(cls, params, in_tensors, qrec: QuantizationRecordBase, **kwargs):
    """Elementwise piecewise op (from PIECEWISE_OPS) on POW2 symmetric
    inputs; unlike the QRec variant, the int32 result is returned without
    reduction to the output QType (the op itself receives the dtype).
    """
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")
    func = PIECEWISE_OPS[params.__class__]
    op = func['op']
    if func['is_mult']:
        # Multiply path: widen to int32 and let the op produce int32.
        i1 = in_tensors[0].astype(np.int32)
        i2 = in_tensors[1].astype(np.int32)
        res = op(i1, i2, np.int32)
    else:
        # Add/sub path: align both inputs to the smaller input Q first.
        off_in = abs(qrec.in_qs[0].q - qrec.in_qs[1].q)
        if qrec.in_qs[0].q > qrec.in_qs[1].q:
            i1 = at_norm(in_tensors[0].astype(np.int32), off_in)
            i2 = in_tensors[1].astype(np.int32)
        else:
            i1 = in_tensors[0].astype(np.int32)
            i2 = at_norm(in_tensors[1].astype(np.int32), off_in)
        res = op(i1, i2, None)
    return qrec.get_outputs(params, [res], ktype="symmetric")
def log_step(cls, params, in_data, mel_coeff_q, shift_buff, fft_out_q, shift,
             norm):
    """Fixed-point log (natural or dB) of the mel spectrogram values,
    clipped to int16 range.

    qformat tracks the Q of in_data (30 - shift_buff); the (qformat - 15)
    correction re-bases the 17.15 log output. params.log_offset handling is
    not implemented.
    """
    if params.log_offset:
        raise NotImplementedError()
    # if params.magsquared:
    #     qformat = mel_coeff_q - 2 - shift_buff + 2*fft_out_q + 2*shift
    # else:
    qformat = 30 - shift_buff
    if params.log_type == "db":
        # 10*log10(x) = 10 * ln(x) / ln(10), via the Q10 reciprocal constant.
        return np.clip(
            at_norm(
                10 * ((logn_17_15(in_data, True) * LN_10_INV_Q10 >> 10) -
                      (qformat - 15) * LOG10_2), norm),
            -(1 << 15), (1 << 15) - 1).astype(np.int16)
    # Natural log path with the equivalent Q correction in ln(2) units.
    return np.clip(
        at_norm(
            logn_17_15(in_data, True) - (qformat - 15) * LN_2_1F15, norm),
        -(1 << 15), (1 << 15) - 1).astype(np.int16)
def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Fixed-point MFCC/mel pipeline over all frames.

    Per frame: preemphasis -> optional windowing -> FFT (complex packed as
    [real, imag] rows) -> RFFT -> spectrogram -> mel filterbank, then
    depending on params.mel_type: raw melspectrogram, log-mel, or DCT-based
    MFCC. in_tensors layout (by index): 0 samples, 1 window LUT, 2 FFT
    twiddles (interleaved re/im), 3 swap table, 4 RFFT twiddles,
    5 filterbank sparsity matrix, 6 filterbank coefficients, 7 DCT matrix.
    Returns a single stacked result array.
    """
    # De-interleave twiddles into [real, imag] rows.
    fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
    swap_table = in_tensors[3]
    rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]],
                             axis=0)
    mel_filterbank_sparsity_mat = in_tensors[5]
    mel_filterbank_coeff = in_tensors[6]
    if params.n_dct:
        dct_matrix = in_tensors[7]
    result = []
    for frame_idx in range(params.n_frames):
        in_data = in_tensors[0][params.frame_step *
                                frame_idx:params.frame_step * frame_idx +
                                params.frame_size]
        # Headroom differs between radix-4 and radix-2 FFTs.
        in_data, shift = cls.preemphasis(params, in_data,
                                         12 if params.is_radix4() else 13)
        if params.win_fn:
            win_lut = in_tensors[1]
            in_data = cls.windowing(params, in_data, win_lut,
                                    qrec.in_qs[1].q)
        # Pack even/odd samples as the complex FFT input.
        in_cfft = np.stack([in_data[::2], in_data[1::2]], axis=0)
        out_cfft = cls.fft_step(params, in_cfft, fft_twiddles, swap_table)
        out_data = RFFT_Step_Fix16(out_cfft, rfft_twiddles, params.n_fft)
        out_data = out_data[0] + 1j * out_data[1]
        spectrogram = cls.spectrogram_step(params, out_data, shift,
                                           qrec.cache['fft_out_q'].q)
        melspect, shift_buff = cls.melspectrogram_step(
            params, spectrogram, mel_filterbank_sparsity_mat,
            mel_filterbank_coeff, qrec.in_qs[6].q)
        if params.mel_type == "melspectrogram":
            result.append(
                cls.norm_clip_32_melspect(params, melspect, shift_buff))
            continue
        logmelspect = cls.log_step(params, melspect, qrec.in_qs[6].q,
                                   shift_buff, qrec.cache["fft_out_q"],
                                   shift, params.quant_norm)
        if params.mel_type == "logmelspectrogram":
            result.append(logmelspect)
            continue
        if params.n_dct:
            # MFCC: DCT of the log-mel values, renormalized and clipped
            # to int16 range.
            mfcc = np.clip(at_norm(np.dot(dct_matrix, logmelspect), 14),
                           -(1 << 15), (1 << 15) - 1)
            result.append(mfcc)
    return [np.array(result)]
def average_execute(cls, params, in_tensors, qrec: QuantizationRecordBase):
    """Average pooling for POW2 symmetric quantization.

    Each window is summed, multiplied by pool_factor = 2^16 // filter_size,
    and the final tensor is normalized back down by 16 bits, i.e. an
    integer approximation of sum / filter_size.
    """
    # Prepare the quantization levels
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    in_dims = params.in_dims[0]
    out_dims = params.out_dims[0]
    filter_sz = params.filter.h * params.filter.w
    # Reciprocal of the window size in Q16.
    pool_factor = (1 << 16) // filter_sz
    out_tensor = np.zeros(out_dims.shape, dtype=np.int32)
    if params.padding.h + params.padding.w > 0:
        in_tensor = np.pad(in_tensor,
                           params.padding.numpy_pad_shape(in_dims),
                           mode='constant',
                           constant_values=qrec.in_qs[0].pad_zero_point)
        pad_w = params.padding.w
        pad_h = params.padding.h
    else:
        pad_w = pad_h = 0
    out_h = 0
    for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1,
                       params.stride.h):
        out_w = 0
        for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1,
                           params.stride.w):
            # accumulate - potentially with different Q
            out_slice_args = out_dims.srange(h=out_h, w=out_w)
            in_slice_args = in_dims.srange(
                c=[0, out_dims.c, 1],
                h=[h_idx, h_idx + params.filter.h, 1],
                w=[w_idx, w_idx + params.filter.w, 1])
            res_shape = out_tensor[out_slice_args].shape
            # NOTE(review): summing with dtype=qrec.dtype(ktype="float32")
            # inside a symmetric kernel looks odd — confirm this resolves to
            # an integer accumulator dtype as intended.
            sum_filter = np.sum(
                in_tensor[in_slice_args],
                dtype=qrec.dtype(ktype="float32"),
                axis=(out_dims.keys.index('h'),
                      out_dims.keys.index('w'))).reshape(res_shape)
            sum_filter = np.multiply(sum_filter, pool_factor)
            out_tensor[out_slice_args] = sum_filter
            out_w += 1
        out_h += 1
    return qrec.get_outputs(params, [
        qrec.out_qs[0].clip(at_norm(out_tensor, 16), qrec.out_qs[0].dtype)
    ], ktype="symmetric")
def piecewise(params, in_tensors, qrec: QuantizationRecordBase, details=None):
    """Elementwise piecewise op dispatcher for POW2 symmetric quantization;
    scaled (mult) records are routed to piecewise_mult. The int32 result is
    returned without reduction to the output QType.
    """
    if isinstance(qrec, (MultQuantizationRecord, MultAddQuantizationRecord)):
        return piecewise_mult(params, in_tensors, qrec, details=details)
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")
    func = PIECEWISE_OPS[params.__class__]
    op = func['op']
    if func['is_mult']:
        # Multiply path: widen to int32.
        i1 = in_tensors[0].astype(np.int32)
        i2 = in_tensors[1].astype(np.int32)
        res = op(i1, i2, np.int32)
    else:
        # Add/sub path: bring both operands to the smaller input Q.
        off_in = abs(qrec.in_qs[0].q - qrec.in_qs[1].q)
        if qrec.in_qs[0].q > qrec.in_qs[1].q:
            i1 = at_norm(in_tensors[0].astype(np.int32), off_in)
            i2 = in_tensors[1].astype(np.int32)
        else:
            i1 = in_tensors[0].astype(np.int32)
            i2 = at_norm(in_tensors[1].astype(np.int32), off_in)
        res = op(i1, i2, None)
    return qrec.get_outputs(params, [res], ktype="symmetric")
def execute(cls, params, in_tensors, qrec: QuantizationRecordBase, **kwargs):
    """Leaky ReLU on a symmetric-quantized tensor.

    Negative values are multiplied by the Q7 leak factor and normalized;
    positive values pass through (zero maps to zero since both masks are
    false). The result is rescaled by the record's scale multipliers and,
    when in/out QTypes differ, reduced to the output QType.
    """
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    qrec.set_scale()
    # Q7 leak factor applied to the negative side, then renormalized.
    neg_in = at_norm(in_tensor * leak_mult_gen_factor_q7(params), 7)
    in_tensor = in_tensor * (in_tensor > 0) + neg_in * (in_tensor < 0)
    in_tensor = qrec.scale_mul_biases_q.apply_scales(in_tensor)
    if qrec.out_qs[0] != qrec.in_qs[0]:
        return qrec.get_outputs(params,
                                [qrec.out_qs[0].reduce_from(in_tensor,
                                                            qrec.in_qs[0])],
                                ktype="symmetric")
    return qrec.get_outputs(params, [in_tensor], ktype="symmetric")
def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Leaky ReLU on a symmetric-quantized tensor (QRec/cache variant of
    the same kernel: scales come from qrec.cache['scale_mul_biases_q']).
    """
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    compute_in_out_scale(qrec)
    # Q7 leak factor applied to the negative side, then renormalized.
    neg_in = at_norm(in_tensor * leak_mult_gen_factor_q7(params), 7)
    in_tensor = in_tensor * (in_tensor > 0) + neg_in * (in_tensor < 0)
    scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
    in_tensor = scale_mul_biases_q.apply_scales(in_tensor)
    # Reduce to the output QType only when it differs from the input's.
    if qrec.out_qs[0] != qrec.in_qs[0]:
        return qrec.get_outputs(
            params,
            [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])],
            ktype="symmetric")
    return qrec.get_outputs(params, [in_tensor], ktype="symmetric")
def av_pool(params, in_tensors, qrec: QuantizationRecordBase, details=None): del details # Prepare the quantization levels in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] in_dims = params.in_dims[0] out_dims = params.out_dims[0] filter_sz = params.filter.h * params.filter.w pool_factor = (1 << 16) // filter_sz out_tensor = np.zeros(out_dims.shape, dtype=np.int32) if params.padding.h + params.padding.w > 0: in_tensor = np.pad(in_tensor, params.padding.numpy_pad_shape(in_dims), mode='constant', constant_values=qrec.in_qs[0].pad_zero_point) pad_w = params.padding.w pad_h = params.padding.h else: pad_w = pad_h = 0 for in_c in range(out_dims.c): out_h = 0 for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1, params.stride.h): out_w = 0 for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1, params.stride.w): # accumulate - potentially with different Q in_slice_args = in_dims.srange( c=[in_c, in_c + 1, 1], h=[h_idx, h_idx + params.filter.h, 1], w=[w_idx, w_idx + params.filter.w, 1]) sum_filter = np.sum(in_tensor[in_slice_args], dtype=np.int32) sum_filter = np.multiply(sum_filter, pool_factor, dtype=np.int32) out_tensor[out_dims.srange(c=in_c, h=out_h, w=out_w)] = sum_filter out_w += 1 out_h += 1 return qrec.get_outputs( params, [qrec.out_qs[0].clip(at_norm(out_tensor, 16), qrec.out_qs[0].dtype)], ktype="symmetric")
def apply_scales(qbiases, qnorms, arr: np.ndarray, axis: int = None):
    """Multiply arr by the integer scales (qbiases) and normalize by the
    per-channel shifts (qnorms).

    axis=None requires a single scalar scale; otherwise the scale vectors
    are reshaped to broadcast along the given axis.
    """
    if axis is not None:
        # Broadcast shape: scale length on `axis`, 1 everywhere else.
        bshape = [
            len(qbiases) if dim == axis else 1
            for dim in range(len(arr.shape))
        ]
        scales = qbiases.reshape(bshape)
        norms = qnorms.reshape(bshape)
    else:
        scales, norms = qbiases, qnorms
        assert len(scales) == 1 and len(
            norms) == 1, "no axis set. should have single scale"
    product = np.multiply(arr, scales, dtype=np.int32)
    return at_norm(product, norms)
def average_execute_mult(cls, params, in_tensors, qrec: MultQuantizationRecord):
    """Reduction mean over params.axis for scaled (mult) quantization:
    sum, divide by the reduced element count in Q7, then apply the record's
    output scale.

    NOTE(review): `(sum_by_chan << 7) / sz` is float true division followed
    by astype(np.int32) (truncation toward zero), whereas the sibling
    av_global_pool_mult uses integer `//` (floor) — confirm whether the
    difference for negative sums is intentional.
    """
    # Prepare the quantization levels
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    out_dims = params.out_dims[0]
    qrec.set_scale(in_idx=0, out_idx=0)
    sum_by_chan = np.sum(in_tensor, dtype=np.int32, axis=tuple(params.axis),
                         keepdims=params.keep_dims)
    # Number of elements reduced away.
    sz = reduce(lambda x, y: x * y, [i for idx, i in enumerate(in_tensor.shape)
                                     if idx in params.axis])
    res = at_norm(((sum_by_chan << 7) / sz).astype(np.int32), 7)
    res = out_tensor = qrec.scale_mul_biases_q.apply_scales(res)
    return qrec.get_outputs(params,
                            [out_tensor.reshape(out_dims.shape)],
                            ktype="symmetric")
def average_execute(cls, params, in_tensors, qrec: MultQuantizationRecord):
    """Reduction mean over params.axis with per-element dynamic precision:
    each sum gets a norm derived from its magnitude (gap_clb) so the
    reciprocal multiply (1 << norm) // count keeps maximum precision
    without overflowing 32 bits.
    """
    # Prepare the quantization levels
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    out_dims = params.out_dims[0]
    sum_by_chan = np.sum(in_tensor, dtype=np.int32, axis=tuple(params.axis),
                         keepdims=params.keep_dims)
    # 31 - count_leading_bits(sum) leaves headroom for the multiply below.
    norm = (np.array([31], dtype=np.int32) -
            gap_clb(sum_by_chan.flatten())).astype(np.int32)
    # Number of elements reduced away.
    sz = reduce(lambda x, y: x * y, [i for idx, i in enumerate(in_tensor.shape)
                                     if idx in params.axis])
    inv_wh = ((1 << norm) // sz).reshape(sum_by_chan.shape)
    out_tensor = at_norm((inv_wh * sum_by_chan),
                         norm.reshape(sum_by_chan.shape))
    return qrec.get_outputs(params,
                            [qrec.out_qs[0].clip(out_tensor).reshape(out_dims.shape)],
                            ktype="symmetric")
def postprocess(img_in, h, w, c, **kwargs):
    """Post-process a loaded image: optional transpose to channel-first,
    shift/divide/offset scaling or a user-supplied norm function, and an
    optional RGB888 -> RGB565 repack.

    img_in: image array; h, w, c: height, width, channel count.
    kwargs: transpose, divisor, offset, shift, norm_func (lambda argument
    string), rgb888_rgb565.
    """
    if kwargs.get('transpose'):
        if c == 1:
            img_in = img_in.transpose((1, 0)).reshape((c, h, w))
        else:
            img_in = img_in.transpose((2, 0, 1)).copy()
    elif c == 1:
        img_in = img_in.reshape((c, w, h))
    divisor = kwargs.get('divisor') or 1
    offset = kwargs.get('offset') or 0
    shift = kwargs.get('shift') or 0
    if shift:
        # Negative shift means normalize (right shift); positive, left shift.
        if shift < 0:
            img_in = at_norm(img_in, int(-shift))
        else:
            img_in = img_in << int(shift)
        img_in = np.array(img_in)
    norm_func = kwargs.get('norm_func')
    if norm_func:
        # BUG FIX: dict.update() returns None, so the original
        # `g_env = {}.update(np.__dict__)` passed None as eval's globals and
        # numpy names were never actually visible to the expression.
        g_env = dict(np.__dict__)
        # SECURITY NOTE: eval of a user-supplied expression — only safe with
        # trusted input.
        # pylint: disable=eval-used
        compiled_norm_func = eval('lambda ' + norm_func, g_env)
        img_in = compiled_norm_func(img_in)
        # np.float (deprecated in NumPy 1.20, removed in 1.24) is just the
        # builtin float.
        img_in = np.array(img_in, dtype=float)
    else:
        img_in = (img_in.astype(float) / divisor) + offset
    if kwargs.get('rgb888_rgb565'):
        # Pack 8-bit channels into 5-6-5 bits (expects channel-last layout).
        r = np.bitwise_and(img_in[:, :, 0].flatten().astype(np.int16),
                           0xf8) << 8
        g = np.bitwise_and(img_in[:, :, 1].flatten().astype(np.int16),
                           0xfc) << 3
        b = np.bitwise_and(img_in[:, :, 2].flatten().astype(np.int16),
                           0xf8) >> 3
        img_565 = r + g + b
        img_in = np.array(img_565, dtype=np.int16)
    return img_in
def execute(cls, params, in_tensors, qrec: QuantizationRecordBase, **kwargs):
    """Hard swish (x * relu6(x + 3) / 6) in POW2 symmetric fixed point.

    relu6(x+3) is computed in the input Q; the 1/6 factor is Q15, so the
    final product lives in Q(in.q + 15), which reduce_from rescales to the
    output QType.
    """
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    calc_q = QType.Pow2(bits=32, q=qrec.in_qs[0].q + 15, signed=True)
    fac_1 = qrec.in_qs[0].quantize(np.array([3.]))
    # 1/6 in Q15.
    fac_2 = (1 << 15) // 6
    upper_bound = qrec.in_qs[0].quantize([6.])
    lower_bound = qrec.in_qs[0].quantize([0.])
    in_tensor = in_tensor.astype(np.int32)
    # x * clamp(x + 3, 0, 6), renormalized back to the input Q.
    in_tensor = at_norm(np.multiply(np.minimum(np.maximum(in_tensor + fac_1,
                                                          lower_bound),
                                               upper_bound),
                                    in_tensor, dtype=np.int32),
                        qrec.in_qs[0].q)
    return qrec.get_outputs(params,
                            [qrec.out_qs[0].reduce_from(np.multiply(
                                in_tensor, fac_2, dtype=np.int32), calc_q)],
                            ktype="symmetric")
def av_global_pool_mult(params, in_tensors, qrec: MultQuantizationRecord,
                        details=None):
    """Global average pool over H and W for scaled (mult) quantization:
    sum, divide by h*w in Q7, then apply the record's in/out scale.
    """
    # Prepare the quantization levels
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    in_dims = params.in_dims[0]
    out_dims = params.out_dims[0]
    qrec.set_scale(in_idx=0, out_idx=0)
    sum_by_chan = np.sum(in_tensor, dtype=np.int32,
                         axis=(in_dims.get_order_idx('w'),
                               in_dims.get_order_idx('h')))
    # Mean in Q7 then normalized back: floor((sum << 7) / (h*w)) >> 7.
    res = at_norm((sum_by_chan << 7) // (in_dims.h * in_dims.w), 7)
    res = out_tensor = qrec.scale_mul_biases_q.apply_scales(res)
    return qrec.get_outputs(params,
                            [out_tensor.reshape(out_dims.shape)],
                            ktype="symmetric")
def hswish(params, in_tensors, qrec: QuantizationRecordBase, details=None):
    """Hard swish (x * relu6(x + 3) / 6) for symmetric quantization;
    scaled (mult) records are routed to hswish_mult.

    NOTE(review): this constructs the calc QType as `QType(bits=32, ...)`
    while the sibling kernel uses `QType.Pow2(...)` for the same purpose —
    confirm both spellings are equivalent in this codebase.
    """
    if isinstance(qrec, MultQuantizationRecord):
        return hswish_mult(params, in_tensors, qrec, details=details)
    in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    calc_q = QType(bits=32, q=qrec.in_qs[0].q + 15, signed=True)
    fac_1 = qrec.in_qs[0].quantize(np.array([3.]))
    # 1/6 in Q15.
    fac_2 = (1 << 15) // 6
    upper_bound = qrec.in_qs[0].quantize([6.])
    lower_bound = qrec.in_qs[0].quantize([0.])
    in_tensor = in_tensor.astype(np.int32)
    # x * clamp(x + 3, 0, 6), renormalized back to the input Q.
    in_tensor = at_norm(np.multiply(np.minimum(np.maximum(in_tensor + fac_1,
                                                          lower_bound),
                                               upper_bound),
                                    in_tensor, dtype=np.int32),
                        qrec.in_qs[0].q)
    return qrec.get_outputs(params,
                            [qrec.out_qs[0].reduce_from(np.multiply(
                                in_tensor, fac_2, dtype=np.int32), calc_q)],
                            ktype="symmetric")
def windowing(cls, params, in_data, win_lut, win_q):
    """Apply the window LUT pointwise to the frame and normalize the int32
    product by the window's Q (win_q)."""
    windowed = np.multiply(in_data, win_lut, dtype=np.int32)
    return at_norm(windowed, win_q)
def step_kernel(cls, params: GRUParameters, args: Mapping[str, np.ndarray],
                idx: int, input_tensor: np.ndarray, qrec):
    """One fixed-point GRU time step.

    Computes the update (z) and reset (r) gates, then the candidate state h
    (with the reset applied before or after the recurrent linear term
    depending on params.linear_before_reset), and blends the new hidden
    state Ht = (1 - zt) * ht + zt * Ht-1 in internal Q before clipping to
    the output QType. args['h_state'][0] is updated in place.

    args: named weight/bias/state tensors (each wrapped in a 1-element
    sequence). idx: current cell index; input is only consumed while
    idx < params.n_input_cells. Returns the new hidden state.
    """
    z_gate_scratch = args['w_z_b'][0]
    hr_gate_scratch = args['w_r_b'][0]
    if idx < params.n_input_cells:
        # calculate z gate on input
        z_gate_scratch += args['w_2_z_w'][0].astype(np.int32).dot(
            input_tensor[idx])
        # calculate r gate on input
        hr_gate_scratch += args['w_2_r_w'][0].astype(np.int32).dot(
            input_tensor[idx])
        # scale to recurrent * state scale if input scale is different
        if not params.rnn_same_inout_scale:
            z_gate_scratch = qrec.scale_z_input2_z_HtxW(z_gate_scratch, 0,
                                                        ktype='symmetric')
            hr_gate_scratch = qrec.scale_r_input2_r_HtxW(hr_gate_scratch, 0,
                                                         ktype='symmetric')
    # calculate z gate on recurrent
    z_gate_scratch += args['r_2_z_w'][0].astype(np.int32).dot(
        args['h_state'][0]) + args['r_z_b'][0]
    # if not hard_act then the scale will scale to Q15
    z_gate_scratch = get_activation(params.activation_zr, params.hard_act)(
        qrec.scale_z_internal(z_gate_scratch, 0, ktype='symmetric'),
        qrec.internal_qtype)
    # normalise to internal Q
    if not params.hard_act and qrec.internal_qtype.q != 15:
        z_gate_scratch = at_norm(z_gate_scratch, 15 - qrec.internal_qtype.q)
    # same as above on r gate
    hr_gate_scratch += args['r_2_r_w'][0].astype(np.int32).dot(
        args['h_state'][0]) + args['r_r_b'][0]
    hr_gate_scratch = get_activation(
        params.activation_zr,
        params.hard_act)(qrec.scale_r_internal(hr_gate_scratch, 0,
                                               ktype='symmetric'),
                         qrec.internal_qtype)
    if not params.hard_act and qrec.internal_qtype.q != 15:
        hr_gate_scratch = at_norm(hr_gate_scratch,
                                  15 - qrec.internal_qtype.q)
    if params.linear_before_reset:
        # haddamard after linear
        # r_gate_scratch = (rt (.) (Ht-1*(Rh^T) + Rbh))
        h_gate_recurrent = args['r_2_h_w'][0].astype(np.int32).dot(
            args['h_state'][0]) + args['r_h_b'][0]
        # this is int_q_scale * state_q_scale * h_recurrent_weights_scale
        hr_gate_scratch = hr_gate_scratch * h_gate_recurrent
        # normalize to state_q_scale * h_recurrent_weights_scale
        hr_gate_scratch = at_norm(hr_gate_scratch, qrec.internal_qtype.q)
        # ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh)
        # when linear_before_reset != 0
        if idx < params.n_input_cells:
            if not params.rnn_same_inout_scale:
                # scale input_scale * h_input_weights_scale to
                # state_q_scale * h_recurrent_weights_scale
                hr_gate_scratch += qrec.scale_h_input2_h_HtxW(
                    (args['w_2_h_w'][0].astype(np.int32).dot(
                        input_tensor[idx]) + args['w_h_b'][0]),
                    0, ktype='symmetric')
            else:
                # since input_scale == state scale and
                # h_input_weights_scale == h_recurrent_weights_scale
                # no scaling is necessary
                hr_gate_scratch += args['w_2_h_w'][0].astype(np.int32).dot(
                    input_tensor[idx]) + args['w_h_b'][0]
        else:
            # Is this correct if there is no input (and below)? This is not
            # a mode that exists in any framework and will not ever be used
            # at present
            if not params.rnn_same_inout_scale:
                hr_gate_scratch += qrec.scale_h_input2_h_HtxW(
                    args['w_h_b'][0], 0, ktype='symmetric')
            else:
                hr_gate_scratch += args['w_h_b'][0]
    else:
        # haddamard on state before linear
        # r_gate_scratch = (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh
        # this is int_q_scale * state_q_scale * h_recurrent_weights_scale
        # normalize to state_q_scale * h_recurrent_weights_scale
        hr_gate_scratch = at_norm(
            args['r_2_h_w'][0].astype(np.int32).dot(
                args['h_state'][0] * hr_gate_scratch),
            qrec.internal_qtype.q) + args['r_h_b'][0]
        if idx < params.n_input_cells:
            if not params.rnn_same_inout_scale:
                # scale input_scale * h_input_weights_scale to
                # state_q_scale * h_recurrent_weights_scale
                # NOTE(review): this branch calls scale_h_input_2_h_HtxW
                # (extra underscore) while the parallel branches call
                # scale_h_input2_h_HtxW, and the matmul here lacks the
                # .astype(np.int32) used everywhere else — confirm both are
                # intentional.
                hr_gate_scratch += qrec.scale_h_input_2_h_HtxW(
                    args['w_2_h_w'][0].dot(input_tensor[idx]) +
                    args['w_h_b'][0], 0, ktype='symmetric')
            else:
                hr_gate_scratch += args['w_2_h_w'][0].astype(np.int32).dot(
                    input_tensor[idx]) + args['w_h_b'][0]
        else:
            if not params.rnn_same_inout_scale:
                hr_gate_scratch += qrec.scale_h_input2_h_HtxW(
                    args['w_h_b'][0], 0, ktype='symmetric')
            else:
                hr_gate_scratch += args['w_h_b'][0]
    # scale to q15 or internal Q depending on activation type
    hr_gate_scratch = get_activation(params.activation, params.hard_act)(
        qrec.scale_h_internal(hr_gate_scratch, 0, ktype='symmetric'),
        qrec.internal_qtype)
    # if not hard then go from Q15 -> int_q
    if not params.hard_act and qrec.internal_qtype.q != 15:
        hr_gate_scratch = at_norm(hr_gate_scratch,
                                  15 - qrec.internal_qtype.q)
    # ----------- SCALE Q7 -----------
    # Ht = (1 - zt) (.) ht + zt (.) Ht-1
    # zt = (1 - int_q) * Q7 + Q7 * Q7 = INT_Q * 2
    # >> and clip
    h_state = (args['h_state'][0].copy()).astype(
        np.int32) << (qrec.internal_qtype.q - 7)
    h_state = qrec.out_qs[0].clip(
        at_norm(
            (qrec.internal_qtype.quantize(1) - z_gate_scratch) *
            hr_gate_scratch + z_gate_scratch * h_state,
            (qrec.internal_qtype.q * 2) - 7)).astype(qrec.out_qs[0].dtype)
    args['h_state'][0] = h_state.copy()
    return h_state
def normalize(obj, n_bits):
    """Shift obj by n_bits: normalize right via at_norm for positive
    n_bits, shift left for negative n_bits, identity for zero."""
    if n_bits > 0:
        return at_norm(obj, n_bits)
    if n_bits < 0:
        return obj << -n_bits
    return obj
def _imp_(arr: np.ndarray, scale, scalen):
    """Widen arr to int32, multiply by scale and normalize by scalen bits."""
    scaled = arr.astype(np.int32) * scale
    return at_norm(scaled, scalen)
def _imp_(arr: np.ndarray, norm):
    """Widen arr to int32 and normalize it by norm bits."""
    widened = arr.astype(np.int32)
    return at_norm(widened, norm)
def execute(cls, params, in_tensors, qrec: QRec, **kwargs):
    """Fixed-point SSD detection post-processing: decode box offsets
    against anchors, sort by score, NMS, and emit the top detections.

    in_tensors: 0 offsets, 1 scores, 2 anchors. Outputs: boxes, classes,
    scores arrays of length params.max_detections.
    """
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")
    offsets = in_tensors[0]
    scores = in_tensors[1]
    anchors = in_tensors[2]
    # decoded_bboxes: Q14
    # valid_scores: Q7
    # NOTE(review): anchors_type is hard-coded, so the cors->cnts branch is
    # dead and anchors_cnts is never used below — confirm intent.
    anchors_type = "centers"
    if anchors_type == 'centers':
        anchors_cnts = anchors
    else:
        anchors_cnts = convert_cors2cnts(anchors)
    set_ssd_scales(qrec, params)
    scores_q = qrec.in_qs[1]
    score_threshold = scores_q.quantize(params.nms_score_threshold)
    decoded_bboxes = []
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            # NOTE(review): this break only exits the inner (class) loop;
            # the outer anchor loop continues — confirm the cap is intended
            # per-anchor rather than global.
            if len(decoded_bboxes) > params.max_bb_before_nms:
                break
            if scores[i, j] <= score_threshold:
                continue
            offset = offsets[i]
            anchor = anchors[i]
            # xcnt, ycnt --> Q14
            # xcnt = (So*O * Sa*Aw)/params.x_scale + Sa*Ax = So*Sa/params.x_scale (O*Aw + x_scale/So * Ax) =
            # (scale_x * (O*Aw + (scale_x_anc*Ax)>>scale_x_ancNorm))>>scale_xNorm =
            # at_norm(scale_x*(O*Aw + at_norm(scale_x_anc*Ax, scale_x_ancNorm)), scale_xNorm)
            xcenter = qrec.cache['scale_x_q'].apply_scales(
                np.multiply(
                    offset[CNTX_IDX], anchor[W_IDX], dtype=np.int32) +
                qrec.cache['scale_x_anc_q'].apply_scales(anchor[CNTX_IDX]))
            ycenter = qrec.cache['scale_y_q'].apply_scales(
                np.multiply(
                    offset[CNTY_IDX], anchor[H_IDX], dtype=np.int32) +
                qrec.cache['scale_y_anc_q'].apply_scales(anchor[CNTY_IDX]))
            # half_h, half_w --> Q14
            # half_h = exp(So*Off / params.h_scale) * Sa*A = Sa/So * exp(So/params.h_scale *O) * A =
            # (scale_ao * (A* exp17.15(scale_h*O<<15-scale_hNorm))>>scale_aoNorm) =
            # at_norm(scale_ao*(A*exp17.15(scale_h*O<<15-scale_hNorm)), scale_aoNorm)
            norm_h = 15 - qrec.cache['scale_h_q'].qnorms
            norm_w = 15 - qrec.cache['scale_w_q'].qnorms
            exp_h = exp_fp_17_15(
                np.multiply(offset[H_IDX],
                            int(qrec.cache['scale_h_q'].qbiases),
                            dtype=np.int32) << norm_h)
            exp_w = exp_fp_17_15(
                np.multiply(offset[W_IDX],
                            int(qrec.cache['scale_w_q'].qbiases),
                            dtype=np.int32) << norm_w)
            half_h = qrec.cache['scale_ao_q'].apply_scales(
                np.multiply(exp_h, anchor[H_IDX], dtype=np.int32)) >> 1
            half_w = qrec.cache['scale_ao_q'].apply_scales(
                np.multiply(exp_w, anchor[W_IDX], dtype=np.int32)) >> 1
            decoded_bboxes.append({
                "bbox": [
                    ycenter - half_h, xcenter - half_w, ycenter + half_h,
                    xcenter + half_w
                ],
                "score": scores[i, j],
                "class": j,
                "alive": True
            })
    # Bubble sort to sort the scores
    changed = True
    while changed:
        changed = False
        for i in range(len(decoded_bboxes) - 1):
            if decoded_bboxes[i]['score'] < decoded_bboxes[i + 1]['score']:
                temp = decoded_bboxes[i]
                decoded_bboxes[i] = decoded_bboxes[i + 1]
                decoded_bboxes[i + 1] = temp
                changed = True
    # NMS
    for idx in range(len(decoded_bboxes)):
        for idx_int in range(idx + 1, len(decoded_bboxes)):
            # Only suppress still-alive boxes of the same class.
            if (not decoded_bboxes[idx_int]['alive']) or (
                    decoded_bboxes[idx]['class'] !=
                    decoded_bboxes[idx_int]['class']):
                continue
            intersection = rect_intersect_area(
                decoded_bboxes[idx]['bbox'],
                decoded_bboxes[idx_int]['bbox'])
            union = rect_union_area(decoded_bboxes[idx]['bbox'],
                                    decoded_bboxes[idx_int]['bbox'])
            # IoU test in Q7: intersection >= iou_threshold * union.
            if intersection >= at_norm(
                    scores_q.quantize(params.nms_iou_threshold) * union, 7):
                decoded_bboxes[idx_int]['alive'] = False
    out_boxes = np.zeros((params.max_detections, 4),
                         dtype=qrec.out_qs[0].dtype)
    out_classes = np.zeros(params.max_detections, dtype=qrec.out_qs[1].dtype)
    out_scores = np.zeros(params.max_detections, dtype=qrec.out_qs[2].dtype)
    out_idx = 0
    # Emit surviving boxes in score order up to max_detections.
    for i in range(len(decoded_bboxes)):
        if out_idx >= params.max_detections:
            break
        bbox = decoded_bboxes[i]
        if bbox['alive']:
            out_boxes[out_idx] = bbox['bbox']
            out_classes[out_idx] = bbox['class']
            out_scores[out_idx] = bbox['score']
            out_idx += 1
    # decoded_bboxes, valid_scores = cls.decoder(
    #     params, qrec, offsets, anchors, scores, anchors_type='centers')
    # out_boxes, out_scores, out_classes = cls.nms(params, qrec, decoded_bboxes, valid_scores)
    # out_count = np.array([sum(out_classes != 0)])
    return qrec.get_outputs(params, [out_boxes, out_classes, out_scores],
                            ktype="symmetric")