def parametric_fixed_point_quantize_b_xmax(x, sign=True,
                                            n_init=8, n_min=2, n_max=16,
                                            xmax_init=1, xmax_min=0.001, xmax_max=10,
                                            fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the bitwidth `b`
    and dynamic range `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1

    # ensure that dynamic range is in specified range
    xmax = clip_scalar(xmax, xmax_min, xmax_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d = quantize_pow2(xmax / (2 ** n - 1))

    # compute min/max value that we can represent
    if sign:
        xmin = -xmax
    else:
        xmin = nn.Variable((1,), need_grad=False)
        xmin.d = 0.

    # broadcast variables to correct size
    d = broadcast_scalar(d, shape=x.shape)
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # apply fixed-point quantization
    return d * F.round(F.clip_by_value(x, xmin, xmax) / d)

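# A minimal usage sketch (not part of the original code; the layer name and
# weight shape are illustrative): the quantizer is applied to an existing
# weight tensor under its own parameter scope, so the learnable `n` and
# `xmax` are created once per layer. Assumes the usual imports used above
# (nnabla as nn, nnabla.functions as F, numpy as np).
w = nn.Variable((64, 3, 3, 3), need_grad=True)  # e.g. convolution weights
with nn.parameter_scope("conv1/Wquant"):
    w_q = parametric_fixed_point_quantize_b_xmax(w, sign=True)
# `w_q` can now be used in place of `w` when building the forward graph, so
# the quantization parameters are trained jointly with the weights.
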
def transformer(train=True, dropout_ratio=0.1):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    with nn.parameter_scope('embedding_layer'):
        # h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
        h = token_embedding(x, vocab_size, embedding_size)
    h = position_encoding(h)
    if train:
        h = F.dropout(h, p=dropout_ratio)

    for i in range(hopping_num):
        with nn.parameter_scope(f'encoder_hopping_{i}'):
            h = residual_normalization_wrapper(multihead_self_attention)(
                h, head_num, mask=mask, train=train, dropout_ratio=dropout_ratio)
            h = residual_normalization_wrapper(positionwise_feed_forward)(
                h, train=train, dropout_ratio=dropout_ratio)

    with nn.parameter_scope('output_layer'):
        y = F.sigmoid(PF.affine(h[:, 0, :], 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t))
    return x, y, t, accuracy, loss

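# A minimal driver sketch (an assumption, not part of the original code): the
# graph builder above can be trained with the same Trainer/solver pattern that
# appears with the global-average-pooling model later in this collection.
# `Trainer`, `train_data_iter` and `dev_data_iter` are assumed to be defined
# as in that snippet.
x, y, t, accuracy, loss = transformer(train=True, dropout_ratio=0.1)
solver = S.Adam()
solver.set_parameters(nn.get_parameters())
trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={'cross entropy': loss, 'accuracy': accuracy},
                  solver=solver)
trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)
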
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h,
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1, ],
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)[:, ::-1, ]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)

    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
        if train:
            a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
        if train:
            a = F.dropout(a, p=dropout_ratio)

    a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)

    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
        if train:
            output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t)) + attention_penalty_coef * frobenius(
        F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss

def quantize_pow2(v):
    return 2 ** F.round(F.log(v) / np.log(2.))

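# A quick numerical check (illustrative sketch in plain NumPy rather than
# NNabla Variables): rounding in the log2 domain snaps a positive value to its
# nearest power of two, which is what the helper above computes on the graph.
import numpy as np

def quantize_pow2_np(v):
    return 2.0 ** np.round(np.log2(v))

print(quantize_pow2_np(0.3))  # 0.25 (log2(0.3) ~ -1.74 rounds to -2)
print(quantize_pow2_np(0.4))  # 0.5  (log2(0.4) ~ -1.32 rounds to -1)
print(quantize_pow2_np(6.0))  # 8.0  (log2(6.0) ~  2.58 rounds to  3)
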
def network_size_activations():
    """
    Returns total number of activations and size in KBytes
    (NNabla variable using `max` or `sum` operator)
    """
    kbytes = []
    num_activations = 0

    # get all parameters
    ps = nn.get_parameters(grad_only=False)

    for p in ps:
        if "Asize" in p:
            print(f"{p}\t{ps[p].d}")
            num_activations += ps[p].d

            if cfg.a_quantize is not None:
                if cfg.a_quantize in ['fp_relu', 'pow2_relu']:
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.a_bitwidth
                elif cfg.a_quantize in ['parametric_fp_relu',
                                        'parametric_fp_b_xmax_relu',
                                        'parametric_fp_d_b_relu',
                                        'parametric_pow2_b_xmax_relu',
                                        'parametric_pow2_b_xmin_relu']:
                    # parametric quantization
                    s = p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/n")
                    n = F.round(
                        clip_scalar(ps[s], cfg.a_bitwidth_min, cfg.a_bitwidth_max))
                elif cfg.a_quantize in ['parametric_fp_d_xmax_relu']:
                    # these quantization methods do not have n, so we need to compute it!
                    # parametric quantization
                    d = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/d")]
                    xmax = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that stepsize is in specified range and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.a_stepsize_min, cfg.a_stepsize_max))

                    # ensure that dynamic range is in specified range
                    xmax = clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max)

                    # compute real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0)),
                                         cfg.a_bitwidth_min)
                elif cfg.a_quantize in ['parametric_pow2_xmin_xmax_relu']:
                    # these quantization methods do not have n, so we need to compute it!
                    # parametric quantization
                    xmin = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmin")]
                    xmax = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that dynamic ranges are in specified range and a power-of-two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.a_xmin_min, cfg.a_xmin_max))
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max))

                    # use ceil rounding
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.) + 1.),
                        cfg.a_bitwidth_min)
                else:
                    raise ValueError(
                        "Unknown quantization method {}".format(cfg.a_quantize))
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            kbytes.append(
                F.reshape(n * ps[p].d / 8. / 1024., (1,), inplace=False))

    if cfg.target_activation_type == 'max':
        _kbytes = F.max(F.concatenate(*kbytes))
    elif cfg.target_activation_type == 'sum':
        _kbytes = F.sum(F.concatenate(*kbytes))

    return num_activations, _kbytes

def network_size_weights():
    """
    Return total number of weights and network size (for weights) in KBytes
    """
    kbytes = None
    num_params = None

    # get all parameters
    ps = nn.get_parameters()

    for p in ps:
        if (p.endswith("quantized_conv/W") or p.endswith("quantized_conv/b")
                or p.endswith("quantized_affine/W")
                or p.endswith("quantized_affine/b")):
            _num_params = np.prod(ps[p].shape)
            print(f"{p}\t{ps[p].shape}\t{_num_params}")

            if cfg.w_quantize is not None:
                if cfg.w_quantize in ['parametric_fp_b_xmax',
                                      'parametric_fp_d_b',
                                      'parametric_pow2_b_xmax',
                                      'parametric_pow2_b_xmin']:
                    # parametric quantization
                    n_p = p + "quant/" + cfg.w_quantize + "/n"
                    n = F.round(
                        clip_scalar(ps[n_p], cfg.w_bitwidth_min, cfg.w_bitwidth_max))
                elif cfg.w_quantize == 'parametric_fp_d_xmax':
                    # this quantization method does not have `n`, so we need to compute it
                    d = ps[p + "quant/" + cfg.w_quantize + "/d"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that stepsize is in specified range and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.w_stepsize_min, cfg.w_stepsize_max))

                    # ensure that dynamic range is in specified range
                    xmax = clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max)

                    # compute real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    # we do not clip to `cfg.w_bitwidth_max` as xmax/d_q could
                    # correspond to more than 8 bits
                    n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0) + 1.0),
                                         cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'parametric_pow2_xmin_xmax':
                    # this quantization method does not have `n`, so we need to compute it
                    xmin = ps[p + "quant/" + cfg.w_quantize + "/xmin"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that minimum dynamic range is in specified range and a power-of-two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.w_xmin_min, cfg.w_xmin_max))

                    # ensure that maximum dynamic range is in specified range and a power-of-two
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max))

                    # use ceil to determine bitwidth
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.0) + 1.),
                        cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'fp' or cfg.w_quantize == 'pow2':
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.w_bitwidth
                else:
                    raise ValueError(
                        f'Unknown quantization method {cfg.w_quantize}')
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            if kbytes is None:
                kbytes = n * _num_params / 8. / 1024.
                num_params = _num_params
            else:
                kbytes += n * _num_params / 8. / 1024.
                num_params += _num_params

    return num_params, kbytes

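# Sketch of one way the returned values could be used (an assumption about
# usage, not taken verbatim from the original code): both size estimates are
# NNabla graphs (over the learnable quantization parameters when a parametric
# scheme is used), so they can be attached to the task loss as soft budget
# penalties and minimized jointly. `task_loss`, `penalty_weight` and the
# KByte budgets below are hypothetical.
num_weights, kbytes_w = network_size_weights()
num_acts, kbytes_a = network_size_activations()

target_w_kb = 150.0   # weight-memory budget in KBytes (hypothetical)
target_a_kb = 100.0   # activation-memory budget in KBytes (hypothetical)

size_penalty = (F.relu(kbytes_w - target_w_kb) / target_w_kb
                + F.relu(kbytes_a - target_a_kb) / target_a_kb)
total_loss = task_loss + penalty_weight * size_penalty
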
def global_average_pooling_1d(x, mask):
    count = F.sum(mask, axis=1)
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))

mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)

with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))

accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={'cross entropy': loss, 'accuracy': accuracy},
                  solver=solver)

trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)

def quantize_pow2(v):
    # signed variant of the helper above: take the magnitude first so
    # negative inputs are also snapped to a power of two
    return 2. ** F.round(F.log(F.abs(v)) / np.log(2.))

def parametric_pow2_quantize_b_xmin(x, sign=True, with_zero=True,
                                    n_init=8, n_min=1, n_max=8,
                                    xmin_init=2**-7, xmin_min=2**-15, xmin_max=256,
                                    fix_parameters=False):
    """Parametric version of `pow2_quantize` where the bitwidth `n` and the
    smallest value `xmin` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1
    if with_zero:
        n = n - 1

    # ensure that minimum dynamic range is in specified range and a power-of-two
    xmin = quantize_pow2(clip_scalar(xmin, xmin_min, xmin_max))

    # compute min/max value that we can represent
    xmax = xmin * (2 ** ((2 ** n) - 1))

    # broadcast variables to correct size
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = xmin / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)
    else:
        idx1 = F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (xmin * idx1 + quantize_pow2(ax) * idx2 + xmax * idx3)

def parametric_pow2_quantize(x, sign=True, with_zero=True,
                             n_init=8, n_min=1, n_max=16,
                             m_init=1, m_min=-8, m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the bitwidth `n` and
    dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the quantization parameters
            `n` and `m` will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1
    if with_zero:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range and an integer
    m_q = F.round(clip_scalar(m, m_min, m_max))

    # compute min/max value that we can represent
    x_max = 2 ** m_q
    x_min = 2 ** (m_q - (2 ** n_q) + 1)

    # broadcast variables to correct size
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = x_min / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)
    else:
        idx1 = F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (x_min * idx1 + quantize_pow2(ax) * idx2 + x_max * idx3)

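# A small forward-pass sketch (illustrative, not from the original code):
# quantize a random tensor on a 4-bit signed power-of-two grid. With
# sign=True and with_zero=True, n_init=4 leaves 2 bits for the exponent and
# m_init=0 gives x_max = 1 and x_min = 2**-3, so magnitudes snap to
# {0.125, 0.25, 0.5, 1}, values below 0.125/sqrt(2) are pruned to zero and
# values above 1 saturate.
x = nn.Variable((4, 4), need_grad=True)
x.d = np.random.randn(4, 4)
with nn.parameter_scope("demo/pow2_quant"):
    x_q = parametric_pow2_quantize(x, sign=True, with_zero=True,
                                   n_init=4, m_init=0)
x_q.forward()
print(np.unique(np.abs(x_q.d)))  # subset of {0, 0.125, 0.25, 0.5, 1}
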
def parametric_fixed_point_quantize(x, sign=True,
                                    n_init=8, n_min=2, n_max=16,
                                    m_init=1, m_min=-8, m_max=8,
                                    fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the bitwidth `n`
    and dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the quantization parameters
            `n` and `m` will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range
    m_q = clip_scalar(m, m_min, m_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d_q = quantize_pow2((2 ** m_q) / (2 ** n_q - 1))

    # compute min/max value that we can represent
    x_max = d_q * (2 ** n_q - 1)
    if sign:
        x_min = -x_max
    else:
        x_min = nn.Variable((1,), need_grad=False)
        x_min.d = 0.

    # broadcast variables to correct size
    d_q = broadcast_scalar(d_q, shape=x.shape)
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # apply fixed-point quantization
    return d_q * F.round(F.clip_by_value(x, x_min, x_max) / d_q)
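
# Worked numbers for the defaults above (illustrative, not from the original
# code): with sign=True, n_init=8 and m_init=1 we get n_q = 7 and m_q = 1, so
# the step size is d_q = quantize_pow2(2**1 / (2**7 - 1)) = quantize_pow2(2/127)
# = 2**round(-5.99) = 2**-6 = 0.015625, and the representable range is
# [-x_max, x_max] with x_max = d_q * (2**7 - 1) = 127/64 ~ 1.98.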