import numpy as np

import nnabla as nn
import nnabla.functions as F
from nnabla.initializer import ConstantInitializer
from nnabla.parameter import get_parameter_or_create


def drop_path(x):
    """Stochastic depth (DropPath) regularization.

    The same implementation as the PyTorch versions: a per-sample random
    value is drawn from a uniform distribution, and if it is less than
    ``drop_rate`` the corresponding sample is zeroed. Survivors are rescaled
    by ``1 / (1 - drop_rate)`` so the expected activation is unchanged.
    The ``drop_rate`` parameter is read from the current parameter scope.
    """
    drop_prob = nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
    # Per-sample mask: keep a sample when its random value is >= drop_rate.
    mask = F.rand(shape=(x.shape[0], 1, 1, 1))
    mask = F.greater_equal(mask, drop_prob)
    # Rescale survivors to preserve the expectation of the output.
    x = F.div2(x, 1 - drop_prob)
    x = F.mul2(x, mask)
    return x
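
# --- Hedged usage sketch (not part of the original module) ------------------
# A minimal demonstration of `drop_path`, assuming NNabla is installed. The
# scope name "demo/drop_path" and the 0.2 rate are illustrative choices; the
# "drop_rate" parameter must be created and filled before the call.
def _demo_drop_path():
    x = nn.Variable((8, 16, 4, 4))
    x.d = np.random.randn(*x.shape)
    with nn.parameter_scope("demo/drop_path"):
        rate = nn.parameter.get_parameter_or_create(
            "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
        rate.d.fill(0.2)  # drop each sample with probability 0.2
        y = drop_path(x)  # reads the same "drop_rate" from this scope
    y.forward()
    # Dropped batch rows are all-zero; survivors are scaled by 1 / 0.8.
    return y.d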
# Assumed helper (hedged): in the NNabla codebase an equivalent of `no_grad`
# typically lives next to the backward functions; this sketch stops gradients
# from flowing through `x` while keeping the forward graph intact.
def no_grad(x):
    return x.apply(need_grad=False)


def absolute_error_backward(inputs):
    """Backward function of `absolute_error`.

    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward
            function, in the order ``[dy, x0, x1]``.

    Returns:
        list of Variable: Gradients w.r.t. the inputs of the corresponding
        forward function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    x1 = inputs[2]
    # m0 is 1 where x0 >= x1 and 0 elsewhere, so m0 - m1 equals sign(x0 - x1).
    m0 = F.greater_equal(x0, x1)
    m1 = 1 - m0
    # Do not backpropagate through the masks.
    m0 = no_grad(m0)
    m1 = no_grad(m1)
    dx0 = dy * (m0 - m1)
    dx1 = -dx0
    return dx0, dx1
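
# --- Hedged usage sketch (not part of the original module) ------------------
# Sanity check of `absolute_error_backward` against the chain rule:
# d|x0 - x1| / dx0 = sign(x0 - x1), so dx0 should equal dy * sign(x0 - x1)
# and dx1 its negation. The values below are illustrative.
def _demo_absolute_error_backward():
    x0 = nn.Variable.from_numpy_array(np.array([1.0, -2.0, 3.0]))
    x1 = nn.Variable.from_numpy_array(np.array([0.5, 1.0, 3.5]))
    dy = nn.Variable.from_numpy_array(np.ones(3))  # incoming gradient
    dx0, dx1 = absolute_error_backward([dy, x0, x1])
    F.sink(dx0, dx1).forward()
    # Expect dx0.d == [1, -1, -1] (== sign(x0 - x1)) and dx1.d == -dx0.d.
    return dx0.d, dx1.d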
def parametric_pow2_quantize_xmin_xmax(x, sign=True, with_zero=True,
                                       xmin_init=2**-7, xmin_min=2**-15,
                                       xmin_max=256, xmax_init=2**0,
                                       xmax_min=2**-8, xmax_max=256,
                                       fix_parameters=False):
    """Parametric version of `pow2_quantize` where the min value `xmin` and
    max value `xmax` of the dynamic range are learnable parameters.

    Args:
        x (~nnabla.Variable): N-D array as input.
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        xmin_init (float): initial minimum value of the dynamic range.
        xmin_min (float): lower bound for `xmin`.
        xmin_max (float): upper bound for `xmin`.
        xmax_init (float): initial maximum value of the dynamic range.
        xmax_min (float): lower bound for `xmax`.
        xmax_max (float): upper bound for `xmax`.
        fix_parameters (bool): When set to `True`, `xmin` and `xmax` will not
            be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2. ** F.round(F.log(F.abs(v)) / np.log(2.))

    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that the minimum of the dynamic range is in the specified range
    # and a power of two
    xmin = quantize_pow2(clip_scalar(xmin, xmin_min, xmin_max))

    # ensure that the maximum of the dynamic range is in the specified range
    # and a power of two
    xmax = quantize_pow2(clip_scalar(xmax, xmax_min, xmax_max))

    # broadcast variables to the correct size
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # if unsigned, quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune the smallest elements (in magnitude) to zero if they are
        # smaller than `xmin / sqrt(2)`
        x_threshold = xmin / np.sqrt(2)
        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)
    else:
        idx1 = F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)

    # do not backpropagate gradients through the indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradients through the sign
    sx.need_grad = False

    # values below the dynamic range saturate to xmin, values above to xmax
    return sx * (xmin * idx1 + quantize_pow2(ax) * idx2 + xmax * idx3)
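
# --- Hedged usage sketch (not part of the original module) ------------------
# Quantizing a weight tensor with a learnable dynamic range. The scope name
# "q_w" and the tensor shape are illustrative; `xmin`/`xmax` are created in
# the current parameter scope and can be trained jointly with the network.
def _demo_parametric_pow2_quantize_xmin_xmax():
    w = nn.Variable.from_numpy_array(
        np.random.uniform(-1, 1, (16, 8)).astype(np.float32))
    with nn.parameter_scope("q_w"):
        w_q = parametric_pow2_quantize_xmin_xmax(w)
    w_q.forward()
    # Every surviving entry of w_q.d is +/- a power of two; entries below
    # xmin / sqrt(2) have been pruned to zero, and out-of-range entries are
    # saturated to xmin or xmax (both powers of two themselves).
    return w_q.d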
def parametric_pow2_quantize(x, sign=True, with_zero=True,
                             n_init=8, n_min=1, n_max=16,
                             m_init=1, m_min=-8, m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the bitwidth `n` and
    dynamic range `m` are learnable parameters.

    Args:
        x (~nnabla.Variable): N-D array as input.
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        n_init (int): initial value for the bitwidth parameter `n`.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (float): initial value for the dynamic range parameter `m`.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, `n` and `m` will not be
            updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that the bitwidth is in the specified range and an integer;
    # reserve one bit for the sign and one code word for zero if requested
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1
    if with_zero:
        n_q = n_q - 1

    # ensure that the dynamic range is in the specified range and an integer
    m_q = F.round(clip_scalar(m, m_min, m_max))

    # compute the min/max values that we can represent
    x_max = 2 ** m_q
    x_min = 2 ** (m_q - (2 ** n_q) + 1)

    # broadcast variables to the correct size
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # if unsigned, quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune the smallest elements (in magnitude) to zero if they are
        # smaller than `x_min / sqrt(2)`
        x_threshold = x_min / np.sqrt(2)
        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)
    else:
        idx1 = F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)

    # do not backpropagate gradients through the indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradients through the sign
    sx.need_grad = False

    # values below the dynamic range saturate to x_min, values above to x_max
    return sx * (x_min * idx1 + quantize_pow2(ax) * idx2 + x_max * idx3)
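
# --- Hedged usage sketch (not part of the original module) ------------------
# Quantizing activations with a learnable bitwidth and dynamic range. The
# scope name "q_act" and the 8-bit signed configuration are illustrative; the
# learned scalars can be read back from the parameter dictionary afterwards.
def _demo_parametric_pow2_quantize():
    h = nn.Variable.from_numpy_array(
        np.random.randn(4, 32).astype(np.float32))
    with nn.parameter_scope("q_act"):
        h_q = parametric_pow2_quantize(h, sign=True, with_zero=True,
                                       n_init=8, m_init=1)
    h_q.forward()
    # Parameters are registered under the scope, e.g. "q_act/n", "q_act/m".
    params = nn.get_parameters(grad_only=False)
    return h_q.d, params["q_act/n"].d, params["q_act/m"].d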