def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, itr=1, fix_parameters=False, rng=None, with_bias=True, sn=True, test=False): """ """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: inmaps = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) w_sn = spectral_normalization_for_affine( w, itr=itr, test=test) if sn else w b = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, not fix_parameters) return F.affine(inp, w_sn, b, base_axis)
def INByBatchNorm(inp, axes=[1], decay_rate=0.9, eps=1e-5, fix_parameters=True):
    """Instance Normalization (implemented using BatchNormalization)

    Instance normalization is equivalent to batch normalization with a batch
    size of one; in other words, it normalizes over the spatial dimension(s),
    i.e., all dimensions except the batch and feature dimensions.
    """
    assert len(axes) == 1
    shape_stat = [1 for _ in inp.shape]
    shape_stat[axes[0]] = inp.shape[axes[0]]
    beta = get_parameter_or_create(
        "beta", shape_stat, ConstantInitializer(0), not fix_parameters)
    gamma = get_parameter_or_create(
        "gamma", shape_stat, ConstantInitializer(1), not fix_parameters)
    mean = get_parameter_or_create(
        "mean", shape_stat, ConstantInitializer(0), False)
    var = get_parameter_or_create(
        "var", shape_stat, ConstantInitializer(0), False)
    return F.batch_normalization(inp, beta, gamma, mean, var, axes,
                                 decay_rate, eps, batch_stat=True, output_stat=False)
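# Hypothetical usage sketch for INByBatchNorm above: assumes nnabla (nn),
# nnabla.functions (F), ConstantInitializer and get_parameter_or_create are
# importable as in the surrounding code; the scope name "in0" is illustrative.
import nnabla as nn

x = nn.Variable((1, 64, 32, 32))   # with batch size 1, batch norm == instance norm
with nn.parameter_scope("in0"):
    # creates "in0/beta", "in0/gamma" (fixed by default) and "in0/mean", "in0/var"
    y = INByBatchNorm(x)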
def test_get_parameter_with_initializer():
    """Testing with initializer
    """
    import nnabla as nn
    from nnabla.parameter import get_parameter_or_create
    nn.clear_parameters()
    rng = np.random.RandomState(seed=313)
    shape = (8, 8, 3, 3)

    # Instance inherited from BaseInitializer
    initializer = UniformInitializer(lim=(-1, 1), rng=rng)
    param1 = get_parameter_or_create(
        'param1', shape, initializer=initializer, need_grad=True)
    assert np.all(param1.d > -1) and np.all(param1.d < 1)

    # Numpy array
    initializer = rng.randn(*shape)
    param2 = get_parameter_or_create(
        'param2', initializer=initializer, need_grad=True)
    assert np.allclose(initializer, param2.d)

    # Random
    param3 = get_parameter_or_create('param3', shape, need_grad=True)
    nn.clear_parameters()
def test_get_parameter_or_create_need_grad():
    """Testing whether the need_grad flag works or not.
    """
    import nnabla as nn
    from nnabla.parameter import get_parameter_or_create
    nn.clear_parameters()
    param1 = get_parameter_or_create('p/param1', (2, 3, 4, 5), need_grad=True)
    p1d = np.random.randn(*param1.shape).astype(np.float32)
    p1g = np.random.randn(*param1.shape).astype(np.float32)
    param1.d = p1d
    param1.g = p1g
    param1_f = get_parameter_or_create('p/param1', param1.shape, need_grad=False)
    assert not param1_f.need_grad
    assert not param1.need_grad
    assert np.all(param1.d == p1d)
    assert np.all(param1.d == param1_f.d)
    param1.d = 1
    assert np.all(param1_f.d == 1)
    param1_f2 = get_parameter_or_create(
        'p/param1', param1.shape, need_grad=True, as_need_grad=False)
    assert param1.need_grad
    assert param1_f.need_grad
    assert not param1_f2.need_grad
    nn.clear_parameters()
def convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, itr=1, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, sn=True, test=False, init_scale=1.0): """ """ if w_init is None: l, u = calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)) l, u = init_scale * l, init_scale * u w_init = UniformInitializer((l, u), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create("W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel), w_init, not fix_parameters) w_sn = spectral_normalization_for_conv(w, itr=itr, test=test) if sn else w b = None if with_bias: b = get_parameter_or_create("b", (outmaps, ), b_init, not fix_parameters) return F.convolution(inp, w_sn, b, base_axis, pad, stride, dilation, group)
def __init__(self, hparams, comm=None, test=False, recompute=False, init_method=None, input_mean=None, input_scale=None): super(D3NetMSS, self).__init__(comm=comm, test=test, recompute=recompute, init_method=init_method) self.hparams = hparams if input_mean is None or input_scale is None: input_mean = np.zeros((1, 1, 1, self.hparams['fft_size'] // 2 + 1)) input_scale = np.ones((1, 1, 1, self.hparams['fft_size'] // 2 + 1)) else: input_mean = input_mean.reshape( (1, 1, 1, self.hparams['fft_size'] // 2 + 1)) input_scale = input_scale.reshape( (1, 1, 1, self.hparams['fft_size'] // 2 + 1)) self.in_offset = get_parameter_or_create('in_offset', shape=input_mean.shape, initializer=input_mean) self.in_scale = get_parameter_or_create('in_scale', shape=input_scale.shape, initializer=input_scale) self.decode_scale = get_parameter_or_create( 'decode_scale', (1, 1, 1, self.hparams['valid_signal_idx']), initializer=I.ConstantInitializer(value=1)) self.decode_bias = get_parameter_or_create( 'decode_bias', (1, 1, 1, self.hparams['valid_signal_idx']), initializer=I.ConstantInitializer(value=1))
def modify(self, f, inputs): if f.info.type_name not in self._fct_set: return # Prune the weight x, w = inputs[:2] b = None if len(inputs) == 3: b = inputs[2] output_channel = self.calculate_axis(f) shape = list(range(w.ndim)) shape.pop(output_channel) l2_norm_per_channel = np.sum( w.d ** 2, axis=tuple(shape), keepdims=True) mask = l2_norm_per_channel > self._pruning_threshold scope = self.get_parameter_scope(w) w_pruned, b_pruned = None, None with nn.parameter_scope(scope): w_data = w.d * mask w_pruned = get_parameter_or_create( 'w-pruned', w.shape, w_data, True, True) if b is not None: b_data = b.d * mask.reshape((-1,)) b_pruned = get_parameter_or_create( 'b-pruned', b_data.shape, b_data, True, True) h = self._fct_set[f.info.type_name]( x, w_pruned, b_pruned, **f.info.args) return h
def parametric_fixed_point_quantize_b_xmax(x, sign=True, n_init=8, n_min=2, n_max=16, xmax_init=1, xmax_min=0.001, xmax_max=10, fix_parameters=False): """Parametric version of `fixed_point_quantize` where the bitwidth `b` and dynamic range `xmax` are learnable parameters. Returns: ~nnabla.Variable: N-D array. """ def clip_scalar(v, min_value, max_value): return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value) def broadcast_scalar(v, shape): return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False), shape=shape) def quantize_pow2(v): return 2**F.round(F.log(v) / np.log(2.)) n = get_parameter_or_create("n", (), ConstantInitializer(n_init), need_grad=True, as_need_grad=not fix_parameters) xmax = get_parameter_or_create("xmax", (), ConstantInitializer(xmax_init), need_grad=True, as_need_grad=not fix_parameters) # ensure that bitwidth is in specified range and an integer n = F.round(clip_scalar(n, n_min, n_max)) if sign: n = n - 1 # ensure that dynamic range is in specified range xmax = clip_scalar(xmax, xmax_min, xmax_max) # compute step size from dynamic range and make sure that it is a pow2 d = quantize_pow2(xmax / (2**n - 1)) # compute min/max value that we can represent if sign: xmin = -xmax else: xmin = nn.Variable((1, ), need_grad=False) xmin.d = 0. # broadcast variables to correct size d = broadcast_scalar(d, shape=x.shape) xmin = broadcast_scalar(xmin, shape=x.shape) xmax = broadcast_scalar(xmax, shape=x.shape) # apply fixed-point quantization return d * F.round(F.clip_by_value(x, xmin, xmax) / d)
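# Hypothetical usage sketch for parametric_fixed_point_quantize_b_xmax above:
# assumes nn, F, np, ConstantInitializer and get_parameter_or_create are in
# scope; the variable "x" and the scope "quant0" are illustrative only.
import nnabla as nn

x = nn.Variable((8, 128))
with nn.parameter_scope("quant0"):
    # creates the learnable scalars "quant0/n" (bitwidth) and "quant0/xmax" (range)
    x_q = parametric_fixed_point_quantize_b_xmax(x, sign=True, n_init=8)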
def inq_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, num_bits=4, inq_iterations=(), selection_algorithm='random', seed=-1, w_init=None, i_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Incremental Network Quantization Convolution Layer During training, the weights are sequentially quantized to power-of-two values, which allows the training of a multiplierless network. Using `inq_iterations`, one can specify after how many forward passes half of the learnable weights are fixed and quantized to powers-of-two. After reaching the last value in `inq_iterations`, all weights are fixed. For more details, please refer to the reference. Reference: Zhou A, Yao A, Guo Y, Xu L, Chen Y. Incremental network quantization: Towards lossless CNNs with low-precision weights. <https://arxiv.org/abs/1702.03044> Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix. n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. num_bits (int): Number of bits per weight. Value has to be larger than 1 as one bit is already used to code the value "0" inq_iterations (tuple of int): Tuple of iteration numbers at which we fix half of the weights. selection_algorithm (str): Chooses algorithm that is used to decide which weights are fixed. ("largest_abs" ... fix weights with largest absolute value, "random" ... fix weights randomly) seed (int): Random seed for INQ algorithm w_init (~nnabla.initializer.BaseInitializer): Initializer for the weight. i_init (~nnabla.initializer.BaseInitializer): Initializer for the indicators (0 ... learnable, 1 ... fixed). b_init (~nnabla.initializer.BaseInitializer): Initializer for the bias. fix_parameters (bool): When set to `True`, the weight and bias will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. Returns: :class:`~nnabla.Variable` """ if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng) if i_init is None: i_init = ConstantInitializer() if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", (outmaps, inp.shape[base_axis]) + tuple(kernel), w_init, not fix_parameters) i = get_parameter_or_create( "I", (outmaps, inp.shape[base_axis]) + tuple(kernel), i_init, False) b = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, not fix_parameters) return F.inq_convolution(inp, w, i, b, base_axis, pad, stride, dilation, group, num_bits, inq_iterations, selection_algorithm, seed)
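# Hypothetical usage sketch for inq_convolution above: assumes the helpers it
# references (F, UniformInitializer, ConstantInitializer,
# calc_uniform_lim_glorot, get_parameter_or_create) are importable; names and
# iteration numbers are illustrative only.
import nnabla as nn

x = nn.Variable((4, 3, 32, 32))
with nn.parameter_scope("inq_conv1"):
    # half of the remaining learnable weights are fixed at iterations 1000 and 2000
    h = inq_convolution(x, 16, kernel=(3, 3), pad=(1, 1),
                        inq_iterations=(1000, 2000),
                        selection_algorithm='largest_abs')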
def create_scale_bias(idx, maps, ndim=4): shape = [1] * ndim shape[1] = maps a = get_parameter_or_create("a{}".format(idx), list(shape), None, True, True) b = get_parameter_or_create("b{}".format(idx), list(shape), None, True, True) return a, b
def conv(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1,
         w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None,
         with_bias=True, use_wscale=True, use_he_backward=False):
    """
    """
    # Use He backward
    if use_he_backward:
        std = calc_normal_std_he_backward(
            inp.shape[base_axis], outmaps, kernel=kernel)
    else:
        std = calc_normal_std_he_forward(
            inp.shape[base_axis], outmaps, kernel=kernel)

    # W init (integer division keeps the per-group channel count an int)
    if w_init is None and use_wscale:
        # Equalized Learning Rate
        w_init = NormalInitializer(1.)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)
        w *= std
    elif w_init is None and not use_wscale:
        w_init = NormalInitializer(std)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)
    else:
        if w_init is None:
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)

    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)

    return F.convolution(inp, w, b, base_axis, pad, stride, dilation, group)
def dummy_parametric_function(shape, f=10, i=1, s="dummy"): """Doc""" from nnabla import Variable from nnabla.parameter import get_parameter_or_create from nnabla.initializer import UniformInitializer p1 = get_parameter_or_create("p1", shape, UniformInitializer((-1, 1))) p2 = get_parameter_or_create( "p2", shape + (1,), UniformInitializer((-1, 1))) return Variable(shape)
def CCBN(h, y, n_classes, decay_rate=0.999, test=False, fix_parameters=False, coefs=[1.0]):
    """Categorical Conditional Batch Normalization"""
    # Call the batch normalization once
    shape_stat = [1 for _ in h.shape]
    shape_stat[1] = h.shape[1]
    gamma_tmp = nn.Variable.from_numpy_array(np.ones(shape_stat))
    beta_tmp = nn.Variable.from_numpy_array(np.zeros(shape_stat))
    mean = get_parameter_or_create(
        "mean", shape_stat, ConstantInitializer(0.0), False)
    var = get_parameter_or_create(
        "var", shape_stat, ConstantInitializer(1.0), False)
    h = F.batch_normalization(h, beta_tmp, gamma_tmp, mean, var,
                              decay_rate=decay_rate, batch_stat=not test)

    # Condition the gamma and beta with the class label
    b, c = h.shape[0:2]

    def embed_func(y, initializer):
        if not isinstance(y, list):
            o = embed(y, n_classes, c, initializer=initializer, sn=False, test=test)
        else:
            y_list = y
            o = reduce(lambda x, y: x + y,
                       [coef * embed(y, n_classes, c, initializer=initializer,
                                     sn=False, test=test)
                        for coef, y in zip(coefs, y_list)])
        return o

    with nn.parameter_scope("gamma"):
        gamma = embed_func(y, ConstantInitializer(1.0))
        gamma = F.reshape(gamma, [b, c] + [1 for _ in range(len(h.shape[2:]))])
        gamma = F.broadcast(gamma, h.shape)
    with nn.parameter_scope("beta"):
        beta = embed_func(y, ConstantInitializer(0.0))
        beta = F.reshape(beta, [b, c] + [1 for _ in range(len(h.shape[2:]))])
        beta = F.broadcast(beta, h.shape)
    return gamma * h + beta
def LN(inp, fix_parameters=False): """Layer normalization. """ beta_shape = (1, inp.shape[1], 1, 1) gamma_shape = (1, inp.shape[1], 1, 1) beta = get_parameter_or_create("beta", beta_shape, ConstantInitializer(0), not fix_parameters) gamma = get_parameter_or_create("gamma", gamma_shape, ConstantInitializer(1), not fix_parameters) return f_layer_normalization(inp, beta, gamma)
def convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None,
                group=1, w_init=None, b_init=None, base_axis=1,
                fix_parameters=False, rng=None, with_bias=True):
    """
    N-D Convolution with a bias term.

    For Dilated Convolution (a.k.a. Atrous Convolution), refer to:

    - Chen et al., DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs. https://arxiv.org/abs/1606.00915

    - Yu et al., Multi-Scale Context Aggregation by Dilated Convolutions. https://arxiv.org/abs/1511.07122

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: N-D array.

    """
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    # Integer division keeps the weight shape valid when grouping channels.
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
    return F.convolution(inp, w, b, base_axis, pad, stride, dilation, group)
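# Hypothetical usage sketch for the convolution defined above: assumes F,
# UniformInitializer, ConstantInitializer, calc_uniform_lim_glorot and
# get_parameter_or_create are importable; the scope "conv1" is illustrative.
import nnabla as nn

x = nn.Variable((4, 3, 32, 32))    # NCHW input
with nn.parameter_scope("conv1"):
    # creates "conv1/W" with shape (16, 3, 3, 3) and "conv1/b" with shape (16,)
    h = convolution(x, 16, kernel=(3, 3), pad=(1, 1), stride=(1, 1))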
def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True, use_wscale=True, use_he_backward=False): """ """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) # Use He backward if use_he_backward: std = calc_normal_std_he_backward(inp.shape[base_axis], n_outmap) else: std = calc_normal_std_he_forward(inp.shape[base_axis], n_outmap) # W init if w_init is None and use_wscale: # Equalized Learning Rate w_init = NormalInitializer(1.) w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) w *= std elif w_init is None and not use_wscale: w_init = NormalInitializer(std) w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) else: if w_init is None: w_init = UniformInitializer(calc_uniform_lim_glorot( inp.shape[base_axis], n_outmaps), rng=rng) w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) if with_bias and b_init is None: b_init = ConstantInitializer() b = None if with_bias: b = get_parameter_or_create("b", n_outmaps, b_init, not fix_parameters) return F.affine(inp, w, b, base_axis)
def noisy_layer(x, out_size, name): inpt_size = x.shape[1] root_p = np.sqrt(inpt_size) mu_init = UniformInitializer((-1.0 / root_p, 1.0 / root_p)) sig_init = ConstantInitializer(0.5 / root_p) eps_w, eps_b = sample_noise(inpt_size, out_size) with nn.parameter_scope(name): mu_w = get_parameter_or_create('mu_w', (inpt_size, out_size), mu_init) sig_w = get_parameter_or_create('sig_w', (inpt_size, out_size), sig_init) mu_b = get_parameter_or_create('mu_b', (out_size, ), mu_init) sig_b = get_parameter_or_create('sig_b', (out_size, ), sig_init) return F.affine(x, mu_w + sig_w * eps_w, mu_b + sig_b * eps_b)
def batch_normalization(inp, axes=[1], decay_rate=0.9, eps=1e-5, batch_stat=True, output_stat=False): """ Batch normalization layer. .. math:: \\begin{array}{lcl} \\mu &=& \\frac{1}{M} \\sum x_i\\\\ \\sigma^2 &=& \\frac{1}{M} \\left(\\sum x_i - \\mu\\right)^2\\\\ \\hat{x}_i &=& \\frac{x_i - \\mu}{\\sqrt{\\sigma^2 + \\epsilon}} \\\\ y_i &=& \\hat{x}_i \\gamma + \\beta. \\end{array} where :math:`x_i, y_i` are the inputs. In testing, the mean and variance computed by moving average calculated during training are used. Args: inp (~nnabla.Variable): N-D array of input. axes (:obj:`tuple` of :obj:`int`): Axes mean and variance are taken. decay_rate (float): Decay rate of running mean and variance. eps (float): Tiny value to avoid zero division by std. batch_stat (bool): Use mini-batch statistics rather than running ones. output_stat (bool): Output batch mean and variance. Returns: :class:`~nnabla.Variable`: N-D array. References: - Ioffe and Szegedy, Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. https://arxiv.org/abs/1502.03167 """ assert len(axes) == 1 shape_stat = [1 for _ in inp.shape] shape_stat[axes[0]] = inp.shape[axes[0]] beta = get_parameter_or_create("beta", shape_stat, ConstantInitializer(0), True) gamma = get_parameter_or_create("gamma", shape_stat, ConstantInitializer(1), True) mean = get_parameter_or_create("mean", shape_stat, ConstantInitializer(0), False) var = get_parameter_or_create("var", shape_stat, ConstantInitializer(0), False) return F.batch_normalization(inp, beta, gamma, mean, var, axes, decay_rate, eps, batch_stat, output_stat)
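# Hypothetical usage sketch for the batch_normalization defined above: assumes
# F, ConstantInitializer and get_parameter_or_create are in scope; reusing the
# same parameter scope shares beta/gamma/mean/var between the two calls.
import nnabla as nn

x = nn.Variable((16, 64, 8, 8))
with nn.parameter_scope("bn1"):
    h_train = batch_normalization(x, batch_stat=True)    # mini-batch statistics, updates mean/var
with nn.parameter_scope("bn1"):
    h_test = batch_normalization(x, batch_stat=False)    # uses the running mean/var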
def deconvolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None,
                  group=1, w_init=None, b_init=None, base_axis=1,
                  fix_parameters=False, rng=None, with_bias=True):
    """
    Deconvolution layer.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of deconvolution kernels (which is equal to the number of output channels). For example, to apply deconvolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply deconvolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: N-D array.

    """
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            outmaps, inp.shape[base_axis], tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    # Integer division keeps the weight shape valid when grouping channels.
    w = get_parameter_or_create(
        "W", (inp.shape[base_axis], outmaps // group) + tuple(kernel),
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
    return F.deconvolution(inp, w, b, base_axis, pad, stride, dilation, group)
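# Hypothetical usage sketch for the deconvolution defined above: assumes F,
# UniformInitializer, ConstantInitializer, calc_uniform_lim_glorot and
# get_parameter_or_create are importable; names are illustrative only.
import nnabla as nn

x = nn.Variable((4, 64, 16, 16))
with nn.parameter_scope("deconv1"):
    # stride-2 deconvolution roughly doubles the spatial resolution
    h = deconvolution(x, 32, kernel=(4, 4), pad=(1, 1), stride=(2, 2))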
def BN(inp, axes=[1], decay_rate=0.9, eps=1e-5, batch_stat=True, output_stat=False, fix_parameters=False): """Batch Normalization """ shape_stat = [1 for _ in inp.shape] shape_stat[axes[0]] = inp.shape[axes[0]] beta = get_parameter_or_create( "beta", shape_stat, ConstantInitializer(0), not fix_parameters) gamma = get_parameter_or_create( "gamma", shape_stat, ConstantInitializer(1), not fix_parameters) mean = get_parameter_or_create( "mean", shape_stat, ConstantInitializer(0), False) var = get_parameter_or_create( "var", shape_stat, ConstantInitializer(0), False) return F.batch_normalization(inp, beta, gamma, mean, var, axes, decay_rate, eps, batch_stat, output_stat)
def spectral_normalization_for_conv(w, itr=1, eps=1e-12, test=False): w_shape = w.shape d0 = w.shape[0] # Out d1 = np.prod(w.shape[1:]) # In u0 = get_parameter_or_create( "singular-vector", [d0], NormalInitializer(), False) return F.spectral_norm(w, u0, dim=0, itr=itr, eps=eps, test=test)
def svd_convolution(x, n_outputs, kernel, pad, with_bias, cr):
    W = get_parameter('conv/W')
    if W is None:
        UV = None
    else:
        UV = W.d
    b = get_parameter('conv/b')

    # Compute the rank (size of the intermediate activations)
    # needed to obtain the desired compression.
    inmaps = x.shape[1]
    outmaps = n_outputs
    Ksize = np.prod(kernel)
    rank = int(np.floor((1 - cr) * inmaps * outmaps * Ksize /
                        (inmaps * Ksize + inmaps * outmaps)))

    # Initialize bias from the existing convolution bias if it exists
    if b is not None:
        b_new = get_parameter_or_create(
            'svd_conv/b', b.d.shape, need_grad=b.need_grad)
        b_new.d = b.d.copy()

    logger.info(
        "SVD convolution created: inmaps = {}; outmaps = {}; compression = {}; rank = {};"
        .format(inmaps, outmaps, cr, rank))

    # Create svd_convolution initialized from W in the current context if it exists
    return PF.svd_convolution(x, n_outputs, kernel=kernel, r=rank, pad=pad,
                              with_bias=with_bias, uv_init=UV)
def test_parameter_scope_slash(): """Testing if parameter_scope('aaa/bbb') works. """ import nnabla as nn from nnabla.parameter import get_parameter_or_create nn.clear_parameters() with nn.parameter_scope('aaa/bbb'): param = get_parameter_or_create('ccc', (2, 3, 4, 5)) ref = np.random.randn(*param.shape).astype(np.float32) param.d = ref with nn.parameter_scope('aaa'): with nn.parameter_scope('bbb'): param = get_parameter_or_create('ccc', (2, 3, 4, 5)) assert np.all(param.d == ref) nn.clear_parameters()
def embed(inp, n_inputs, n_features, initializer=None,
          fix_parameters=False, apply_w=None):
    """ Embed.

    Embed slices a matrix/tensor with indexing array/tensor.
    Weights are initialized with :obj:`nnabla.initializer.UniformInitializer`
    within the range of :math:`-\\sqrt{3}` and :math:`\\sqrt{3}`.

    Args:
        inp (~nnabla.Variable): [Integer] Indices with shape :math:`(I_0, ..., I_N)`
        n_inputs : number of possible inputs, words or vocabularies
        n_features : number of embedding features
        initializer (~nnabla.initializer.BaseInitializer): Initializer for the embedding matrix.
        fix_parameters (bool): When set to `True`, the embedding weight matrix will not be updated.
        apply_w (function): Lambda, function, or callable object applied to the weights.

    Returns:
        ~nnabla.Variable: Output with shape :math:`(I_0, ..., I_N, W_1, ..., W_M)`
    """
    if initializer is None:
        initializer = UniformInitializer((-np.sqrt(3.), np.sqrt(3)))
    w = get_parameter_or_create("W", [n_inputs, n_features],
                                initializer, True, not fix_parameters)
    if apply_w is not None:
        w = apply_w(w)
    return F.embed(inp, w)
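# Hypothetical usage sketch for the embed defined above: assumes F, np,
# UniformInitializer and get_parameter_or_create are in scope; the vocabulary
# size, feature size and scope name are illustrative only.
import numpy as np
import nnabla as nn

ids = nn.Variable((8, 20))                            # 8 sequences of 20 token indices
ids.d = np.random.randint(0, 10000, size=ids.shape)
with nn.parameter_scope("embed1"):
    e = embed(ids, n_inputs=10000, n_features=128)    # creates "embed1/W" of shape (10000, 128)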
def attnblock(h, r=8, fix_parameters=False, sn=True, test=False): """Attention block""" x = h # 1x1 convolutions b, c, s0, s1 = h.shape c_r = c // r assert c_r > 0 f_x = convolution(h, c_r, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="f", with_bias=False, sn=sn, test=test) g_x = convolution(h, c_r, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="g", with_bias=False, sn=sn, test=test) h_x = convolution(h, c, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="h", with_bias=False, sn=sn, test=test) # Attend attn = F.batch_matmul(f_x.reshape( [b, c_r, -1]), g_x.reshape([b, c_r, -1]), transpose_a=True) attn = F.softmax(attn, 1) h_x = h_x.reshape([b, c, -1]) o = F.batch_matmul(h_x, attn) o = F.reshape(o, [b, c, s0, s1]) # Shortcut gamma = get_parameter_or_create( "gamma", [1, 1, 1, 1], ConstantInitializer(0.), not fix_parameters) y = gamma * o + x return y
def svd_affine(x, n_outputs, cr):
    W = get_parameter('affine/W')
    if W is None:
        UV = None
    else:
        UV = W.d
    b = get_parameter('affine/b')

    # Compute the rank (size of the intermediate activations)
    # needed to obtain the desired compression.
    inshape = np.prod(x.shape[1:])
    outshape = np.prod(n_outputs)
    rank = int(np.floor((1 - cr) * inshape * outshape / (inshape + outshape)))

    # Initialize bias from the existing affine bias if it exists
    if b is not None:
        b_new = get_parameter_or_create(
            'svd_affine/b', b.d.shape, need_grad=b.need_grad)
        b_new.d = b.d.copy()

    logger.info(
        "SVD affine created: input_shape = {}; output_shape = {}; compression = {}; rank = {};"
        .format(inshape, outshape, cr, rank))

    # Create svd_affine initialized from W in the current context if it exists
    return PF.svd_affine(x, n_outputs, rank, uv_init=UV)
def modify(self, f, inputs):
    fname = f.info.type_name
    if fname not in self._fct_set:
        return
    # Skip unless the previous or the next function is BatchNormalization
    next_func = f.outputs[0].function_references[0]
    prev_func = f.inputs[0].parent
    if (prev_func is None or prev_func.info.type_name != 'BatchNormalization') \
            and next_func.info.type_name != 'BatchNormalization':
        return
    x = inputs[0]
    w = inputs[1]
    b = inputs[2] if len(inputs) == 3 else None
    if b is not None:
        return
    scope = self.get_parameter_scope(w)
    n_outmaps = w.shape[1] if fname == 'Affine' else w.shape[0]
    with nn.parameter_scope(scope):
        b = get_parameter_or_create('b', (n_outmaps, ),
                                    ConstantInitializer(), True, True)
    h = self.connect(f, x, w, b)
    return h
def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None,
           fix_parameters=False, rng=None, with_bias=True):
    """
    The affine layer, also known as the fully connected layer. Computes

    .. math::
        {\\mathbf y} = {\\mathbf A} {\\mathbf x} + {\\mathbf b}.

    where :math:`{\\mathbf x}, {\\mathbf y}` are the inputs and outputs respectively,
    and :math:`{\\mathbf A}, {\\mathbf b}` are constants.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix.
        n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)
    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.affine(inp, w, b, base_axis)
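# Hypothetical usage sketch for the affine defined above: assumes F,
# UniformInitializer, ConstantInitializer, calc_uniform_lim_glorot and
# get_parameter_or_create are importable; the scope "fc1" is illustrative.
import nnabla as nn

x = nn.Variable((32, 784))
with nn.parameter_scope("fc1"):
    # creates "fc1/W" with shape (784, 256) and "fc1/b" with shape (256,)
    y = affine(x, 256)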
def embed(inp, n_inputs, n_features, initializer=None, itr=1, fix_parameters=False, sn=True, test=False): """ """ w = get_parameter_or_create("W", [n_inputs, n_features], initializer, not fix_parameters) w_sn = spectral_normalization_for_affine( w, itr=itr, test=test) if sn else w return F.embed(inp, w_sn)
def _init_beta_gamma(shape, fix_parameters, param_init, no_bias, no_scale): from nnabla.parameter import get_parameter_or_create from nnabla.initializer import ConstantInitializer if no_bias: beta = None else: beta_init = param_init.get('beta', ConstantInitializer(0)) beta = get_parameter_or_create("beta", shape, beta_init, True, not fix_parameters) if no_scale: gamma = None else: gamma_init = param_init.get('gamma', ConstantInitializer(1)) gamma = get_parameter_or_create("gamma", shape, gamma_init, True, not fix_parameters) return beta, gamma
def test_get_parameter_or_create_need_grad():
    """Testing whether the need_grad flag works or not.
    """
    import nnabla as nn
    from nnabla.parameter import get_parameter_or_create
    nn.clear_parameters()
    param1 = get_parameter_or_create('param1', (2, 3, 4, 5), need_grad=True)
    p1d = np.random.randn(*param1.shape).astype(np.float32)
    p1g = np.random.randn(*param1.shape).astype(np.float32)
    param1.d = p1d
    param1.g = p1g
    param1_f = get_parameter_or_create('param1', param1.shape, need_grad=False)
    assert not param1_f.need_grad
    assert param1.need_grad
    assert np.all(param1.d == p1d)
    assert np.all(param1.d == param1_f.d)
    param1.d = 1
    assert np.all(param1_f.d == 1)
    nn.clear_parameters()
def _create_variable(v, name, shape): # Create and initialize variables class Variable: pass parameter = v.type == "Parameter" variable_instance = None if parameter: if v.initializer.type == 'Normal': initializer = NormalInitializer(v.initializer.multiplier) elif v.initializer.type == 'NormalAffineHe' or v.initializer.type == 'NormalAffineHeForward': initializer = (lambda shape: NormalInitializer(calc_normal_std_he_forward( shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier) elif v.initializer.type == 'NormalAffineHeBackward': initializer = (lambda shape: NormalInitializer(calc_normal_std_he_backward( shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier) elif v.initializer.type == 'NormalAffineGlorot': initializer = (lambda shape: NormalInitializer(calc_normal_std_glorot( shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier) elif v.initializer.type == 'NormalConvolutionHe' or v.initializer.type == 'NormalConvolutionHeForward': initializer = (lambda shape: NormalInitializer(calc_normal_std_he_forward( shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier) elif v.initializer.type == 'NormalConvolutionHeBackward': initializer = (lambda shape: NormalInitializer(calc_normal_std_he_backward( shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier) elif v.initializer.type == 'NormalConvolutionGlorot': initializer = (lambda shape: NormalInitializer(calc_normal_std_glorot( shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier) elif v.initializer.type == 'Uniform': initializer = UniformInitializer( lim=[-v.initializer.multiplier, v.initializer.multiplier]) elif v.initializer.type == 'UniformAffineGlorot': initializer = (lambda shape: UniformInitializer(calc_uniform_lim_glorot( shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier) elif v.initializer.type == 'UniformConvolutionGlorot': initializer = (lambda shape: UniformInitializer(calc_uniform_lim_glorot( shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier) elif v.initializer.type == 'Constant': initializer = ConstantInitializer(value=v.initializer.multiplier) else: initializer = None variable_instance = get_parameter_or_create(name, shape, initializer) else: # create empty variable, memory will be allocated in network.setup() # after network optimization variable_instance = nn.Variable() variable = Variable() variable.name = name variable.parameter = parameter variable.shape = shape variable.variable_instance = variable_instance return variable
def embed(inp, n_inputs, n_features):
    """ Embed.

    Embed slices a matrix/tensor with indexing array/tensor.

    Args:
        inp (~nnabla.Variable): [Integer] Indices with shape :math:`(I_0, ..., I_N)`
        n_inputs : number of possible inputs, words or vocabularies
        n_features : number of embedding features

    Returns:
        ~nnabla.Variable: Output with shape :math:`(I_0, ..., I_N, W_1, ..., W_M)`
    """
    w = get_parameter_or_create("W", [n_inputs, n_features],
                                UniformInitializer((-np.sqrt(3.), np.sqrt(3))), True)
    return F.embed(inp, w)
def prelu(inp, base_axis=1, shared=True):
    """
    Parametrized Rectified Linear Unit function defined as

    .. math::
        y_i = \max(0, x_i) + w_i \min(0, x_i)

    where the negative slope :math:`w` is learned and can vary across channels
    (an axis specified with base_axis).

    Args:
        inp(~nnabla.Variable): N-D array as input
        base_axis(int): Dimensions up to base_axis are treated as the sample dimensions.
        shared(bool): Use a single shared weight value or one weight per channel.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    shape = tuple() if shared else (inp.shape[base_axis],)
    w = get_parameter_or_create("W", shape, ConstantInitializer(-1), True)
    return F.prelu(inp, w, base_axis)
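# Hypothetical usage sketch for the prelu defined above: assumes F,
# ConstantInitializer and get_parameter_or_create are in scope.
import nnabla as nn

x = nn.Variable((8, 64, 16, 16))
with nn.parameter_scope("prelu_shared"):
    y_shared = prelu(x)                   # single shared slope parameter "W" (scalar)
with nn.parameter_scope("prelu_channel"):
    y_channel = prelu(x, shared=False)    # one slope per channel: "W" with shape (64,)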
def binary_connect_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, w_init=None, wb_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Binary Connect Convolution, multiplier-less inner-product. Binary Connect Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}. Therefore :math:`sign(w_i)` is either :math:`1` or :math:`-1` and the inner product simplifies to addition. This function should be used together with BatchNormalization. References: M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction. w_init (~nnabla.initializer.BaseInitializer): Initializer for weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for bias. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. 
    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if wb_init is None:
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, not fix_parameters)
    # Use the dedicated initializer for the binary weight buffer.
    wb = get_parameter_or_create(
        "Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        wb_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, not fix_parameters)
    return F.binary_connect_convolution(inp, w, wb, b, base_axis, pad, stride, dilation, group)
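# Hypothetical usage sketch for binary_connect_convolution above: assumes its
# helpers (F, UniformInitializer, ConstantInitializer, calc_uniform_lim_glorot,
# get_parameter_or_create) are importable; typically followed by batch norm.
import nnabla as nn

x = nn.Variable((4, 3, 32, 32))
with nn.parameter_scope("bc_conv1"):
    # creates float weights "W" and their binarized copy "Wb"
    h = binary_connect_convolution(x, 16, kernel=(3, 3), pad=(1, 1))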
def binary_connect_affine(inp, n_outmaps, base_axis=1, w_init=None, wb_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True): """Binary Connect Affine, multiplier-less inner-product. Binary Connect Affine is an affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_i = \sum_{i} sign(w_i) x_i. Therefore :math:`sign(w_i)` is either :math:`1` or :math:`-1` and the inner product simplifies to addition. This function should be used together with Batch Normalization. References: M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (~nnabla.initializer.BaseInitializer): Initializer for weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for bias. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. Returns: :class:`~nnabla.Variable` """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: fan_in = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if wb_init is None: fan_in = np.prod(inp.shape[base_axis:]) wb_init = UniformInitializer( calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) wb = get_parameter_or_create( "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, wb_init, not fix_parameters) b = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, not fix_parameters) return F.binary_connect_affine(inp, w, wb, b, base_axis)
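# Hypothetical usage sketch for binary_connect_affine above, under the same
# assumptions as the other sketches; scope and sizes are illustrative only.
import nnabla as nn

x = nn.Variable((32, 784))
with nn.parameter_scope("bc_fc1"):
    y = binary_connect_affine(x, 256)   # creates "W", its binarized copy "Wb", and "b"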
def binary_weight_affine(inp, n_outmaps, base_axis=1, w_init=None, wb_init=None,
                         b_init=None, fix_parameters=False, rng=None, with_bias=True):
    """Binary Weight Affine, multiplier-less inner-product with a scale factor.

    Binary Weight Affine is the affine function, but the inner product
    in this function is the following,

    .. math::

        y_j = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}} \sum_{i} sign(w_{ji}) x_i

    Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1` and the
    inner product simplifies to addition followed by scaling factor
    :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}}`.
    The number of :math:`\\alpha` values equals the number of outmaps of the
    affine function.

    References:

        Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using
        Binary Convolutional Neural Networks." arXiv preprint
        arXiv:1603.05279 (2016).

    .. note::

        1) if you would like to share weights between some layers, please
        make sure to share the standard, floating value weights (`weight`)
        and not the binarized weights (`binary_weight`)

        2) The weights and the binary weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after a
        call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the binary weights will not be in sync.

        3) CPU and GPU implementations now use float value for
        `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for the weight.
        wb_init (~nnabla.initializer.BaseInitializer): Initializer for the binary weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for the bias.
        fix_parameters (bool): When set to `True`, the weight and bias will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if wb_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        wb_init, not fix_parameters)
    alpha = get_parameter_or_create(
        "alpha", n_outmaps, ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.binary_weight_affine(inp, w, wb, alpha, b, base_axis)
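# Hypothetical usage sketch for binary_weight_affine above, under the same
# assumptions as the other sketches; scope and sizes are illustrative only.
import nnabla as nn

x = nn.Variable((32, 784))
with nn.parameter_scope("bw_fc1"):
    # creates "W", "Wb", the per-outmap scale "alpha", and "b"
    y = binary_weight_affine(x, 256)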
def binary_weight_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, w_init=None, wb_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Binary Weight Convolution, multiplier-less inner-product with a scale factor. Binary Weight Convolution is the convolution function, but the inner product in this function is the following, .. math:: y_{n, a, b} = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}} \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}. Therefore :math:`sign(w_{n, m, i, j})` is either :math:`1` or :math:`-1` and the inner product simplifies to addition followed by scaling factor :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}}`. The number of :math:`n` is the number of outmaps of the convolution function. References: Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks." arXiv preprint arXiv:1603.05279 (2016). .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction. w_init (~nnabla.initializer.BaseInitializer): Initializer for weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for bias. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. 
    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if wb_init is None:
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, not fix_parameters)
    # Use the dedicated initializer for the binary weight buffer.
    wb = get_parameter_or_create(
        "Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        wb_init, not fix_parameters)
    alpha = get_parameter_or_create(
        "alpha", (outmaps, ), ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, not fix_parameters)
    return F.binary_weight_convolution(inp, w, wb, alpha, b, base_axis, pad, stride, dilation, group)
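# Hypothetical usage sketch for binary_weight_convolution above, under the same
# assumptions as the other sketches; scope and sizes are illustrative only.
import nnabla as nn

x = nn.Variable((4, 3, 32, 32))
with nn.parameter_scope("bw_conv1"):
    # creates "W", "Wb", the per-outmap scale "alpha", and "b"
    h = binary_weight_convolution(x, 16, kernel=(3, 3), pad=(1, 1))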