def check_forward(self, log_pi_data, tau):
    log_pi = chainer.Variable(log_pi_data)
    y = functions.gumbel_softmax(log_pi, tau=tau)

    # Only checks dtype and shape because its result contains noise
    self.assertEqual(y.dtype, numpy.float32)
    self.assertEqual(y.shape, log_pi.shape)
    self.assertEqual(
        backend.get_array_module(y),
        backend.get_array_module(log_pi))
def check_forward(self, e1_data, e2_data, W_data, V1_data, V2_data,
                  b_data):
    e1 = chainer.Variable(e1_data)
    e2 = chainer.Variable(e2_data)
    W = chainer.Variable(W_data)

    e1_data = e1_data.reshape(e1_data.shape[0], -1)
    e2_data = e2_data.reshape(e2_data.shape[0], -1)
    xp = backend.get_array_module(e1)
    y_expect = xp.einsum('ij,ik,jkl->il', e1_data, e2_data, W_data)

    flags = V1_data is None, V2_data is None, b_data is None
    if any(flags):
        if not all(flags):
            raise ValueError(
                'Test either all or none of the optional parameters.')
        y = functions.bilinear(e1, e2, W)
    else:
        V1 = chainer.Variable(V1_data)
        V2 = chainer.Variable(V2_data)
        b = chainer.Variable(b_data)
        y = functions.bilinear(e1, e2, W, V1, V2, b)
        y_expect = xp.einsum('ij,ik,jkl->il', e1_data, e2_data, W_data)
        y_expect += e1_data.dot(V1_data)
        y_expect += e2_data.dot(V2_data)
        y_expect += b_data

    testing.assert_allclose(y_expect, cuda.to_cpu(y.data))
    assert y.data.dtype == e1_data.dtype
def update_approximate_vectors(
        weight_matrix, u, n_power_iteration, eps):
    """Update the first left and right singular vectors.

    This function updates the first left singular vector `u` and
    the first right singular vector `v`.

    Args:
        weight_matrix (~chainer.Variable): 2D weight.
        u (numpy.ndarray, cupy.ndarray, or None):
            Vector that approximates the first left singular vector and
            has the shape of (out_size,).
        n_power_iteration (int): Number of iterations to approximate
            the first right and left singular vectors.
        eps (float): Small constant added to the norms to avoid division
            by zero.

    Returns:
        :class:`numpy.ndarray` or `cupy.ndarray`:
            Approximate first left singular vector.
        :class:`numpy.ndarray` or `cupy.ndarray`:
            Approximate first right singular vector.
    """
    weight_matrix = weight_matrix.array
    xp = backend.get_array_module(weight_matrix)
    for _ in range(n_power_iteration):
        v = l2normalize(xp, xp.dot(u, weight_matrix), eps)
        u = l2normalize(xp, xp.dot(weight_matrix, v), eps)
    return u, v
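# A minimal NumPy sketch of the power iteration above. The helper
# `_l2normalize_demo` is a stand-in for `l2normalize` (not part of the
# original code); shapes and values are arbitrary illustrations.
import numpy


def _l2normalize_demo(x, eps=1e-12):
    # Normalize a vector by its L2 norm, guarding against division by zero.
    return x / (numpy.linalg.norm(x) + eps)


W = numpy.random.randn(4, 3).astype(numpy.float32)   # (out_size, in_size)
u = _l2normalize_demo(numpy.random.randn(4).astype(numpy.float32))
for _ in range(1):
    v = _l2normalize_demo(u.dot(W))      # right singular vector estimate
    u = _l2normalize_demo(W.dot(v))      # left singular vector estimate
sigma = u.dot(W).dot(v)                  # approximate first singular value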
def check_backward(self, x_data, W_data, b_data, y_grad,
                   use_cudnn='never'):
    if not self.c_contiguous:
        xp = backend.get_array_module(x_data)
        x_data = xp.asfortranarray(x_data)
        W_data = xp.asfortranarray(W_data)
        y_grad = xp.asfortranarray(y_grad)
        self.assertFalse(x_data.flags.c_contiguous)
        self.assertFalse(W_data.flags.c_contiguous)
        self.assertFalse(y_grad.flags.c_contiguous)
        if b_data is not None:
            b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
            b[::2] = b_data
            b_data = b[::2]
            self.assertFalse(b_data.flags.c_contiguous)

    args = (x_data, W_data)
    if b_data is not None:
        args += (b_data,)

    def f(*args):
        return F.deconvolution_nd(
            *args, stride=self.stride, pad=self.pad,
            outsize=self.outsize, dilate=self.dilate, groups=self.groups)

    with chainer.using_config('use_cudnn', use_cudnn):
        with chainer.using_config('autotune', self.autotune):
            gradient_check.check_backward(
                f, args, y_grad, **self.check_backward_options)
def variable_repr(var):
    """Return the string representation of a variable.

    Args:
        var (~chainer.Variable): Input Variable.
    .. seealso:: numpy.array_repr
    """
    xp = backend.get_array_module(var)
    if xp is numpy:
        arr = var.data
    else:
        arr = var.data.get()

    if var.name:
        prefix = 'variable ' + var.name
    else:
        prefix = 'variable'

    if arr is None:
        lst = 'None'
    elif arr.size > 0 or arr.shape == (0,):
        lst = numpy.array2string(arr, None, None, None, ', ', prefix + '(')
    else:  # show zero-length shape unless it is (0,)
        lst = '[], shape=%s' % (repr(arr.shape),)

    return '%s(%s)' % (prefix, lst)
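# Illustration of the resulting representation. The exact string shown in
# the comments is an assumption: it depends on the NumPy print options in
# effect.
# >>> v = chainer.Variable(
# ...     numpy.array([1., 2., 3.], dtype=numpy.float32), name='x')
# >>> variable_repr(v)
# 'variable x([1., 2., 3.])'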
def forward(self, x):
    self.retain_inputs(())
    dims = x[0].shape[2:]
    ndim = self.ndim
    ksize = self.ksize
    stride = self.stride
    pad = self.pad
    if self.outs is None:
        self.outs = tuple(
            conv.get_deconv_outsize(d, k, s, p, cover_all=self.cover_all)
            for (d, k, s, p) in six.moves.zip(dims, ksize, stride, pad))

    xp = backend.get_array_module(*x)

    colon = slice(None)
    # (:, :, None, None, ..., None)
    tile_index = (colon, colon) + (None,) * ndim
    # (1, 1, k_1, k_2, ..., k_n, 1, 1, ..., 1)
    tile_reps = (1, 1) + ksize + (1,) * ndim
    col = xp.tile(x[0][tile_index], tile_reps)

    if xp is numpy:
        col2im_nd = conv_nd.col2im_nd_cpu
    else:
        col2im_nd = conv_nd.col2im_nd_gpu
    y = col2im_nd(col, stride, pad, self.outs)

    return y,
def backward(self, indexes, grad_outputs):
    x, = self.get_retained_inputs()
    xp = backend.get_array_module(x)
    y, = self.get_retained_outputs()
    gy, = grad_outputs
    F = chainer.functions

    axis = self.axis
    if axis is None:
        shape = x.shape
        axis = 0
        x = F.flatten(x)
    else:
        shape = None
        if axis < 0:
            axis += y.ndim

    if y.shape[axis] <= 1:
        gx = gy
    else:
        _, x = F.split_axis(x, (1,), axis)
        gx = _flipcumprodsum(x, gy, axis)
        y, ylast = F.split_axis(y, (-1,), axis)
        gx *= F.concat([xp.ones_like(ylast.array), y], axis=axis)

    if shape is not None:
        gx = F.reshape(gx, shape)

    return gx,
def forward(self, inputs):
    gy, = inputs
    xp = backend.get_array_module(*inputs)
    gx = xp.zeros(self._in_shape, gy.dtype)
    if xp is numpy:
        try:
            numpy.add.at(gx, self.slices, gy)
        except IndexError:
            done = False
            # In numpy<1.13, 0-dim boolean index is not supported in
            # numpy.add.at and it's supported for 0-dim arr in
            # arr.__getitem__.
            if not _numpy_supports_0d_bool_index and len(self.slices) == 1:
                idx = numpy.asanyarray(self.slices[0])
                if idx.dtype == numpy.dtype(bool):
                    # Convert the array and the mask to 1-dim.
                    # numpy.add.at with them is supported in older numpy.
                    numpy.add.at(gx[None], idx[None], gy)
                    done = True

            if not done:
                msg = '''
GetItem does not support backward for this slices. The slices argument is not
supported by numpy.add.at, while it is supported by numpy.ndarray.__getitem__.

Please report this error to the issue tracker with the stack trace,
the information of your environment, and your script:
https://github.com/chainer/chainer/issues/new.
'''
                raise IndexError(msg)
    else:
        gx.scatter_add(self.slices, inputs[0])
    return gx,
def forward(self, inputs):
    xp = backend.get_array_module(inputs[0])
    self.input_length, label_length, t, xs = inputs

    if self.zero_padding is None:
        if xs.dtype == numpy.float16:
            self.zero_padding = -10000.0
        else:
            self.zero_padding = -10000000000.0

    if chainer.is_debug():
        assert len(xs) >= xp.max(self.input_length)
        assert t.shape[1] >= xp.max(label_length)

    self.path_length = 2 * label_length + 1

    self.yseq = _softmax(xs, xp)
    log_yseq = self.log_matrix(self.yseq, xp)
    self.path = _label_to_path(t, self.blank_symbol, xp)
    self.prob_trans = self.calc_trans(
        log_yseq, self.input_length, t,
        label_length, self.path, self.path_length, xp)

    loss = -_logsumexp(self.prob_trans[0], xp, axis=1)
    if self.reduce == 'mean':
        loss = utils.force_array(xp.mean(loss))
    return loss,
def backward(self, indexes, grad_outputs):
    anchor, positive, negative = self.get_retained_inputs()

    N = anchor.shape[0]
    x_dim = anchor.shape[1]

    xp = backend.get_array_module(anchor)
    tmp = xp.repeat(self.dist_hinge[:, None], x_dim, axis=1)
    mask = xp.array(tmp > 0, dtype=anchor.dtype)

    gy, = grad_outputs
    if self.reduce == 'mean':
        g = gy / N
    else:
        g = gy[:, None]

    tmp = 2 * chainer.functions.broadcast_to(g, mask.shape) * mask

    ret = []
    if 0 in indexes:
        ret.append(tmp * (negative - positive))
    if 1 in indexes:
        ret.append(tmp * (positive - anchor))
    if 2 in indexes:
        ret.append(tmp * (anchor - negative))
    return ret
def forward(self, inputs):
    self.retain_inputs((0, 1))
    scale = inputs[1].dtype.type(1. / (1 - self.ratio))
    xp = backend.get_array_module(*inputs)

    if self.mask is None:
        if self.use_batchwise_mask:
            mask_shape = (inputs[0].shape[0], inputs[1].shape[0],
                          inputs[1].shape[1])
        else:
            mask_shape = (inputs[1].shape[0], inputs[1].shape[1])
        if xp == numpy:
            self.mask = xp.random.rand(*mask_shape) >= self.ratio
        else:
            self.mask = xp.random.rand(
                *mask_shape, dtype=numpy.float32) >= self.ratio
    elif isinstance(self.mask, variable.Variable):
        self.mask = self.mask.data

    x = _as_mat(inputs[0])
    W = inputs[1] * scale * self.mask

    # (i)jk,ik->ij
    y = _matmul(W, x[:, :, None], xp)
    y = y.reshape(y.shape[0], y.shape[1]).astype(x.dtype, copy=False)

    if len(inputs) == 3:
        b = inputs[2]
        y += b
    return y,
def forward(self, inputs):
    if inputs[0].shape[1] % self.groups != 0:
        raise ValueError('The number of channels {} is not divisible by '
                         '\'groups\' argument {}.'
                         .format(inputs[0].shape[1], self.groups))
    xp = backend.get_array_module(*inputs)

    if xp is cuda.cupy and chainer.should_use_cudnn('>=auto', 5000):
        return self.forward_cudnn(inputs)

    self.retain_inputs((0, 1))
    x, gamma, beta = inputs

    orig_shape = x.shape
    batch_size, channels = orig_shape[:2]
    groups = self.groups
    reduced_shape = (batch_size * groups, -1)
    x = x.reshape(reduced_shape)

    self.mean = x.mean(axis=1)
    x_hat = x - self.mean[:, None]
    var = (x_hat * x_hat).mean(axis=1)
    var += self.eps
    self.inv_std = var
    del var
    xp.sqrt(self.inv_std, out=self.inv_std, dtype=x.dtype)
    xp.reciprocal(self.inv_std, out=self.inv_std)

    x_hat *= self.inv_std[:, None]
    y = x_hat.reshape((batch_size, channels, -1))
    y *= gamma[:, None]
    y += beta[:, None]

    y = y.reshape(orig_shape)
    return y,
def check_backward_consistency_regression(self, backend_config):
    # Regression test to two-dimensional unpooling layer.
    x_data, = self.generate_inputs()
    gy_data = numpy.random.uniform(-1, 1, self.gy_shape).astype(self.dtype)

    ksize = self.ksize
    stride = self.stride
    pad = self.pad
    xp = backend.get_array_module(x_data)

    # Backward computation for N-dimensional unpooling layer.
    x_nd = chainer.Variable(xp.array(x_data))
    y_nd = functions.unpooling_nd(
        x_nd, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
    y_nd.grad = gy_data
    y_nd.backward()

    # Backward computation for two-dimensional unpooling layer.
    x_2d = chainer.Variable(xp.array(x_data))
    y_2d = functions.unpooling_2d(
        x_2d, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
    y_2d.grad = gy_data
    y_2d.backward()

    # Test that the two result gradients are close enough.
    opt = self.check_backward_options
    testing.assert_allclose(
        x_nd.grad, x_2d.grad, atol=opt['atol'], rtol=opt['rtol'])
def backward(self, inputs, grad_outputs):
    expander = self.expander

    x, gamma, gy = inputs
    gx1, ggamma1, _ = self.output_data
    ggx1, gggamma1, ggbeta1 = grad_outputs
    xp = backend.get_array_module(x)

    # auxiliary values
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    r = 0 if ggx1 is None else (gx1 * ggx1).sum(axis=self.axis)
    coeff = gamma * self.inv_std
    coeff_m = coeff * inv_m
    x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])

    # handle None in output gradients
    ggx1 = _zero_if_none(xp, ggx1, x.shape, x.dtype)
    gggamma1 = _zero_if_none(xp, gggamma1, gamma.shape, gamma.dtype)
    ggbeta1 = _zero_if_none(xp, ggbeta1, gamma.shape, gamma.dtype)

    gggamma2 = gggamma1 - coeff_m * (x_hat * ggx1).sum(axis=self.axis)
    ggbeta2 = ggbeta1 - coeff_m * ggx1.sum(axis=self.axis)

    ggamma2 = r / gamma

    gx_hat2 = (gggamma2[expander] * gy -
               (coeff_m * ggamma1)[expander] * ggx1)
    gstd2 = -self.inv_std * (r + (x_hat * gx_hat2).sum(axis=self.axis))
    gmean2 = -self.inv_std * gx_hat2.sum(axis=self.axis)
    gx2 = self.inv_std[expander] * gx_hat2 + inv_m * (
        gmean2[expander] + x_hat * gstd2[expander])
    ggy2 = (gggamma2[expander] * x_hat + ggbeta2[expander] +
            coeff[expander] * ggx1)

    return gx2, ggamma2, ggy2
def check_backward(self, x_data, W_data, b_data, y_grad):
    xp = backend.get_array_module(x_data)
    if not self.c_contiguous:
        x_data = xp.asfortranarray(x_data)
        W_data = xp.asfortranarray(W_data)
        y_grad = xp.asfortranarray(y_grad)
        self.assertFalse(x_data.flags.c_contiguous)
        self.assertFalse(W_data.flags.c_contiguous)
        self.assertFalse(y_grad.flags.c_contiguous)
        if b_data is not None:
            b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
            b[::2] = b_data
            b_data = b[::2]
            self.assertFalse(b_data.flags.c_contiguous)

    args = (x_data, W_data)
    if b_data is not None:
        args = args + (b_data,)

    def f(*args):
        return F.dilated_convolution_2d(
            *args, stride=self.stride, pad=self.pad, dilate=self.dilate,
            cover_all=self.cover_all)

    with chainer.using_config('use_cudnn', self.use_cudnn):
        gradient_check.check_backward(
            f, args, y_grad, dtype=numpy.float64,
            **self.check_backward_options)
def _coo_matmul_gradsp(a, b, c_row, c_col, c_shape, transa, transb, transc,
                       dtype):
    if dtype is None:
        dtype = numpy.result_type(a.dtype, b.dtype)

    if transa:
        A = a.swapaxes(-1, -2)
    else:
        A = a
    if transb:
        B = b.swapaxes(-1, -2)
    else:
        B = b
    if transc:
        C_row = c_col
        C_col = c_row
    else:
        C_row = c_row
        C_col = c_col

    xp = backend.get_array_module(A, B)
    if xp is numpy:
        return _coo_matmul_gradsp_cpu(A, B, C_row, C_col, dtype)
    else:
        return _coo_matmul_gradsp_gpu(A, B, C_row, C_col, dtype)
def forward(self, inputs):
    x, = inputs
    # Pass the input so that CuPy is selected for GPU arrays; calling
    # get_array_module() without arguments always returns NumPy.
    xp = backend.get_array_module(x)
    y0 = xp.exp(x)
    y1 = xp.expm1(x)
    self.retain_outputs((0,))
    return y0, y1
def sign(x):
    """Elementwise sign function.

    For a given input :math:`x`, this function returns :math:`sgn(x)`
    defined as

    .. math::

        sgn(x) = \\left \\{ \\begin{array}{cc}
        -1 & {\\rm if~x < 0} \\\\
        0 & {\\rm if~x = 0} \\\\
        1 & {\\rm if~x > 0} \\\\
        \\end{array} \\right.

    .. note::

        The gradient of this function is ``None`` everywhere and therefore
        unchains the computational graph.

    Args:
        x (~chainer.Variable): Input variable for which the sign is computed.

    Returns:
        ~chainer.Variable: Output variable.
    """
    if isinstance(x, chainer.variable.Variable):
        x = x.array
    xp = backend.get_array_module(x)
    return chainer.as_variable(utils.force_array(xp.sign(x)))
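# Usage sketch: the function accepts an ndarray or a Variable and returns a
# Variable holding the elementwise signs; values below are arbitrary.
import numpy
import chainer.functions as F

x = numpy.array([-2.0, 0.0, 3.5], dtype=numpy.float32)
y = F.sign(x)
# y.array is [-1., 0., 1.]; gradients do not flow back into x.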
def __init__(self, initializer=None, shape=None, name=None):
    if initializer is None:
        initializer = constant.NaN()
    elif numpy.isscalar(initializer):
        initializer = constant.Constant(initializer)
    if shape is None:
        if isinstance(initializer, (numpy.ndarray, cuda.ndarray)):
            # parameter initialized by the initial array
            super(Parameter, self).__init__(initializer, name=name)
        else:
            # uninitialized parameter
            super(Parameter, self).__init__(name=name)
            dtype = getattr(initializer, 'dtype', None)
            self._grad_initializer = constant.NaN(dtype)
    else:
        # parameter initialized with a given shape
        if isinstance(initializer, (numpy.ndarray, cuda.ndarray)):
            xp = backend.get_array_module(initializer)
            initializer = constant.Constant(initializer)
        else:
            xp = numpy
        data = initializers.generate_array(initializer, shape, xp)
        grad = xp.full_like(data, numpy.nan)
        super(Parameter, self).__init__(data, name=name, grad=grad)

    self.update_rule = None
    self.initializer = initializer
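# Usage sketch of the three initialization paths handled above (initializer
# object with an explicit shape, initial array, and deferred initialization);
# the shapes are arbitrary.
import numpy
import chainer
from chainer import initializers

p1 = chainer.Parameter(initializers.Normal(0.05), (3, 2))           # shape given
p2 = chainer.Parameter(numpy.zeros((2, 2), dtype=numpy.float32))    # from array
p3 = chainer.Parameter()   # uninitialized until its data is set later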
def backward(self, inputs, grads):
    xp = backend.get_array_module(*inputs)
    _, indices, _ = inputs
    g = grads[0]

    gv = g[xp.arange(len(indices)), indices]
    g[xp.arange(len(indices)), indices] = 0
    return g, None, gv
def forward(self, inputs):
    self.retain_inputs((0, 1))
    xp = backend.get_array_module(*inputs)
    x1, x2 = inputs
    difference = x1 - x2
    y = xp.square(difference)
    return utils.force_array(y, dtype=x1.dtype),
def __call__(self, rule, param):
    grad = param.grad
    if grad is None:
        return
    xp = backend.get_array_module(grad)
    with cuda.get_device_from_array(grad):
        xp.clip(grad, self.lower_bound, self.upper_bound, out=grad)
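# The hook above clips each gradient element into [lower_bound, upper_bound].
# A hook with this interface is registered on an optimizer roughly as below;
# GradientHardClipping is assumed here as the built-in hook with the same
# bounds semantics, and the model/learning rate are arbitrary.
import chainer
import chainer.links as L
from chainer import optimizers, optimizer_hooks

model = L.Linear(3, 2)
opt = optimizers.SGD(lr=0.01)
opt.setup(model)
opt.add_hook(optimizer_hooks.GradientHardClipping(-1.0, 1.0))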
def forward(self, inputs):
    # Channels-first is Chainer's tensor format
    # W is 6-dimensional
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    stride_row, stride_col = self.sy, self.sx
    output_row, output_col = W.shape[1], W.shape[2]
    feature_dim = W.shape[3] * W.shape[4] * W.shape[5]
    xp = backend.get_array_module(*inputs)
    output = xp.empty((x.shape[0], W.shape[0], output_row, output_col,),
                      dtype=x.dtype)
    for i in moves.range(output_row):
        for j in moves.range(output_col):
            slice_row = slice(i * stride_row,
                              i * stride_row + W.shape[4])
            slice_col = slice(j * stride_col,
                              j * stride_col + W.shape[5])
            x_flatten = xp.reshape(x[..., slice_row, slice_col],
                                   (-1, feature_dim))
            W_flatten = xp.reshape(W[:, i, j, ...], (-1, feature_dim))
            output[..., i, j] = xp.dot(x_flatten, W_flatten.T)

    if b is not None:
        output += b[None, :, :, :]
    self.retain_inputs((0, 1))  # only retain x and W
    return output,
def backward(self, inputs, grad_outputs):
    x, gamma, _ = inputs
    gy = grad_outputs[0]
    head_ndim = gamma.ndim + 1
    expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)
    m = gamma.dtype.type(x.size // gamma.size)
    axis = (0,) + tuple(range(head_ndim, x.ndim))
    xp = backend.get_array_module(x)

    # Note: we must be in train mode.
    assert configuration.config.train
    # NOTE(tommi): cuDNN is not used since it does not support
    # batch renormalization
    gbeta = gy.sum(axis=axis)
    ggamma = (gy * self.x_hat_renorm).sum(axis=axis)
    gsigma_batch = (gy * self.x_hat).sum(axis=axis)
    if xp is numpy:
        scale = (self.r * gamma / self.std)[expander]
        gx = scale * (gy - (self.x_hat * gsigma_batch[expander] +
                            gbeta[expander]) / m)
    else:
        inv_m = numpy.float32(1) / m
        gx = cuda.elementwise(
            'T gy, T x_hat, T gamma, T std, T gsigma_batch, T gbeta, \
            T inv_m, T r',
            'T gx',
            'gx = (r * gamma / std) * (gy - (x_hat * gsigma_batch + gbeta) * \
            inv_m)',
            'bn_bwd')(gy, self.x_hat, gamma[expander], self.std[expander],
                      gsigma_batch[expander], gbeta[expander], inv_m,
                      self.r[expander])
    return gx, ggamma, gbeta
def check_backward_consistency_regression(self, x_data, gy_data):
    # Regression test to two-dimensional unpooling layer.
    ndim = len(self.dims)
    if ndim != 2:
        return

    ksize = self.ksize
    stride = self.stride
    pad = self.pad
    xp = backend.get_array_module(x_data)

    # Backward computation for N-dimensional unpooling layer.
    x_nd = chainer.Variable(xp.array(x_data))
    y_nd = functions.unpooling_nd(
        x_nd, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
    y_nd.grad = gy_data
    y_nd.backward()

    # Backward computation for two-dimensional unpooling layer.
    x_2d = chainer.Variable(xp.array(x_data))
    y_2d = functions.unpooling_2d(
        x_2d, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
    y_2d.grad = gy_data
    y_2d.backward()

    # Test that the two result gradients are close enough.
    opt = self.check_backward_options
    testing.assert_allclose(
        x_nd.grad, x_2d.grad, atol=opt['atol'], rtol=opt['rtol'])
def check_LARS(self):
    w0 = self.target[0].param.data
    g0 = self.target[0].param.grad
    w1 = self.target[1].param.data
    g1 = self.target[1].param.grad
    xp = backend.get_array_module(w0)
    threshold = 1e-2
    weight_decay = 0.2
    eps = 1e-9

    p0_norm = xp.linalg.norm(w0)
    g0_norm = xp.linalg.norm(g0)
    clip_rate = p0_norm / (eps + g0_norm + weight_decay * p0_norm)
    expect0 = w0 - clip_rate * (g0 + weight_decay * w0)
    expect1 = w1 - 1.0 * (g1 + weight_decay * w1)

    opt = optimizers.SGD(lr=1)
    opt.setup(self.target)
    opt.add_hook(optimizer_hooks.GradientLARS(threshold=threshold,
                                              weight_decay=weight_decay,
                                              eps=eps))
    opt.update()

    testing.assert_allclose(expect0, w0)
    testing.assert_allclose(expect1, w1)
def backward(self, indexes, grad_outputs):
    x0, x1, y = self.get_retained_inputs()
    gy, = grad_outputs
    xp = backend.get_array_module(gy.data)

    # Recompute intermediate variables as in forward.
    diff = x0 - x1
    dist_sq = chainer.functions.sum(diff ** 2, axis=1)
    dist = chainer.functions.sqrt(dist_sq)
    mdist = self.margin - dist

    y = y.data
    x_dim = x0.shape[1]
    y = xp.repeat(y[:, None], x_dim, axis=1)
    if self.reduce == 'mean':
        alpha = gy / y.shape[0]
    else:
        alpha = gy[:, None]
    alpha = chainer.functions.broadcast_to(alpha, y.shape)
    dist = chainer.functions.repeat(dist[:, None], x_dim, axis=1)
    # avoid division by zero, 1e-7 is sufficiently small value that can be
    # represented even in half precision
    dist = chainer.functions.maximum(
        dist, xp.full(dist.shape, 1e-7, dtype=dist.dtype))
    # similar pair
    gx0 = alpha * y.astype(alpha.dtype) * diff
    # dissimilar pair
    d = chainer.functions.repeat(mdist[:, None], x_dim, axis=1)
    mdist = chainer.functions.maximum(
        d, xp.zeros(shape=d.shape, dtype=d.dtype))
    gx0 += alpha * (1 - y) * mdist * -(diff / dist)
    gx0 = chainer.functions.cast(gx0, x0.dtype)

    return gx0, -gx0, None
def forward(self, inputs):
    self.retain_inputs((0, 1))
    xp = backend.get_array_module(*inputs)
    x, t = inputs
    self.ignore_mask = (t != self.ignore_label)

    # stable computation of the cross entropy.
    loss = -(
        self.ignore_mask *
        (x * (t - (x >= 0)) - xp.log1p(xp.exp(-xp.abs(x)))))

    if not self.reduce == 'mean':
        return utils.force_array(loss.astype(x.dtype)),

    if self.normalize:
        count = xp.maximum(1, self.ignore_mask.sum())
    else:
        count = max(1, len(x))
    self.count = count

    # TODO(takagi): Fix to perform division in a specific dtype. See
    # cupy/cupy#1534.
    return utils.force_array(
        xp.divide(xp.sum(loss), self.count), dtype=x.dtype),
def __call__(self, trainer):
    if self.available():
        # Dynamically import pyplot to call matplotlib.use()
        # after importing chainer.training.extensions
        import matplotlib.pyplot as plt
    else:
        return

    xp = backend.get_array_module(self._vars[0].data)
    stats = xp.zeros(self._data_shape, dtype=xp.float32)
    for i, k in enumerate(self._keys):
        xs = []
        for var in self._vars:
            x = getattr(var, k, None)
            if x is not None:
                xs.append(x.ravel())
        if len(xs) > 0:
            stat_dict = self._statistician(
                xp.concatenate(xs, axis=0), axis=0, xp=xp)
            stat_list = []
            if self._plot_mean:
                stat_list.append(xp.atleast_1d(stat_dict['mean']))
            if self._plot_std:
                stat_list.append(xp.atleast_1d(stat_dict['std']))
            if self._plot_percentile:
                stat_list.append(xp.atleast_1d(stat_dict['percentile']))
            stats[i] = xp.concatenate(stat_list, axis=0)

    if xp != numpy:
        stats = cuda.to_cpu(stats)
    self._samples.add(stats, idx=trainer.updater.iteration)

    if self._trigger(trainer):
        file_path = os.path.join(trainer.out, self._file_name)
        self.save_plot_using_module(file_path, plt)
def _sample_directions(self):
    # Samples a direction vector (list of arrays with the same shapes as
    # input arrays and parameters)
    x_data = self.x_data
    params = self.params
    no_grads = self.no_grads

    xp = backend.get_array_module(*x_data)
    direction_xs_shapes = [
        x.shape
        for x, no_grad in six.moves.zip(x_data, no_grads)
        if not no_grad]
    direction_param_shapes = [p.shape for p in params]
    direction_shapes = direction_xs_shapes + direction_param_shapes
    directions = [
        xp.random.normal(size=shape) for shape in direction_shapes]

    # The direction vector is normalized in order to keep the scale of
    # differentiation error invariant with respect to the number of input
    # dimensions. Ideally, the scale of the curvature with respect to each
    # input dimension should be taken into account, but we ignore the
    # differences and assume that the curvature is uniform with respect to
    # all the input dimensions.
    norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
    if norm != 0:
        # norm could be zero if input arrays are 0-sized.
        scale = 1. / norm
        directions = [d * scale for d in directions]

    return directions
def forward(self, inputs):
    x, = inputs
    xp = backend.get_array_module(x)

    B, C, H, W = x.shape

    u_1d = xp.linspace(0, W - 1, num=self.out_W)
    v_1d = xp.linspace(0, H - 1, num=self.out_H)
    grid = xp.meshgrid(u_1d, v_1d)
    # u, v are of shape (out_H * out_W,)
    u = grid[0].ravel()
    v = grid[1].ravel()

    # indices of the 2x2 pixel neighborhood surrounding the coordinates
    u0 = xp.floor(u).astype(numpy.int32)
    u0 = u0.clip(0, W - 2)
    u1 = u0 + 1
    v0 = xp.floor(v).astype(numpy.int32)
    v0 = v0.clip(0, H - 2)
    v1 = v0 + 1

    # weights
    w1 = (u1 - u) * (v1 - v)
    w2 = (u - u0) * (v1 - v)
    w3 = (u1 - u) * (v - v0)
    w4 = (u - u0) * (v - v0)
    w1 = w1.astype(x.dtype, copy=False)
    w2 = w2.astype(x.dtype, copy=False)
    w3 = w3.astype(x.dtype, copy=False)
    w4 = w4.astype(x.dtype, copy=False)

    y = (w1[None, None, :] * x[:, :, v0, u0] +
         w2[None, None, :] * x[:, :, v0, u1] +
         w3[None, None, :] * x[:, :, v1, u0] +
         w4[None, None, :] * x[:, :, v1, u1])
    y = y.reshape(B, C, self.out_H, self.out_W)
    return y,
def forward(self, inputs):
    n_args = len(inputs)
    # TODO(kataoka): Do not retain inputs if n_args == 1
    self.retain_inputs(tuple(range(n_args)))

    xp = backend.get_array_module(inputs[0])
    dtype = xp.result_type(*[x.dtype for x in inputs])

    out_set = set(self.out_sub)

    # '@' is a single char, ',' is excluded.
    io_set = out_set.intersection(set(self.in_subs))

    if len(io_set) == len(self.out_sub):
        y = _einsum(xp, dtype, self.in_subs, self.out_sub, *inputs)
    else:
        direct_sub = []
        inverse_sub = []
        expander = []
        for c in sorted(out_set):
            if c in io_set:
                direct_sub.append(c)
                expander.append(slice(None))
            else:
                expander.append(None)
            inverse_sub.append(c)

        y = xp.zeros(self.out_shape, dtype)
        diag_y = _einsum(xp, dtype, self.out_sub, ''.join(inverse_sub), y)
        if diag_y.base is not y:
            raise ValueError('Update CuPy to close CuPy Issue #1199')
        # Make the view writeable as numpy PR #5410 for numpy<1.10.
        if xp is not cuda.cupy:  # no setflags in cupy
            diag_y.setflags(write=True)
        diag_y[...] = _einsum(
            xp, dtype, self.in_subs, ''.join(direct_sub),
            *inputs)[tuple(expander)]
    return y,
def forward(self, inputs):
    self.retain_inputs((0,))
    xp = backend.get_array_module(*inputs)
    x, gy = inputs
    self._gy_shape = gy.shape
    gW = xp.zeros(self.w_shape, dtype=gy.dtype)

    if xp is numpy:
        # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
        # too slow.
        for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
            if ix == self.ignore_label:
                continue
            gW[ix] += igy
    else:
        utils.nondeterministic('atomicAdd')
        if self.ignore_label is None:
            cuda.elementwise(
                'T gy, S x, S n_out', 'raw T gW',
                'ptrdiff_t w_ind[] = {x, i % n_out};'
                'atomicAdd(&gW[w_ind], gy)',
                'embed_id_bwd')(
                    gy, xp.expand_dims(x, -1), gW.shape[1], gW)
        else:
            cuda.elementwise(
                'T gy, S x, S n_out, S ignore', 'raw T gW',
                '''
                if (x != ignore) {
                    ptrdiff_t w_ind[] = {x, i % n_out};
                    atomicAdd(&gW[w_ind], gy);
                }
                ''',
                'embed_id_bwd_ignore_label')(
                    gy, xp.expand_dims(x, -1), gW.shape[1],
                    self.ignore_label, gW)
    return gW,
def average_pooling_2d(x, ksize, stride=None, pad=0):
    """Spatial average pooling function.

    This function acts similarly to
    :func:`~chainer.functions.convolution_2d`, but it computes the average
    of the input spatial patch for each channel without any parameter
    instead of computing the inner products.

    Args:
        x (~chainer.Variable): Input variable.
        ksize (int or pair of ints): Size of pooling window. ``ksize=k`` and
            ``ksize=(k, k)`` are equivalent.
        stride (int or pair of ints or None): Stride of pooling applications.
            ``stride=s`` and ``stride=(s, s)`` are equivalent. If ``None`` is
            specified, then it uses the same stride as the pooling window
            size.
        pad (int or pair of ints): Spatial padding width for the input array.
            ``pad=p`` and ``pad=(p, p)`` are equivalent.

    Returns:
        ~chainer.Variable: Output variable.

    .. note::

        This function currently does not support ``cover_all`` mode as
        :func:`max_pooling_2d`. Average pooling runs in non-cover-all mode.

    .. note::

        The values in the padded region are treated as 0, which biases the
        averages towards zero. To obtain unbiased averages, use
        :func:`average_pooling_nd` with ``pad_value=None``.
    """
    if backend.get_array_module(x) is chainerx:
        return average_pooling_nd.average_pooling_nd(x, ksize, stride, pad)
    return AveragePooling2D(ksize, stride, pad, False).apply((x,))[0]
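# Usage sketch: a 1x1x4x4 input averaged with a 2x2 window and stride 2
# yields a 1x1x2x2 output; the numbers are arbitrary.
import numpy
import chainer.functions as F

x = numpy.arange(16, dtype=numpy.float32).reshape(1, 1, 4, 4)
y = F.average_pooling_2d(x, ksize=2, stride=2)
# y.shape == (1, 1, 2, 2); y[0, 0, 0, 0] == (0 + 1 + 4 + 5) / 4 == 2.5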
def __call__(self, rule, param):
    p, g = param.data, param.grad
    if p is None or g is None:
        return

    xp = backend.get_array_module(p)

    # weight norm
    p_norm = xp.linalg.norm(p)
    # grad norm
    g_norm = xp.linalg.norm(g)
    local_rate = p_norm / (self.eps + g_norm + self.weight_decay * p_norm)
    rate = xp.where(p_norm > self.threshold, local_rate, 1.0)
    with cuda.get_device_from_array(p) as dev:
        if int(dev) == -1:
            g += self.weight_decay * p
            g *= rate
        else:
            kernel = cuda.elementwise(
                'T p, T rate, T weight_decay',
                'T g',
                'g += weight_decay * p; g *= rate;',
                'lars')
            kernel(p, rate, self.weight_decay, g)
def forward(self, inputs):
    self.retain_inputs((0, 1, 2, 4))
    x, gamma, mean, var, gy = inputs
    expander = self.expander
    xp = backend.get_array_module(x)

    if self.inv_std is None or self.inv_var is None:
        self.inv_var = xp.reciprocal(var + self.eps)
        self.inv_std = xp.sqrt(self.inv_var, dtype=self.inv_var.dtype)

    self.gamma_over_std = gamma * self.inv_std
    x_hat = _x_hat(x, mean[expander], self.inv_std[expander])

    gx = self.gamma_over_std[expander] * gy
    gbeta = gy.sum(axis=self.axis, dtype=gamma.dtype)
    ggamma = (x_hat * gy).sum(axis=self.axis)
    gmean = -self.gamma_over_std * gbeta
    gvar = - 0.5 * self.inv_var * (
        gamma * ggamma).astype(var.dtype, copy=False)

    gx = gx.astype(dtype=x.dtype)

    self.retain_outputs((0, 1, 2, 3, 4))
    return gx, ggamma, gbeta, gmean, gvar
def forward(self, inputs):
    self.retain_inputs((1,))
    x, t = inputs
    self._in_shape = x.shape
    self._in_dtype = x.dtype
    if chainer.is_debug():
        if not ((0 <= t).all() and (t < x.shape[1]).all()):
            msg = 'Each label `t` needs to satisfy `0 <= t < x.shape[1]`'
            raise ValueError(msg)
    xp = backend.get_array_module(x)
    if xp is numpy:
        # This code is equivalent to `t.choose(x.T)`, but `numpy.choose`
        # does not work when `x.shape[1] > 32`.
        return x[six.moves.range(t.size), t],
    else:
        y = cuda.elementwise(
            'S t, raw T x',
            'T y',
            'int ind[] = {i, t}; y = x[ind];',
            'getitem_fwd'
        )(t, x)
        return y,
def forward(self, inputs):
    xp = backend.get_array_module(*inputs)
    y, t = inputs

    if self.ignore_label is not None:
        mask = (t == self.ignore_label)
        ignore_cnt = mask.sum()

        # will always be true when the true label is ignore_label
        # TODO(henry0312)
        #   If cupy.where returns indexes, we could make the code better.
        #   Also, we would need Advanced Indexing.
        pred = xp.where(mask, self.ignore_label,
                        y.argmax(axis=1).reshape(t.shape))
        count = (pred == t).sum() - ignore_cnt
        total = t.size - ignore_cnt

        if total == 0:
            return xp.asarray(0.0, dtype=y.dtype),
        else:
            return xp.asarray(float(count) / total, dtype=y.dtype),
    else:
        pred = y.argmax(axis=1).reshape(t.shape)
        return xp.asarray((pred == t).mean(dtype=y.dtype)),
def lstm_grad_grad(c_prev, a, i, f, o, c, gc, gh, ggc_prev, gga, ggi, ggf,
                   ggo, gc_prev, ga, gi, gf, go, gc_next, ggc, ggh):
    xp = backend.get_array_module(a)
    sig_o = _sigmoid(o, xp)
    gsig_o = _grad_sigmoid(sig_o)
    ggsig_o = _grad_grad_sigmoid(sig_o)
    sig_i = _sigmoid(i, xp)
    gsig_i = _grad_sigmoid(sig_i)
    ggsig_i = _grad_grad_sigmoid(sig_i)
    sig_f = _sigmoid(f, xp)
    gsig_f = _grad_sigmoid(sig_f)
    ggsig_f = _grad_grad_sigmoid(sig_f)
    tanh_a = xp.tanh(a)
    gtanh_a = _grad_tanh(tanh_a)
    ggtanh_a = _grad_grad_tanh(tanh_a, gtanh_a)
    tanh_c = xp.tanh(c)
    gtanh_c = _grad_tanh(tanh_c)
    ggtanh_c = _grad_grad_tanh(tanh_c, gtanh_c)

    gc_bar = gh * sig_o * gtanh_c + gc

    gc_prev[:] = ggf * gc_bar * gsig_f
    ga[:] = (gga * sig_i * ggtanh_a + ggi * gtanh_a * gsig_i) * gc_bar
    gi[:] = (gga * gtanh_a * gsig_i + ggi * tanh_a * ggsig_i) * gc_bar
    gf[:] = (ggc_prev * (gh * sig_o * gtanh_c + gc) * gsig_f +
             ggf * gc_bar * c_prev * ggsig_f)

    ggc[:] = (ggc_prev * sig_f +
              gga * sig_i * gtanh_a +
              ggi * tanh_a * gsig_i +
              ggf * c_prev * gsig_f)

    dgc_do = gh * gsig_o * gtanh_c
    go[:] = ggc * dgc_do + ggo * gh * tanh_c * ggsig_o
    dgc_dc = gh * sig_o * ggtanh_c
    gc_next[:] = ggc * dgc_dc + ggo * gh * gtanh_c * gsig_o
    ggh[:] = ggc * sig_o * gtanh_c + ggo * tanh_c * gsig_o
    return gc_prev, ga, gi, gf, go, gc_next, ggc, ggh
def backward(self, inputs, grads):
    gpu = backend.get_array_module(*inputs) is not numpy

    # TODO(unno): We can remove redundant gpu-cpu copy using
    # theano.sandbox.cuda.basic_ops.gpu_from_host
    args = [cuda.to_cpu(x) for x in inputs + grads]

    outputs = self.backward_func(*args)
    assert len(outputs) == len(inputs)

    if gpu:
        # TODO(unno): We can remove redundant gpu-cpu copy using
        # theano.sandbox.cuda.CudaNdarray.gpudata
        device = cuda.get_device_from_array(inputs)
        outputs = [cuda.to_gpu(x, device) for x in outputs]

    results = []
    for o, i in zip(outputs, inputs):
        if i.dtype.kind != 'f':
            o = None
        elif o.dtype != i.dtype:
            o = o.astype(i.dtype)
        results.append(o)
    return tuple(results)
def variable_str(var):
    """Return the string representation of a variable.

    Args:
        var (~chainer.Variable): Input Variable.
    .. seealso:: numpy.array_str
    """
    xp = backend.get_array_module(var)
    if xp is numpy:
        arr = var.data
    else:
        arr = var.data.get()

    if var.name:
        prefix = 'variable ' + var.name
    else:
        prefix = 'variable'

    if arr is None:
        lst = 'None'
    else:
        lst = numpy.array2string(arr, None, None, None, ' ', prefix + '(')

    return '%s(%s)' % (prefix, lst)
def _sample_directions(self):
    # Samples a direction vector (list of arrays with the same shapes as
    # input arrays and parameters)
    x_data = self.x_data
    params = self.params
    no_grads = self.no_grads

    xp = backend.get_array_module(*x_data)
    direction_xs_shapes = [
        x.shape
        for x, no_grad in six.moves.zip(x_data, no_grads)
        if not no_grad]
    direction_param_shapes = [p.shape for p in params]
    direction_shapes = direction_xs_shapes + direction_param_shapes
    if self.is_chainerx:
        directions = [
            xp.random.normal(size=shape, device=x_data[0].device)
            for shape in direction_shapes]
    else:
        directions = [
            xp.random.normal(size=shape) for shape in direction_shapes]

    # The direction vector is normalized in order to keep the scale of
    # differentiation error invariant with respect to the number of input
    # dimensions. Ideally, the scale of the curvature with respect to each
    # input dimension should be taken into account, but we ignore the
    # differences and assume that the curvature is uniform with respect to
    # all the input dimensions.
    norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
    if norm != 0:
        # norm could be zero if input arrays are 0-sized.
        scale = 1. / norm
        directions = [d * scale for d in directions]

    return directions
def warp_perspective(image, mat):
    """Warp images with a perspective transformation.

    Args:
        image (:class:`~chainer.Variable` or :ref:`ndarray`):
            A 4-D array of shape `(B, C, H, W)`.
        mat (:class:`~chainer.Variable` or :ref:`ndarray`):
            Perspective transformation matrices.
            A 3-D array of shape `(B, 3, 3)`.

    Returns:
        ~chainer.Variable:
            Warped image. A 4-D array of shape `(B, C, H, W)`.
    """
    xp = backend.get_array_module(image)
    B, _, H, W = image.shape
    ps1 = pixel_coords(xp, H, W, mat.dtype).reshape(1, 2, -1)
    num = affine(mat[:, :2, :2], mat[:, :2, 2], ps1)
    denom = affine(mat[:, 2, :2].reshape(-1, 1, 2),
                   mat[:, 2, 2].reshape(-1, 1), ps1)
    ps0 = num / denom
    return warp_dense(image, ps0.reshape(1, 2, H, W))
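# Usage sketch (illustrative only): warping with identity matrices should
# return (approximately) the input image. This assumes the helpers referenced
# above (`pixel_coords`, `affine`, `warp_dense`) are available from the same
# module; the array sizes are arbitrary.
import numpy

B, C, H, W = 1, 3, 8, 8
image = numpy.random.rand(B, C, H, W).astype(numpy.float32)
mat = numpy.tile(numpy.eye(3, dtype=numpy.float32), (B, 1, 1))  # (B, 3, 3)
warped = warp_perspective(image, mat)   # 4-D output of shape (B, C, H, W)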
def _is_c_order(row, col):
    """Check if a coo matrix with given row and col is c_order"""
    if row.shape != col.shape:
        raise ValueError('shape of row and col must be the same.')
    if row.ndim != 1:
        for i in range(row.shape[0]):
            if not _is_c_order(row[i], col[i]):
                return False
        return True
    xp = backend.get_array_module(row)
    _row = row[col >= 0]
    _col = col[row >= 0]
    if _row[_row < 0].size > 0 or _col[_col < 0].size > 0:
        raise ValueError('invalid index combination of row and col.')
    if _row.shape[0] <= 1:
        return True
    row_diff = xp.zeros(_row.shape, dtype=_row.dtype)
    row_diff[1:] = _row[1:] - _row[:-1]
    if xp.amin(row_diff) < 0:
        return False
    col_diff = xp.zeros(_col.shape, dtype=_col.dtype)
    col_diff[1:] = _col[1:] - _col[:-1]
    col_diff[row_diff > 0] = 0
    return xp.amin(col_diff) >= 0
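# Small illustration of the check above: indices sorted first by row and then
# by column are C order, while a decreasing row index is not.
import numpy

row_a = numpy.array([0, 0, 1], dtype=numpy.int32)
col_a = numpy.array([0, 1, 0], dtype=numpy.int32)
row_b = numpy.array([0, 1, 0], dtype=numpy.int32)
col_b = numpy.array([0, 0, 1], dtype=numpy.int32)
# _is_c_order(row_a, col_a) -> True
# _is_c_order(row_b, col_b) -> False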
def check_backward(self, x_data, W_data, b_data, y_grad,
                   use_cudnn='never'):
    xp = backend.get_array_module(x_data)
    if not self.c_contiguous:
        x_data = xp.asfortranarray(x_data)
        W_data = xp.asfortranarray(W_data)
        y_grad = xp.asfortranarray(y_grad)
        self.assertTrue(x_data.flags.f_contiguous)
        self.assertTrue(W_data.flags.f_contiguous)
        self.assertTrue(y_grad.flags.f_contiguous)
        if b_data is not None:
            b = xp.empty((len(b_data) * 2,), dtype=b_data.dtype)
            b[::2] = b_data
            b_data = b[::2]
            self.assertFalse(b_data.flags.c_contiguous)

    args = (x_data, W_data)
    if b_data is not None:
        args += (b_data,)

    def f(*args):
        return F.convolution_nd(
            *args, stride=self.stride, pad=self.pad,
            cover_all=self.cover_all, dilate=self.dilate,
            groups=self.groups)

    with chainer.using_config('use_cudnn', use_cudnn):
        with chainer.using_config('autotune', self.autotune):
            gradient_check.check_backward(
                f, args, y_grad, **self.check_backward_options)
def _crs_matmul(sp_data, sp_row, sp_col, sp_shape, dn,
                transa, transb, transc, dtype=None):
    if dtype is None:
        dtype = numpy.result_type(sp_data.dtype, dn.dtype)

    A_data = sp_data
    if transa:
        A_row = sp_col
        A_col = sp_row
        A_shape = (sp_shape[1], sp_shape[0])
        # TODO(denjiry): write something instead of order
    else:
        A_row = sp_row
        A_col = sp_col
        A_shape = sp_shape
    if transb:
        B = dn.swapaxes(-1, -2)
    else:
        B = dn

    xp = backend.get_array_module(A_data, B)
    if xp is numpy:
        C = _crs_matmul_cpu(A_data, A_row, A_col, A_shape, B, dtype)
    else:
        C = _crs_matmul_gpu(A_data, A_row, A_col, A_shape, B, dtype)

    if transc:
        C = C.swapaxes(-1, -2)
    return C
def position_encode(embed, sentences):
    """Position encoding.

    It is defined as:

    .. math::

       m = \\sum_j l_j A x_j,

    where :math:`A` is an embed matrix, :math:`x_j` is the :math:`j`-th word
    ID and

    .. math::

       l_{kj} = (1 - j / J) - (k / d)(1 - 2j / J).

    :math:`J` is the length of a sentence and :math:`d` is the dimension of
    the embedding.
    """
    xp = backend.get_array_module(sentences)
    e = embed(sentences)
    n_words, n_units = e.shape[-2:]

    # To avoid 0/0, we use max(length, 1) here.
    # Note that when the length is zero, its embedding is always zero and
    # is ignored.
    length = xp.maximum(
        xp.sum((sentences != 0).astype(xp.float32), axis=-1), 1)
    length = length.reshape((length.shape + (1, 1)))
    k = xp.arange(1, n_units + 1, dtype=numpy.float32) / n_units
    i = xp.arange(1, n_words + 1, dtype=numpy.float32)[:, None]
    coeff = (1 - i / length) - k * (1 - 2.0 * i / length)
    e = coeff * e
    s = F.sum(e, axis=-2)
    return s
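# A standalone NumPy sketch of the coefficient l_{kj} defined above, for a
# single sentence of J words and embeddings of dimension d (sizes are
# arbitrary).
import numpy

J, d = 5, 4
j = numpy.arange(1, J + 1, dtype=numpy.float32)[:, None]   # word positions
k = numpy.arange(1, d + 1, dtype=numpy.float32) / d        # scaled dimensions
l = (1 - j / J) - k * (1 - 2.0 * j / J)                     # shape (J, d)
# position_encode multiplies these coefficients elementwise with the word
# embeddings and sums over the word axis.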
def backward(self, indexes, grad_outputs):
    F = chainer.functions
    expander = self.expander

    x, gamma, gy = self.get_retained_inputs()
    gx1, ggamma1 = self.get_retained_outputs()
    ggx1, gggamma1, ggbeta1 = grad_outputs
    xp = backend.get_array_module(x)

    # auxiliary values
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    r = 0 if ggx1 is None else F.sum(gx1 * ggx1, axis=self.axis)
    coeff = gamma * self.inv_std
    coeff_m = coeff * inv_m
    x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])

    # handle None in output gradients
    ggx1 = _zero_if_none(xp, ggx1, x.shape, x.dtype)
    gggamma1 = _zero_if_none(xp, gggamma1, gamma.shape, gamma.dtype)
    ggbeta1 = _zero_if_none(xp, ggbeta1, gamma.shape, gamma.dtype)

    gggamma2 = gggamma1 - coeff_m * F.sum(x_hat * ggx1, axis=self.axis)
    ggbeta2 = ggbeta1 - coeff_m * F.sum(ggx1, axis=self.axis)

    ggamma2 = r / gamma

    gx_hat2 = (gggamma2[expander] * gy -
               (coeff_m * ggamma1)[expander] * ggx1)
    gstd2 = -self.inv_std * (r + F.sum(x_hat * gx_hat2, axis=self.axis))
    gmean2 = -self.inv_std * F.sum(gx_hat2, axis=self.axis)
    gx2 = self.inv_std[expander] * gx_hat2 + inv_m * (
        gmean2[expander] + x_hat * gstd2[expander])
    ggy2 = (gggamma2[expander] * x_hat + ggbeta2[expander] +
            coeff[expander] * ggx1)

    return gx2, ggamma2, ggy2
def forward(self, inputs):
    x, = inputs
    self._in_shape = x.shape
    xp = backend.get_array_module(x)
    return xp.cumsum(x, axis=self.axis),
def forward(self, inputs):
    xp = backend.get_array_module(*inputs)
    x, gamma, beta = inputs

    # Note: we must be in train mode.
    assert configuration.config.train

    if not self.update_statistics:
        self._running_mean = xp.array(self._running_mean)
        self._running_var = xp.array(self._running_var)

    head_ndim = gamma.ndim + 1
    expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)

    # NOTE(tommi): cuDNN is not used since it does not support
    # batch renormalization
    axis = (0,) + tuple(range(head_ndim, x.ndim))
    mean = x.mean(axis=axis)
    var = x.var(axis=axis) + self.eps
    self.std = xp.sqrt(var, dtype=var.dtype)

    running_sigma = xp.sqrt(self._running_var + self.eps,
                            dtype=self._running_mean.dtype)
    self.r = xp.clip(self.std / running_sigma,
                     1.0 / self.rmax, self.rmax)
    d = xp.clip(
        (mean - self._running_mean) / running_sigma,
        -self.dmax, self.dmax)

    # Update running statistics:
    m = x.size // gamma[expander].size
    self._running_mean *= self.decay
    adjust = m / max(m - 1., 1.)  # unbiased estimation
    temp_ar = xp.array(mean)
    temp_ar *= (1 - self.decay)
    self._running_mean += temp_ar
    del temp_ar
    self._running_var *= self.decay
    temp_ar = xp.array(var)
    temp_ar *= (1 - self.decay) * adjust
    self._running_var += temp_ar
    del temp_ar

    gamma = gamma[expander]
    beta = beta[expander]

    if xp is numpy:
        self.x_hat = _xhat(x, mean, self.std, expander)
        self.x_hat_renorm = self.x_hat * self.r[expander] + d[expander]
        y = gamma * self.x_hat_renorm
        y += beta
    else:
        self.x_hat, self.x_hat_renorm, y = cuda.elementwise(
            'T x, T mean, T std, T gamma, T beta, T r, T d',
            'T x_hat, T x_hat_renorm, T y',
            '''
            x_hat = (x - mean) / std;
            x_hat_renorm = x_hat * r + d;
            y = gamma * x_hat_renorm + beta;
            ''', 'bn_fwd')(x, mean[expander], self.std[expander], gamma,
                           beta, self.r[expander], d[expander])

    return y,
def connectionist_temporal_classification(
        x, t, blank_symbol, input_length=None, label_length=None,
        reduce='mean'):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification (CTC) [Graves2006]_ is a loss
    function of sequence labeling where the alignment between the inputs
    and target is unknown. See also [Graves2012]_.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the samplewise
    loss values. If it is ``'mean'``, it takes the mean of loss values.

    Args:
        x (list or tuple of :class:`~chainer.Variable`):
            A list of unnormalized probabilities for labels.
            Each element of ``x``, ``x[i]`` is a :class:`~chainer.Variable`
            object, which has shape ``(B, V)``, where ``B`` is the batch size
            and ``V`` is the number of labels.
            The softmax of ``x[i]`` represents the probabilities of the
            labels at time ``i``.
        t (:class:`~chainer.Variable` or :ref:`ndarray`):
            A matrix including expected label sequences.
            Its shape is ``(B, M)``, where ``B`` is the batch size and ``M``
            is the maximum length of the label sequences.
            All elements in ``t`` must be less than ``V``, the number of
            labels.
        blank_symbol (int): Index of blank_symbol.
            This value must be non-negative.
        input_length (:class:`~chainer.Variable` or :ref:`ndarray`):
            Length of sequence for each of mini batch ``x`` (optional).
            Its shape must be ``(B,)``.
            If the ``input_length`` is omitted or ``None``, it assumes that
            all of ``x`` is valid input.
        label_length (:class:`~chainer.Variable` or :ref:`ndarray`):
            Length of sequence for each of mini batch ``t`` (optional).
            Its shape must be ``(B,)``.
            If the ``label_length`` is omitted or ``None``, it assumes that
            all of ``t`` is valid input.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable holding a scalar value of the CTC loss.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is `(B,)` where `B` is the number of samples.
            If it is ``'mean'``, it holds a scalar.

    .. note::
       You need to input ``x`` without applying activation functions
       (e.g. softmax function), because this function applies softmax
       functions to ``x`` before calculating the CTC loss to avoid numerical
       limitations. You also need to apply the softmax function to forwarded
       values before you decode them.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez,\
    Faustino Gomez, Jurgen Schmidhuber,\
    `Connectionist Temporal Classification: Labelling Unsegmented\
    Sequence Data with Recurrent Neural Networks\
    <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves,\
    `Supervised Sequence Labelling with Recurrent Neural Networks\
    <https://www.cs.toronto.edu/~graves/preprint.pdf>`_
    """
    if not isinstance(x, collections_abc.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert 0 <= blank_symbol < x[0].shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert x[0].ndim == 2

    xp = backend.get_array_module(x[0])
    if input_length is None:
        input_length = xp.full(len(x[0]), len(x), dtype=numpy.int32)
    if label_length is None:
        label_length = xp.full(len(t), t.shape[1], dtype=numpy.int32)

    return ConnectionistTemporalClassification(blank_symbol, reduce)(
        input_length, label_length, t, chainer.functions.stack(x))
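# A minimal usage sketch; the shapes follow the docstring above and the
# numbers are made up for illustration.
import numpy
import chainer
import chainer.functions as F

B, T, V = 2, 5, 4          # batch size, sequence length, number of labels
blank = 0                  # label 0 is used as the blank symbol
xs = [chainer.Variable(numpy.random.randn(B, V).astype(numpy.float32))
      for _ in range(T)]   # unnormalized scores, one Variable per time step
t = numpy.array([[1, 2], [3, 1]], dtype=numpy.int32)  # targets, shape (B, M)
loss = F.connectionist_temporal_classification(xs, t, blank)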
def _matmul(a, b):
    xp = backend.get_array_module(a)
    if not hasattr(xp, 'matmul'):
        # NumPy 1.9 does not support matmul. We use einsum instead.
        return xp.einsum('ijl,ilk->ijk', a, b)
    return xp.matmul(a, b)
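# Quick check of the fallback above: for stacked matrices, the einsum
# subscripts 'ijl,ilk->ijk' compute the same batched product as matmul.
import numpy

a = numpy.random.rand(2, 3, 4)
b = numpy.random.rand(2, 4, 5)
assert numpy.allclose(numpy.einsum('ijl,ilk->ijk', a, b), numpy.matmul(a, b))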
def forward(self, inputs):
    x, = inputs
    xp = backend.get_array_module(x)
    y = xp.zeros(self.mask.shape, x.dtype)
    y[self.mask] = x
    return y,
def forward(self, inputs):
    # may broadcast
    xp = backend.get_array_module(*inputs)
    x, y = inputs
    condition = self.condition
    return xp.where(condition, x, y),
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    activation, use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for Stack RNN/BiRNN functions.

    This function is used at :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    This function's behavior depends on the following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices have
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequences have different lengths, sort sequences in
            descending order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
        ``hy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is same as ``hx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding to an
          input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
          mini-batch size for time ``t``, and ``N`` is size of hidden units.
          Note that ``B_t`` is the mini-batch size of ``xs[t]``.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    if kwargs:
        argument.check_unexpected_kwargs(
            kwargs, train='train argument is not supported anymore. '
            'Use chainer.using_config',
            use_cudnn='use_cudnn argument is not supported anymore. '
            'Use chainer.using_config')
        argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError(
            'Invalid activation: "%s". Please select from [%s]'
            % (activation, candidate))

    xp = backend.get_array_module(hx)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = cuda.get_cudnn_dropout_states()
        states.set_dropout_ratio(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)

        rnn_mode = 'rnn_%s' % activation
        w = cudnn_rnn_weight_concat(
            n_layers, states, use_bi_direction, rnn_mode, ws, bs)

        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh
            elif activation == 'relu':
                rnn = NStepBiRNNReLU
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh
            elif activation == 'relu':
                rnn = NStepRNNReLU

        hy, ys = rnn(n_layers, states, lengths)(hx, w, xs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, ys

    else:
        def f(x, h, c, w, b):
            xw, hw = w
            xb, hb = b
            rnn_in = linear.linear(x, xw, xb) + linear.linear(h, hw, hb)
            if activation == 'tanh':
                return tanh.tanh(rnn_in), None
            elif activation == 'relu':
                return relu.relu(rnn_in), None

        hy, _, ys = n_step_rnn_impl(
            f, n_layers, dropout_ratio, hx, None, ws, bs, xs,
            use_bi_direction)
        return hy, ys
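# A minimal sketch of the high-level interface. Instead of building ws/bs by
# hand, this uses the NStepRNNTanh link, which calls n_step_rnn internally;
# the layer count and sizes are arbitrary.
import numpy
import chainer
import chainer.links as L

n_layers, in_size, out_size = 1, 3, 4
rnn = L.NStepRNNTanh(n_layers, in_size, out_size, dropout=0.0)
xs = [numpy.random.randn(t, in_size).astype(numpy.float32) for t in (5, 3)]
hy, ys = rnn(None, [chainer.Variable(x) for x in xs])
# hy: (n_layers, batch, out_size); ys: list of (T_i, out_size) Variables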
def forward(self, inputs):
    # output a dummy array.
    xp = backend.get_array_module(inputs[0])
    return xp.empty((0,), dtype=numpy.float32),
def _ones_like(arr):
    device = cuda.get_device_from_array(arr)
    xp = backend.get_array_module(arr)
    with device:
        return xp.ones_like(arr)
def forward(self, xs):
    xp = backend.get_array_module(*xs)
    return xp.hstack(xs),