def copydata(self, var):
    """Copy the data array of ``var`` into this variable.

    The copy works across array modules: NumPy to NumPy, CuPy to CuPy,
    CuPy to NumPy (via ``get()``) and NumPy to CuPy (via ``set()``).

    If exactly one of the two data arrays is ``None`` it is initialized
    first: ``self`` is initialized with the shape of the source data, or
    ``var`` is initialized with ``self.shape``. The copy from ``var`` to
    ``self`` is then performed. If both arrays are ``None`` nothing is done.

    Args:
        var (Variable): Source variable.

    """
    source = var.data
    target = self.data

    if source is None and target is None:
        return
    if source is None:
        var.initialize(self.shape)
        source = var.data
    elif target is None:
        self.initialize(source.shape)
        target = self.data

    source_module = cuda.get_array_module(source)
    target_module = cuda.get_array_module(target)

    if target_module is source_module:
        # Same array module on both sides: a plain copyto suffices.
        target_module.copyto(target, source)
        return
    if target_module is numpy:
        # The source is not a NumPy array: fetch it with ``get()`` first.
        numpy.copyto(target, source.get())
        return
    # The destination is not a NumPy array: upload with ``set()``.
    target.set(source)
def _preprocess_const(x, value):
    """Validate a constant operand against ``x`` and coerce its type.

    Args:
        x: Array that defines the expected array module, shape and dtype.
        value: Scalar or array to be combined with ``x``.

    Returns:
        The result of ``utils.force_type(x.dtype, value)``.

    Raises:
        TypeError: If ``value`` is not a scalar and its array module
            differs from that of ``x``.
        ValueError: If broadcasting ``x`` with ``value`` does not yield
            the shape of ``x``.

    """
    array_module = cuda.get_array_module(x)
    if not numpy.isscalar(value):
        if cuda.get_array_module(value) != array_module:
            # TODO(unno): We can transfer arrays automatically
            raise TypeError('Cannot mix cupy.ndarray and numpy.ndarray')

    broadcast_shape = array_module.broadcast(x, value).shape
    if broadcast_shape != x.shape:
        raise ValueError('Failed to broadcast arrays')
    return utils.force_type(x.dtype, value)
def check_forward(self, log_pi_data, tau):
    """Check dtype, shape and array module of ``gumbel_softmax`` output.

    Args:
        log_pi_data: Input array wrapped into a ``chainer.Variable``.
        tau: Value forwarded as the ``tau`` keyword argument.

    """
    log_pi = chainer.Variable(log_pi_data)
    y = functions.gumbel_softmax(log_pi, tau=tau)

    # Only checks dtype and shape because its result contains noise
    self.assertEqual(y.dtype, numpy.float32)
    self.assertEqual(y.shape, log_pi.shape)

    output_module = cuda.get_array_module(y)
    input_module = cuda.get_array_module(log_pi)
    self.assertEqual(output_module, input_module)
def forward(self, xs):
    """Apply ``_log_softmax`` to the first input along ``self.axis``.

    The array module, shape and dtype of the input are stored on the
    instance, and output 0 is retained.

    Args:
        xs: Tuple of input arrays; only ``xs[0]`` is transformed.

    Returns:
        tuple: One-element tuple holding the result.

    """
    x = xs[0]
    y = _log_softmax(x, axis=self.axis)

    self._x_xp = cuda.get_array_module(*xs)
    self._x_shape = x.shape
    self._x_dtype = x.dtype

    self.retain_outputs((0,))
    return y,
def variable_repr(var):
    """Return the string representation of a variable.

    The result has the form ``variable(<array>)``, or
    ``variable <name>(<array>)`` when the variable has a name.

    Args:
        var (~chainer.Variable): Input Variable.

    Returns:
        str: The formatted representation.

    .. seealso:: numpy.array_repr

    """
    if cuda.get_array_module(var) is numpy:
        host_array = var.data
    else:
        # Non-NumPy data is fetched with ``get()`` before formatting.
        host_array = var.data.get()

    prefix = ('variable ' + var.name) if var.name else 'variable'

    if host_array is None:
        body = 'None'
    elif host_array.size > 0 or host_array.shape == (0,):
        body = numpy.array2string(
            host_array, None, None, None, ', ', prefix + '(')
    else:
        # show zero-length shape unless it is (0,)
        body = '[], shape=%s' % (repr(host_array.shape),)

    return '%s(%s)' % (prefix, body)
def forward(self, inputs):
    """Compute the gradients of the bilinear function.

    ``inputs`` is ``(e1, e2, W, gy)`` or ``(e1, e2, W, V1, V2, b, gy)``;
    the upstream gradient is always the last element.

    The V1/V2 contributions are accumulated into ``ge1``/``ge2`` *before*
    those arrays are reshaped for the return value. Reshaping first and
    accumulating in place afterwards is only correct when ``reshape``
    returns a view; if it returns a copy the returned gradients would
    silently miss the linear terms.

    Args:
        inputs: Tuple of arrays as described above.

    Returns:
        tuple: ``(ge1, ge2, gW)``, followed by ``(gV1, gV2, gb)`` when
        six inputs are given. ``ge1``/``ge2`` have the shapes of
        ``inputs[0]``/``inputs[1]``.

    """
    self.retain_inputs(tuple(range(len(inputs))))

    e1 = _as_mat(inputs[0])
    e2 = _as_mat(inputs[1])
    W, gy = inputs[2], inputs[-1]

    xp = cuda.get_array_module(*inputs)
    # optimize: gW = xp.einsum('ij,ik,il->jkl', e1, e2, gy)
    gW = xp.einsum('ij,ik->jki', e1, e2).dot(gy)

    gy_W = xp.tensordot(gy, W, axes=(1, 2))  # 'il,jkl->ijk'
    # optimize: ge1 = xp.einsum('ik,jkl,il->ij', e2, W, gy)
    ge1 = xp.einsum('ik,ijk->ij', e2, gy_W)
    # optimize: ge2 = xp.einsum('ij,jkl,il->ik', e1, W, gy)
    ge2 = xp.einsum('ij,ijk->ik', e1, gy_W)

    linear_grads = ()
    if len(inputs) == 6:
        V1, V2 = inputs[3], inputs[4]
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        linear_grads = gV1, gV2, gb

    # Reshape only after every contribution has been accumulated.
    ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
    return ret + linear_grads
def check_double_backward(
        self, x_data, W_data, b_data, y_grad,
        x_grad_grad, W_grad_grad, b_grad_grad):
    """Run ``gradient_check.check_double_backward`` on dropconnect.

    A random boolean mask is drawn once and shared by every evaluation
    of the function under test. The bias and its second-order gradient
    are included only when ``b_data`` is not ``None``.

    """
    if b_data is not None:
        args = x_data, W_data, b_data
        grads = x_grad_grad, W_grad_grad, b_grad_grad
    else:
        args = x_data, W_data
        grads = x_grad_grad, W_grad_grad

    mask_shape = W_data.shape
    if self.use_batchwise_mask:
        # One mask per sample: prepend the batch dimension.
        mask_shape = (x_data.shape[0],) + mask_shape

    xp = cuda.get_array_module(x_data)
    mask = xp.random.rand(*mask_shape) >= self.ratio

    def f(x, W, b=None):
        return functions.simplified_dropconnect(
            x, W, b, self.ratio, self.train, mask,
            self.use_batchwise_mask)

    gradient_check.check_double_backward(
        f, args, y_grad, grads, eps=1e-2,
        **self.check_double_backward_options)
def check_backward(self, x_data, W_data, b_data, y_grad,
                   use_cudnn='never'):
    """Run ``gradient_check.check_backward`` on ``F.convolution_nd``.

    When ``self.c_contiguous`` is false the inputs are converted to
    Fortran order and the bias is replaced by a strided view so that it
    is not C-contiguous.

    Args:
        x_data: Input array.
        W_data: Filter array.
        b_data: Bias array or ``None``.
        y_grad: Upstream gradient array.
        use_cudnn (str): Value for the ``use_cudnn`` configuration.

    """
    xp = cuda.get_array_module(x_data)

    if not self.c_contiguous:
        x_data, W_data, y_grad = [
            xp.asfortranarray(array)
            for array in (x_data, W_data, y_grad)]
        for array in (x_data, W_data, y_grad):
            self.assertTrue(array.flags.f_contiguous)
        if b_data is not None:
            # Take every other element of a twice-as-long buffer.
            padded = xp.empty((len(b_data) * 2,), dtype=b_data.dtype)
            padded[::2] = b_data
            b_data = padded[::2]
            self.assertFalse(b_data.flags.c_contiguous)

    if b_data is None:
        args = (x_data, W_data)
    else:
        args = (x_data, W_data, b_data)

    def f(*args):
        return F.convolution_nd(
            *args, stride=self.stride, pad=self.pad,
            cover_all=self.cover_all)

    with chainer.using_config('use_cudnn', use_cudnn):
        with chainer.using_config('autotune', self.autotune):
            gradient_check.check_backward(
                f, args, y_grad, **self.check_backward_options)
def __init__(self, initializer=None, shape=None, name=None):
    """Set up the parameter from an initializer and an optional shape.

    Args:
        initializer: ``None`` (replaced by ``constant.NaN()``), a scalar
            (replaced by ``constant.Constant``), an array, or an
            initializer object.
        shape: Shape of the parameter. If ``None`` and ``initializer``
            is an array, that array becomes the data; if ``None``
            otherwise, the parameter is left uninitialized.
        name (str): Name forwarded to the base class.

    """
    if initializer is None:
        initializer = constant.NaN()
    elif numpy.isscalar(initializer):
        initializer = constant.Constant(initializer)

    array_given = isinstance(initializer, (numpy.ndarray, cuda.ndarray))

    if shape is None and array_given:
        # parameter initialized by the initial array
        super(Parameter, self).__init__(initializer, name=name)
    elif shape is None:
        # uninitialized parameter
        super(Parameter, self).__init__(name=name)
        dtype = getattr(initializer, 'dtype', None)
        self._grad_initializer = constant.NaN(dtype)
    else:
        # parameter initialized with a given shape
        if array_given:
            xp = cuda.get_array_module(initializer)
            initializer = constant.Constant(initializer)
        else:
            xp = numpy
        data = initializers.generate_array(initializer, shape, xp)
        # The gradient starts out filled with NaN.
        grad = xp.full_like(data, numpy.nan)
        super(Parameter, self).__init__(data, name=name, grad=grad)

    self.update_rule = None
    self.initializer = initializer
def forward(self, x):
    """Apply dropout to ``x[0]``.

    The iDeep path is delegated to ``self._forward_ideep`` when
    available. If ``self.mask`` is already set it is reused; otherwise a
    new scaled mask is drawn and stored in ``self.mask``.

    Args:
        x: Tuple whose first element is the input array.

    Returns:
        tuple: One-element tuple holding the masked input.

    """
    if intel64.should_use_ideep('>=auto') and intel64.inputs_all_ready(x):
        return self._forward_ideep(x)

    data = x[0]
    if self.mask is not None:
        return data * self.mask,

    scale = data.dtype.type(1. / (1 - self.dropout_ratio))
    xp = cuda.get_array_module(*x)

    if xp == numpy:
        keep = xp.random.rand(*data.shape) >= self.dropout_ratio
        self.mask = scale * keep
        return data * self.mask,

    # Non-NumPy path: mask and output come from one elementwise kernel.
    rand = xp.random.rand(*data.shape, dtype=numpy.float32)
    kernel = cuda.elementwise(
        'T x, R r, T scale, T ratio', 'T mask, T y',
        '''
        mask = (r >= ratio) * scale;
        y = x * mask;
        ''',
        'dropout_fwd',
    )
    self.mask, y = kernel(data, rand, scale, self.dropout_ratio)
    return y,
def init_state(self, param):
    """Allocate zero-filled state arrays shaped like ``param.data``.

    ``'m'`` and ``'v'`` are always created; ``'vhat'`` is created only
    when ``self.hyperparam.amsgrad`` is true. Allocation happens inside
    the device context obtained from the parameter data.

    Args:
        param: Parameter whose ``data`` array serves as the template.

    """
    data = param.data
    xp = cuda.get_array_module(data)
    with cuda.get_device_from_array(data):
        for key in ('m', 'v'):
            self.state[key] = xp.zeros_like(data)
        if self.hyperparam.amsgrad:
            self.state['vhat'] = xp.zeros_like(data)
def backward(self, indexes, grad_outputs):
    """Return gradients for the requested inputs of the triplet loss.

    Args:
        indexes: Indices (0: anchor, 1: positive, 2: negative) of the
            inputs whose gradients are required.
        grad_outputs: One-element tuple holding the upstream gradient.

    Returns:
        list: Gradients in ascending order of the requested indices.

    """
    anchor, positive, negative = self.get_retained_inputs()
    batch_size = anchor.shape[0]
    feature_dim = anchor.shape[1]

    xp = cuda.get_array_module(anchor)
    hinge = xp.repeat(self.dist_hinge[:, None], feature_dim, axis=1)
    # 1.0 where the hinge term is positive, 0.0 elsewhere.
    mask = xp.array(hinge > 0, dtype=numpy.float32)

    gy, = grad_outputs
    g = gy / batch_size if self.reduce == 'mean' else gy[:, None]
    coeff = 2 * chainer.functions.broadcast_to(g, mask.shape) * mask

    grads = []
    if 0 in indexes:
        grads.append(coeff * (negative - positive))
    if 1 in indexes:
        grads.append(coeff * (positive - anchor))
    if 2 in indexes:
        grads.append(coeff * (anchor - negative))
    return grads
def forward(self, inputs):
    """Apply a masked, rescaled weight matrix to the input.

    If ``self.mask`` is ``None`` a boolean mask is sampled (per sample
    when ``self.use_batchwise_mask`` is set) and stored; if it is a
    ``Variable`` it is replaced by its data array.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)``.

    Returns:
        tuple: One-element tuple holding the output array.

    """
    self.retain_inputs((0, 1))
    x_raw, weight = inputs[0], inputs[1]
    scale = weight.dtype.type(1. / (1 - self.ratio))
    xp = cuda.get_array_module(*inputs)

    if self.mask is None:
        mask_shape = (weight.shape[0], weight.shape[1])
        if self.use_batchwise_mask:
            mask_shape = (x_raw.shape[0],) + mask_shape
        if xp == numpy:
            uniform = xp.random.rand(*mask_shape)
        else:
            uniform = xp.random.rand(*mask_shape, dtype=numpy.float32)
        self.mask = uniform >= self.ratio
    elif isinstance(self.mask, variable.Variable):
        self.mask = self.mask.data

    x = _as_mat(x_raw)
    W = weight * scale * self.mask

    # (i)jk,ik->ij
    y = _matmul(W, x[:, :, None], xp)
    y = y.reshape(y.shape[0], y.shape[1]).astype(x.dtype, copy=False)

    if len(inputs) == 3:
        y += inputs[2]
    return y,
def forward(self, inputs):
    """Compute the batch-normalization backward pass.

    Uses ``libcudnn.batchNormalizationBackward`` when ``self.use_cudnn``
    is set, and an array-level implementation otherwise. All three
    inputs and outputs 0 and 1 are retained.

    Args:
        inputs: Tuple ``(x, gamma, gy)``.

    Returns:
        tuple: ``(gx, ggamma, gbeta)``.

    """
    self.retain_inputs((0, 1, 2))
    x, gamma, gy = inputs
    expander = self.expander
    # Reciprocal of the number of elements of x per element of gamma.
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    xp = cuda.get_array_module(x)

    if self.use_cudnn:
        cudnn_mode = self.mode.get_cudnn_mode()
        x = cuda.cupy.ascontiguousarray(x)
        gamma = cuda.cupy.ascontiguousarray(gamma)
        gy = cuda.cupy.ascontiguousarray(gy)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x))
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc)
        # gamma is converted when the derived descriptor's dtype is not
        # the dtype of x.
        # NOTE(review): dtypes are compared by identity here -- confirm
        # that is intended.
        if dtype_param is not dtype:
            gamma = gamma.astype(dtype_param)
        # Scaling constants are passed as host scalars: double when x is
        # double, float otherwise.
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)
        ggamma = cuda.cupy.empty_like(gamma)
        gbeta = cuda.cupy.empty_like(gamma)
        libcudnn.batchNormalizationBackward(
            handle, cudnn_mode, one.data, zero.data,
            one.data, zero.data, x_desc.value, x.data.ptr,
            x_desc.value, gy.data.ptr, x_desc.value, gx.data.ptr,
            derivedBnDesc.value, gamma.data.ptr,
            ggamma.data.ptr, gbeta.data.ptr,
            self.eps, self.mean.data.ptr, self.inv_std.data.ptr)

        # Convert the parameter gradients back to the dtype of x.
        if dtype_param is not dtype:
            ggamma = ggamma.astype(dtype)
            gbeta = gbeta.astype(dtype)
    else:
        gbeta = gy.sum(axis=self.axis)
        x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])
        ggamma = (gy * x_hat).sum(axis=self.axis)
        if xp is numpy:
            gx = (gamma * self.inv_std)[expander] * (
                gy - (x_hat * ggamma[expander] + gbeta[expander]) * inv_m)
        else:
            # Same expression as the NumPy branch, as a single kernel.
            gx = cuda.elementwise(
                '''
                T gy, T x_hat, T gamma, T inv_std, T ggamma, T gbeta,
                T inv_m
                ''',
                'T gx',
                '''
                gx = (gamma * inv_std) * (
                    gy - (x_hat * ggamma + gbeta) * inv_m)
                ''', 'bn_bwd')(gy, x_hat, gamma[expander],
                               self.inv_std[expander], ggamma[expander],
                               gbeta[expander], inv_m)
    self.retain_outputs((0, 1))
    return gx, ggamma, gbeta
def backward(self, inputs, grad_outputs):
    """Compute the gradients of the batch-normalization backward pass.

    Args:
        inputs: Tuple ``(x, gamma, gy)``.
        grad_outputs: Tuple ``(ggx1, gggamma1, ggbeta1)``; any element
            may be ``None``.

    Returns:
        tuple: ``(gx2, ggamma2, ggy2)``.

    """
    expander = self.expander

    x, gamma, gy = inputs
    gx1, ggamma1, _ = self.output_data
    ggx1, gggamma1, ggbeta1 = grad_outputs
    xp = cuda.get_array_module(x)

    # auxiliary values
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    # r must be computed before ggx1 is replaced by zeros below, because
    # it tests ggx1 against None.
    r = 0 if ggx1 is None else (gx1 * ggx1).sum(axis=self.axis)
    coeff = gamma * self.inv_std
    coeff_m = coeff * inv_m
    x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])

    # handle None in output gradients
    ggx1 = _zero_if_none(xp, ggx1, x.shape, x.dtype)
    gggamma1 = _zero_if_none(xp, gggamma1, gamma.shape, gamma.dtype)
    ggbeta1 = _zero_if_none(xp, ggbeta1, gamma.shape, gamma.dtype)

    gggamma2 = gggamma1 - coeff_m * (x_hat * ggx1).sum(axis=self.axis)
    ggbeta2 = ggbeta1 - coeff_m * ggx1.sum(axis=self.axis)

    ggamma2 = r / gamma

    gx_hat2 = (gggamma2[expander] * gy -
               (coeff_m * ggamma1)[expander] * ggx1)
    gstd2 = -self.inv_std * (r + (x_hat * gx_hat2).sum(axis=self.axis))
    gmean2 = -self.inv_std * gx_hat2.sum(axis=self.axis)
    gx2 = self.inv_std[expander] * gx_hat2 + inv_m * (
        gmean2[expander] + x_hat * gstd2[expander])
    ggy2 = (gggamma2[expander] * x_hat + ggbeta2[expander]
            + coeff[expander] * ggx1)

    return gx2, ggamma2, ggy2
def __call__(self, rule, param):
    """Clip the gradient of ``param`` in place to the configured bounds.

    Nothing is done when the parameter has no gradient.

    Args:
        rule: Unused by this method.
        param: Parameter whose ``grad`` array is clipped to
            ``[self.lower_bound, self.upper_bound]``.

    """
    gradient = param.grad
    if gradient is None:
        return

    array_module = cuda.get_array_module(gradient)
    with cuda.get_device_from_array(gradient):
        array_module.clip(
            gradient, self.lower_bound, self.upper_bound, out=gradient)
def xp(self):
    """Array module for the distribution.

    The module is determined by passing every value of ``self.params``
    to ``cuda.get_array_module``, so it is :mod:`numpy` or :mod:`cupy`
    depending on where the parameters of this distribution reside.

    """
    parameter_arrays = self.params.values()
    return cuda.get_array_module(*parameter_arrays)
def __call__(self, trainer):
    """Collect statistics of the watched variables and maybe plot them.

    Returns immediately when matplotlib is not available. Otherwise one
    row of statistics per key in ``self._keys`` is computed, stored in
    ``self._samples`` under the current iteration, and a plot is saved
    when ``self._trigger`` fires.

    Args:
        trainer: Trainer providing ``updater.iteration`` and ``out``.

    """
    if not _available:
        return
    # Dynamically import pyplot to call matplotlib.use()
    # after importing chainer.training.extensions
    import matplotlib.pyplot as plt

    xp = cuda.get_array_module(self._vars[0].data)
    stats = xp.zeros(self._data_shape, dtype=xp.float32)

    for row, key in enumerate(self._keys):
        candidates = (getattr(var, key, None) for var in self._vars)
        flattened = [
            array.ravel() for array in candidates if array is not None]
        if not flattened:
            continue

        summary = self._statistician(
            xp.concatenate(flattened, axis=0), axis=0, xp=xp)
        selected = (
            (self._plot_mean, 'mean'),
            (self._plot_std, 'std'),
            (self._plot_percentile, 'percentile'),
        )
        pieces = [
            xp.atleast_1d(summary[name])
            for enabled, name in selected if enabled]
        stats[row] = xp.concatenate(pieces, axis=0)

    if xp != numpy:
        stats = cuda.to_cpu(stats)
    self._samples.add(stats, idx=trainer.updater.iteration)

    if self._trigger(trainer):
        file_path = os.path.join(trainer.out, self._file_name)
        self.save_plot_using_module(file_path, plt)
def check_backward_consistency_regression(self, x_data, gy_data,
                                          use_cudnn='always'):
    """Compare ``AveragePoolingND`` gradients with ``AveragePooling2D``.

    The check is skipped unless ``self.dims`` has exactly two entries.

    Args:
        x_data: Input array.
        gy_data: Upstream gradient assigned to each output.
        use_cudnn (str): Value for the ``use_cudnn`` configuration.

    """
    # Regression test to two-dimensional average pooling layer.
    if len(self.dims) != 2:
        return

    ksize = self.ksize
    stride = self.stride
    pad = self.pad
    xp = cuda.get_array_module(x_data)

    def input_gradient(make_function):
        # Build the function and run forward/backward under the
        # requested cuDNN configuration on a fresh copy of the input.
        x = chainer.Variable(xp.array(x_data))
        with chainer.using_config('use_cudnn', use_cudnn):
            y = make_function().apply((x,))[0]
            y.grad = gy_data
            y.backward()
        return x.grad

    # Backward computation for N-dimensional average pooling layer.
    grad_nd = input_gradient(
        lambda: functions.AveragePoolingND(
            self.ndim, ksize, stride=stride, pad=pad))

    # Backward computation for two-dimensional average pooling layer.
    grad_2d = input_gradient(
        lambda: functions.AveragePooling2D(
            ksize, stride=stride, pad=pad, cover_all=False))

    # Test that the two result gradients are close enough.
    testing.assert_allclose(grad_nd, grad_2d)
def backward(self, indexes, grad_outputs):
    """Return the gradients of the contrastive loss.

    Args:
        indexes: Unused; gradients for both feature inputs are always
            computed.
        grad_outputs: One-element tuple holding the upstream gradient.

    Returns:
        tuple: ``(gx0, -gx0, None)``; the label input gets no gradient.

    """
    F = chainer.functions

    x0, x1, label_var = self.get_retained_inputs()
    gy, = grad_outputs
    xp = cuda.get_array_module(gy.data)
    x_dim = x0.shape[1]

    # Recompute intermediate variables as in forward.
    diff = x0 - x1
    dist = F.sqrt(F.sum(diff ** 2, axis=1))
    margin_gap = self.margin - dist

    # Tile the labels so they match the feature dimension.
    labels = xp.repeat(label_var.data[:, None], x_dim, axis=1)

    if self.reduce == 'mean':
        alpha = gy / labels.shape[0]
    else:
        alpha = gy[:, None]
    alpha = F.broadcast_to(alpha, labels.shape)

    tiled_dist = F.repeat(dist[:, None], x_dim, axis=1)
    # avoid division by zero
    tiled_dist = F.maximum(
        tiled_dist,
        xp.full(tiled_dist.shape, 1e-8, dtype=tiled_dist.dtype))

    # similar pair
    gx0 = alpha * labels.astype(alpha.dtype) * diff

    # dissimilar pair
    tiled_gap = F.repeat(margin_gap[:, None], x_dim, axis=1)
    hinge = F.maximum(
        tiled_gap, xp.zeros(shape=tiled_gap.shape, dtype=tiled_gap.dtype))
    gx0 += alpha * (1 - labels) * hinge * -(diff / tiled_dist)
    gx0 = F.cast(gx0, xp.float32)

    return gx0, -gx0, None
def forward(self, inputs):
    """Compute the sigmoid cross entropy of ``x`` against ``t``.

    Entries whose target equals ``self.ignore_label`` are masked out;
    the mask is stored in ``self.ignore_mask``. When ``self.reduce`` is
    ``'mean'`` the summed loss is divided by ``self.count``, which is
    the number of unmasked entries if ``self.normalize`` is set and
    ``len(x)`` otherwise (at least 1 in both cases).

    Args:
        inputs: Tuple ``(x, t)``.

    Returns:
        tuple: One-element tuple with the elementwise or averaged loss.

    """
    self.retain_inputs((0, 1))
    xp = cuda.get_array_module(*inputs)
    x, t = inputs

    valid = (t != self.ignore_label)
    self.ignore_mask = valid

    # stable computation of the cross entropy.
    linear_term = x * (t - (x >= 0))
    log_term = xp.log1p(xp.exp(-xp.abs(x)))
    loss = -(valid * (linear_term - log_term))

    if self.reduce != 'mean':
        return utils.force_array(loss.astype(x.dtype)),

    if self.normalize:
        self.count = xp.maximum(1, valid.sum())
    else:
        self.count = max(1, len(x))

    # TODO(takagi): Fix to perform division in a specific dtype. See
    # cupy/cupy#1534.
    mean_loss = xp.divide(xp.sum(loss), self.count)
    return utils.force_array(mean_loss, dtype=x.dtype),
def _forward_grouped_convolution(self, x, gy):
    """Compute the filter gradient group by group.

    Args:
        x: Input array of shape ``(N, iC, iH, iW)``.
        gy: Output-gradient array of shape ``(N, oC, oH, oW)``.

    Returns:
        Filter gradient reshaped to ``(oC, iC / G, kH, kW)``.

    """
    # G: group count
    # N: batch size
    # kH, kW: kernel height, kernel width
    # iC, iH, iW: input channels, input height, input width
    # oC, oH, oW: output channels, output height, output width
    G = self.group
    N, iC, iH, iW = x.shape
    _, oC, oH, oW = gy.shape
    iCg = int(iC / G)
    oCg = int(oC / G)

    xp = cuda.get_array_module(x)
    on_cpu = xp is numpy

    # Move the group axis to the front.
    grouped_x = xp.rollaxis(
        x.reshape(N, G, iCg, iH, iW), 1)  # (G, N, iCg, iH, iW)
    grouped_gy = xp.rollaxis(
        gy.reshape(N, G, oCg, oH, oW), 1)  # (G, N, oCg, oH, oW)
    if on_cpu:
        # Work-around for NumPy's bug?
        grouped_gy = xp.ascontiguousarray(grouped_gy)

    core = self._forward_cpu_core if on_cpu else self._forward_gpu_core
    per_group = [
        core(grouped_x[g, ], grouped_gy[g, ])
        for g in six.moves.range(G)]

    gW = xp.stack(per_group)  # (G, oCg, iCg, kH, kW)
    return gW.reshape(oC, iCg, self.kh, self.kw)
def _forward_grouped_convolution(self, x, W, b):
    """Run the per-group core and concatenate the results on axis 1.

    Args:
        x: Input array of shape ``(N, xC, xH, xW)``.
        W: Filter array whose last three axes are ``(yCg, kH, kW)``.
        b: Bias array or ``None``.

    Returns:
        tuple: One-element tuple holding the concatenated output.

    """
    # G: group count
    # N: batch size
    # kH, kW: kernel height, kernel width
    # xC, xH, xW: x channels, x height, x width
    # yC, yH, yW: y channels, y height, y width
    G = self.group
    N, xC, xH, xW = x.shape
    xCg = int(xC / G)
    _, yCg, kH, kW = W.shape

    xp = cuda.get_array_module(x)
    if xp is numpy:
        core = self._forward_cpu_core
    else:
        core = self._forward_gpu_core

    grouped_x = xp.rollaxis(
        x.reshape(N, G, xCg, xH, xW), 1)  # (G, N, xCg, xH, xW)
    grouped_W = W.reshape(G, xCg, yCg, kH, kW)
    grouped_b = None if b is None else b.reshape(G, yCg)

    outputs = []
    for g in six.moves.range(G):
        bias = None if grouped_b is None else grouped_b[g, ]
        # The core returns a one-element tuple.
        group_y, = core(grouped_x[g, ], grouped_W[g, ], bias)
        outputs.append(group_y)

    y = xp.concatenate(outputs, axis=1)  # (N, yC, yH, yW)
    return y,
def check_forward(self, e1_data, e2_data, W_data, V1_data, V2_data, b_data):
    """Compare ``functions.bilinear`` with an einsum-based reference.

    The optional parameters ``V1``, ``V2`` and ``b`` must be given
    either all together or not at all.

    The reference value is computed once; the optional linear terms are
    added to it in place instead of recomputing the einsum.

    Args:
        e1_data, e2_data: Input arrays; flattened to two dimensions for
            the reference computation.
        W_data: Array used as ``W`` in ``'ij,ik,jkl->il'``.
        V1_data, V2_data, b_data: Optional linear-term arrays or
            ``None``.

    Raises:
        ValueError: If only some of the optional parameters are ``None``.

    """
    e1 = chainer.Variable(e1_data)
    e2 = chainer.Variable(e2_data)
    W = chainer.Variable(W_data)

    e1_data = e1_data.reshape(e1_data.shape[0], -1)
    e2_data = e2_data.reshape(e2_data.shape[0], -1)
    xp = cuda.get_array_module(e1)
    y_expect = xp.einsum('ij,ik,jkl->il', e1_data, e2_data, W_data)

    flags = V1_data is None, V2_data is None, b_data is None
    if any(flags):
        if not all(flags):
            raise ValueError(
                'Test either all or none of the optional parameters.')
        y = functions.bilinear(e1, e2, W)
    else:
        V1 = chainer.Variable(V1_data)
        V2 = chainer.Variable(V2_data)
        b = chainer.Variable(b_data)
        y = functions.bilinear(e1, e2, W, V1, V2, b)
        # Add the linear terms to the reference computed above.
        y_expect += e1_data.dot(V1_data)
        y_expect += e2_data.dot(V2_data)
        y_expect += b_data

    testing.assert_allclose(y_expect, cuda.to_cpu(y.data))
    assert y.data.dtype == e1_data.dtype
def forward(self, x):
    """Tile the input over the kernel window and fold it with col2im.

    ``self.outs`` is computed from the input's spatial dimensions with
    ``conv.get_deconv_outsize`` when it has not been set yet.

    Args:
        x: Tuple whose first element is the input array.

    Returns:
        tuple: One-element tuple holding the output array.

    """
    self.retain_inputs(())
    data = x[0]
    ndim = self.ndim

    if self.outs is None:
        spatial_dims = data.shape[2:]
        self.outs = tuple(
            conv.get_deconv_outsize(d, k, s, p, cover_all=self.cover_all)
            for (d, k, s, p) in six.moves.zip(
                spatial_dims, self.ksize, self.stride, self.pad))

    xp = cuda.get_array_module(*x)

    whole = slice(None)
    # (:, :, None, None, ..., None)
    tile_index = (whole, whole) + (None,) * ndim
    # (1, 1, k_1, k_2, ..., k_n, 1, 1, ..., 1)
    tile_reps = (1, 1) + self.ksize + (1,) * ndim
    col = xp.tile(data[tile_index], tile_reps)

    col2im_nd = (
        conv_nd.col2im_nd_cpu if xp is numpy else conv_nd.col2im_nd_gpu)
    return col2im_nd(col, self.stride, self.pad, self.outs),
def __call__(self, inputs, device=None):
    """Convert DALI arrays to Numpy/CuPy arrays.

    ``self.perturbation`` is moved to the GPU first unless its array
    module is already CuPy.

    Args:
        inputs: Sequence of DALI outputs providing ``as_tensor()``.
        device: Device ID or ``None``. CPU tensors are sent to the GPU
            when it is non-negative; GPU tensors are brought to the CPU
            when it is negative.

    Returns:
        tuple: The converted arrays, in input order.

    Raises:
        ValueError: If a tensor is neither ``TensorCPU`` nor
            ``TensorGPU``.

    """
    if cuda.get_array_module(self.perturbation) is not cuda.cupy:
        self.perturbation = cuda.to_gpu(self.perturbation, device)

    converted = []
    for dali_output in inputs:
        tensor = dali_output.as_tensor()
        if isinstance(tensor, dali.backend_impl.TensorCPU):
            x = np.array(tensor)
            if x.ndim == 2 and x.shape[1] == 1:
                x = x.squeeze(axis=1)
            if device is not None and device >= 0:
                x = cuda.to_gpu(x, device)
        elif isinstance(tensor, dali.backend_impl.TensorGPU):
            x = cuda.cupy.empty(shape=tensor.shape(), dtype=tensor.dtype())
            # Synchronization is necessary here to avoid data corruption
            # because DALI and CuPy will use different CUDA streams.
            cuda.cupy.cuda.runtime.deviceSynchronize()
            # copy data from DALI array to CuPy array
            tensor.copy_to_external(ctypes.c_void_p(x.data.ptr))
            cuda.cupy.cuda.runtime.deviceSynchronize()
            if self.perturbation is not None:
                x = x - self.perturbation
            if device is not None and device < 0:
                x = cuda.to_cpu(x)
        else:
            raise ValueError('Unexpected object')
        converted.append(x)

    return tuple(converted)
def forward(self, inputs):
    """Divide ``x`` by its L2 norm along ``self.axis`` plus ``eps``.

    Args:
        inputs: One-element tuple holding the input array ``x``.

    Returns:
        tuple: One-element tuple holding ``x / (||x||_2 + eps)``.

    """
    self.retain_inputs((0,))
    x, = inputs
    xp = cuda.get_array_module(x)

    squared_sum = xp.sum(xp.square(x), axis=self.axis, keepdims=True)
    # eps is cast to the dtype of x before being added.
    norm = xp.sqrt(squared_sum) + x.dtype.type(self.eps)
    return utils.force_array(x / norm),
def forward(self, inputs):
    """Compute the depthwise convolution of ``x`` with ``W``.

    The array module is derived from ``x`` itself. Unpacking ``x`` into
    ``get_array_module`` would pass one argument per sample and none at
    all for an empty batch, in which case the module would not depend
    on ``x``.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)``. ``W`` has shape
            ``(D, C, KY, KX)``.

    Returns:
        tuple: One-element tuple holding the output of shape
        ``(B, C * D, IY, IX)``.

    """
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]

    xp = cuda.get_array_module(x)
    if xp is numpy:
        self.col = conv.im2col_cpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    else:
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)

    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]  # (D, C, KY, KX)
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))

    # (C, B*IY*IX, KY*KX), (C, KY*KX, D)-> (C, B*IY*IX, D)
    y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)

    # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
    y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
        .reshape((B, C * D, IY, IX))

    if b is not None:
        y += b[None, :, None, None]
    return y,
def _forward_grouped_convolution(self, x, W, b):
    """Run the per-group core and merge the group outputs.

    Args:
        x: Input array of shape ``(N, iC, iH, iW)``.
        W: Filter array of shape ``(oC, _, kH, kW)``.
        b: Bias array or ``None``.

    Returns:
        Output array of shape ``(N, oC, oH, oW)``.

    """
    # G: group count
    # N: batch size
    # kH, kW: kernel height, kernel width
    # iC, iH, iW: input channels, input height, input width
    # oC, oH, oW: output channels, output height, output width
    G = self.group
    N, iC, iH, iW = x.shape
    oC, _, kH, kW = W.shape
    iCg = int(iC / G)
    oCg = int(oC / G)

    xp = cuda.get_array_module(x)
    core = (
        self._forward_cpu_core if xp is numpy else self._forward_gpu_core)

    grouped_x = xp.rollaxis(
        x.reshape(N, G, iCg, iH, iW), 1)  # (G, N, iCg, iH, iW)
    grouped_W = W.reshape(G, oCg, iCg, kH, kW)
    grouped_b = None if b is None else b.reshape(G, oCg)

    outputs = [
        core(grouped_x[g, ], grouped_W[g, ],
             None if grouped_b is None else grouped_b[g, ])
        for g in six.moves.range(G)]

    stacked = xp.stack(outputs, axis=1)  # (N, G, oCg, oH, oW)
    _, _, _, oH, oW = stacked.shape
    return stacked.reshape(N, oC, oH, oW)
def forward(self, inputs):
    """Compute the bilinear form ``'ij,ik,jkl->il'`` of ``e1, e2, W``.

    When six inputs are given, the linear terms ``e1.dot(V1)``,
    ``e2.dot(V2)`` and the bias ``b`` are added.

    Args:
        inputs: ``(e1, e2, W)`` or ``(e1, e2, W, V1, V2, b)``.

    Returns:
        tuple: One-element tuple holding the output array.

    """
    self.retain_inputs(tuple(range(len(inputs))))

    e1 = _as_mat(inputs[0])
    e2 = _as_mat(inputs[1])
    W = inputs[2]

    if cuda.get_array_module(*inputs) is numpy:
        # optimize: y = numpy.einsum('ij,ik,jkl->il', e1, e2, W)
        outer = numpy.einsum('ij,ik->ijk', e1, e2)
        y = numpy.tensordot(outer, W, axes=2)
    else:
        batch, left_dim = e1.shape
        right_dim = e2.shape[1]
        # 'ij,ik->ijk', then ijk->i[jk]
        outer = (e1[:, :, None] * e2[:, None, :]).reshape(
            batch, left_dim * right_dim)
        # jkl->[jk]l, then 'i[jk],[jk]l->il'
        y = outer.dot(W.reshape(-1, W.shape[2]))

    if len(inputs) == 6:
        V1, V2, b = inputs[3:]
        y += e1.dot(V1)
        y += e2.dot(V2)
        y += b
    return y,
def variance(self):
    """Return a NaN-filled variable shaped like ``loc``.

    A ``RuntimeWarning`` is issued on every call because the variance is
    undefined.

    """
    warnings.warn("Variance of the cauchy distribution is undefined.",
                  RuntimeWarning)
    array_module = cuda.get_array_module(self.loc)
    undefined = array_module.full_like(self.loc.data, array_module.nan)
    return chainer.as_variable(undefined)
def make_origin_like(x):
    """Return an array like ``x`` that is 1 at index 0 of the last axis.

    All other entries are zero.

    Args:
        x: Array providing the shape, dtype and array module.

    """
    result = cuda.get_array_module(x).zeros_like(x)
    result[..., 0] = 1
    return result
def backward(self, indexes, grad_outputs):
    """Compute second-order gradients, one unmasked sample at a time.

    For every index selected by ``self.ignore_mask`` a partial forward
    pass is recomputed and its contributions are scattered into the
    accumulators of the requested inputs.

    Args:
        indexes: Indices (0: ``x``, 1: ``W``, 2: ``gy``) of the inputs
            whose gradients are required.
        grad_outputs: Tuple ``(ggx, _, ggW)``; the middle element is
            unused.

    Returns:
        list: Gradients in ascending order of the requested indices.

    """
    x, W, gy = self.get_retained_inputs()

    xp = cuda.get_array_module(x.data)

    # Zero-initialized accumulators, created only for requested inputs.
    if 0 in indexes:
        gx = chainer.Variable(xp.zeros_like(x.data))
    if 1 in indexes:
        gW = chainer.Variable(xp.zeros_like(W.data))
    if 2 in indexes:
        ggy = chainer.Variable(xp.zeros_like(gy.data))

    ggx, _, ggW = grad_outputs

    # Sign vector: -1 for the first entry, +1 for the remaining
    # ``sample_size`` entries.
    pos_neg_mask = xp.ones(self.sample_size + 1)
    pos_neg_mask[0] *= -1

    # Iterate only over the indices where ignore_mask is true.
    for i in xp.arange(len(self.ignore_mask))[self.ignore_mask]:
        # Partial forward pass to obtain intermediate `Variable`s
        ix = x[i]
        k = self.samples[i]

        if self.reduce == 'sum':
            igy = gy
        else:
            igy = gy[i]

        w = W[k]
        f = chainer.functions.flatten(
            chainer.functions.matmul(w, ix[:, None])) * pos_neg_mask
        sigf = chainer.functions.sigmoid(f)
        g = chainer.functions.broadcast_to(igy, f.shape) * sigf \
            * pos_neg_mask

        dgW_dg = chainer.functions.flatten(
            chainer.functions.matmul(ggW[k], ix[:, None])) * pos_neg_mask
        dgW_df = chainer.functions.broadcast_to(igy, f.shape) \
            * _sigmoid_grad(f, sigf, dgW_dg) * pos_neg_mask
        dgx_dg = chainer.functions.flatten(
            chainer.functions.matmul(ggx[i][None, :], w, transb=True))
        dgx_df = chainer.functions.broadcast_to(igy, f.shape) \
            * _sigmoid_grad(f, sigf, dgx_dg)

        if 0 in indexes:
            # derivative of gx
            dgx = chainer.functions.matmul(w, dgx_df[:, None], transa=True)

            # derivative of gW
            dgx += chainer.functions.matmul(g[None, :], ggW[k]).T
            dgx += chainer.functions.matmul(
                w, dgW_df[:, None], transa=True)

            gx = chainer.functions.scatter_add(
                gx, i, chainer.functions.flatten(dgx))

        if 1 in indexes:
            # derivative of gx
            # NOTE(review): this loop accumulates into gW, not gx -- the
            # original labels may be swapped; confirm.
            shape = ggx[i].shape
            for ik, ig, idgx_df in six.moves.zip(k, g, dgx_df):
                ig = chainer.functions.broadcast_to(ig, shape)
                idgx_df = chainer.functions.broadcast_to(idgx_df, shape)
                gW = chainer.functions.scatter_add(
                    gW, ik, ig * ggx[i] + idgx_df * ix)

            # derivative of gW
            gW = chainer.functions.scatter_add(
                gW, k,
                chainer.functions.matmul(dgW_df[:, None], ix[None, :]))

        if 2 in indexes:
            dgx_dg *= pos_neg_mask
            dggy = chainer.functions.sum((dgx_dg + dgW_dg) * sigf)
            if self.reduce == 'sum':
                ggy += dggy
            else:
                ggy = chainer.functions.scatter_add(ggy, i, dggy)

    ret = []
    if 0 in indexes:
        ret.append(gx)
    if 1 in indexes:
        ret.append(gW)
    if 2 in indexes:
        ret.append(ggy)
    return ret
def forward(self, xs):
    """Stack the input arrays with ``hstack``.

    Args:
        xs: Tuple of input arrays.

    Returns:
        tuple: One-element tuple holding the stacked array.

    """
    array_module = cuda.get_array_module(*xs)
    stacked = array_module.hstack(xs)
    return stacked,
def inv_exponential_map(x, z):
    """Return ``C * (z - alpha * x)`` for the points ``x`` and ``z``.

    Here ``alpha = -lorentzian_product(x, z, keepdims=True)`` and
    ``C = arccosh(alpha) / sqrt(max(alpha ** 2 - 1, eps))``, where
    ``eps`` is the module-level constant that keeps the denominator
    away from zero.

    The Lorentzian product is evaluated once and reused for both the
    ``arccosh`` argument and the denominator.
    NOTE(review): this assumes ``lorentzian_product`` has no side
    effects -- confirm.

    Args:
        x: Base point array.
        z: Target point array.

    Returns:
        Array computed as described above.

    """
    xp = cuda.get_array_module(x, z)
    alpha = -lorentzian_product(x, z, keepdims=True)
    C = xp.arccosh(alpha) / xp.sqrt(xp.maximum(alpha ** 2 - 1, eps))
    return C * (z - alpha * x)
def forward(self, inputs):
    """Apply batch normalization and update the running statistics.

    One of three implementations is chosen: iDeep, cuDNN, or a generic
    array-level one. In every branch ``self.running_mean`` and
    ``self.running_var`` are updated, and ``self.mean`` and
    ``self.inv_std`` are stored on the instance.

    Args:
        inputs: Tuple ``(x, gamma, beta)``.

    Returns:
        tuple: One-element tuple holding the normalized output.

    """
    self.retain_inputs((0, 1))
    x, gamma, beta = inputs

    xp = cuda.get_array_module(x)
    # Lazily create the running statistics as zeros shaped like gamma.
    if self.running_mean is None:
        self.running_mean = xp.zeros_like(gamma)
        self.running_var = xp.zeros_like(gamma)

    self.axis = _compute_axis(x.ndim, gamma.ndim, self.axis)
    self.key_axis = _compute_key_axis(x.ndim, gamma.ndim, self.axis)

    # Warn when every aggregated axis has length 1.
    if all(x.shape[i] == 1 for i in self.axis):
        if 0 in self.axis:
            warnings.warn(
                'A batch with no more than one sample has been given'
                ' to F.batch_normalization. F.batch_normalization'
                ' will always output a zero tensor for such batches.'
                ' This could be caused by incorrect configuration in'
                ' your code (such as running evaluation while'
                ' chainer.config.train=True),'
                ' but could also happen in the last batch of training'
                ' if non-repeating iterator is used.',
                UserWarning)
        else:
            warnings.warn(
                'F.batch_normalization received a batch with single'
                ' dimensions along all axes that are used for aggregating'
                ' statistics. F.batch_normalization'
                ' will always output a zero tensor for such batches.',
                UserWarning)

    # TODO(niboshi): Refactor calculation of expander and axis into a
    # function and call it just before they are used.

    # expander inserts singleton dimensions to gamma and beta so that they
    # can be broadcasted with x.
    expander = [None for _ in range(x.ndim)]
    for i in self.key_axis:
        expander[i] = slice(None)
    expander = tuple(expander)
    self.expander = expander

    self.mode = _BNMode(x, gamma, self.key_axis)
    self.use_cudnn = self.mode.can_use_cudnn(xp)
    self.use_ideep = self.mode.can_use_ideep()

    if self.use_ideep:
        # TODO(niboshi): Refactor iDeep part into a separate method
        # 2-D input is temporarily expanded to 4-D and squeezed back
        # after the iDeep call.
        expand_dim = False
        if x.ndim == 2:
            expand_dim = True
            x = x[:, :, None, None]

        gamma = gamma[expander]
        beta = beta[expander]
        # gamma and beta are packed into a single (2, -1) array.
        W = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1))

        y, self.mean, self.var, self.inv_std = (
            intel64.ideep.batchNormalization.Forward(
                intel64.ideep.array(x),
                intel64.ideep.array(W),
                None,
                None,
                self.eps
            ))

        m = x.size // gamma.size
        adjust = m / max(m - 1., 1.)

        # Update running_mean
        if isinstance(self.running_mean, intel64.ideep.mdarray):
            self.running_mean.inplace_axpby(
                self.decay, (1 - self.decay), self.mean)
        else:
            self.running_mean *= self.decay
            self.running_mean += self.mean * (1 - self.decay)

        # Update running_var
        if isinstance(self.running_var, intel64.ideep.mdarray):
            self.running_var.inplace_axpby(
                self.decay, (1 - self.decay), self.var * adjust)
        else:
            self.running_var *= self.decay
            self.running_var += self.var * adjust * (1 - self.decay)

        if expand_dim:
            y = numpy.squeeze(y, axis=(2, 3))

    elif self.use_cudnn:
        # TODO(niboshi): Refactor cuDNN part into a separate method
        x = cuda.cupy.ascontiguousarray(x)

        gamma = cuda.cupy.ascontiguousarray(gamma)
        beta = cuda.cupy.ascontiguousarray(beta)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x, self.mode))
        cudnn_mode = self.mode.get_cudnn_mode()
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc)
        # Parameters and running statistics are converted when the
        # derived descriptor's dtype is not the dtype of x.
        # NOTE(review): dtypes are compared by identity here -- confirm
        # that is intended.
        if dtype_param is not dtype:
            gamma = gamma.astype(dtype_param)
            beta = beta.astype(dtype_param)
            running_mean = self.running_mean.astype(dtype_param)
            running_var = self.running_var.astype(dtype_param)
        else:
            running_mean = self.running_mean
            running_var = self.running_var

        # Scaling constants are passed as host scalars: double when x is
        # double, float otherwise.
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        y = cuda.cupy.empty_like(x)
        # Factor used in the moving average
        factor = 1 - self.decay

        if self.mean is None:
            # Output cache to speed up backward pass.
            self.mean = xp.empty_like(gamma)
            # Output cache to speed up backward pass.
            self.inv_std = xp.empty_like(gamma)
        # Note: cuDNN computes the mini-batch mean and variance
        # internally. We can simply (optionally) pass
        # it the running-average mean and variance arrays.
        # Note: This API seems to set the inverse of the standard deviation
        # (instead of variance) to resultSaveInvVariance argument. The
        # current implementation of our BN depends on this behavior so that
        # we can reduce the number of reduction kernels.
        libcudnn.batchNormalizationForwardTraining(
            handle, cudnn_mode, one.data, zero.data,
            x_desc.value, x.data.ptr, x_desc.value,
            y.data.ptr, derivedBnDesc.value, gamma.data.ptr,
            beta.data.ptr, factor, running_mean.data.ptr,
            running_var.data.ptr, self.eps,
            self.mean.data.ptr, self.inv_std.data.ptr)

        # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used,
        # there is a possibility of numerical overflow. You can use
        # queryRuntimeError() to make sure whether the overflow actually
        # occurred or not during the batch normalization.
        if (cudnn_mode is libcudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT and
                configuration.config.debug):
            query_mode = libcudnn.CUDNN_ERRQUERY_BLOCKING
            rstatus = libcudnn.queryRuntimeError(handle, query_mode)
            if rstatus is not libcudnn.CUDNN_STATUS_SUCCESS:
                # NOTE(review): the two literals below concatenate
                # without a space ("cuDNNbatch"); left unchanged because
                # the message is runtime text.
                warnings.warn(
                    'A numerical overflow might have happend in cuDNN'
                    'batch normalization (status:{})'.format(rstatus))

        if dtype_param is not dtype:
            # When data type of parameters is converted, say, from fp16
            # to fp32, the values of fp32 arrays of running_mean and
            # running_var updated by batchNormalizationForwardTraining
            # must be explicitly written back to their original fp16
            # arrays.
            running_mean = running_mean.astype(dtype)
            running_var = running_var.astype(dtype)
            self.running_mean.data.copy_from(running_mean.data,
                                             running_mean.nbytes)
            self.running_var.data.copy_from(running_var.data,
                                            running_var.nbytes)
    else:
        # Generic CPU and GPU implementation

        gamma = gamma[expander]
        beta = beta[expander]
        self.mean = x.mean(axis=self.axis)
        var = x.var(axis=self.axis)
        if xp is numpy:
            self.inv_std = numpy.reciprocal(numpy.sqrt(
                var + self.eps, dtype=x.dtype))
        else:
            self.inv_std = cuda.cupyx.rsqrt(var + self.eps)
        y = _apply_bn_fwd(xp, x, self.mean[expander],
                          self.inv_std[expander], gamma, beta)
        # Update running statistics
        m = x.size // gamma.size
        adjust = m / max(m - 1., 1.)  # unbiased estimation
        self.running_mean *= self.decay
        self.running_mean += (1 - self.decay) * self.mean
        self.running_var *= self.decay
        self.running_var += (1 - self.decay) * adjust * var

    return y,
def forward(self, inputs):
    """Apply batch normalization using the statistics passed as inputs.

    ``inputs`` is unpacked as ``(x, gamma, beta, mean, var)``. Depending on
    what ``_BNMode`` reports, the computation is dispatched to iDeep, to
    cuDNN (``batchNormalizationForwardInference``) or to the generic
    array-module implementation (``_apply_bn_fwd``).

    Side effects visible in this method: ``self.axis``, ``self.key_axis``
    and ``self.expander`` are (re)assigned; ``self.inv_var`` and
    ``self.inv_std`` are set in the iDeep branch (to ``None``) and in the
    generic branch (to the computed arrays), but are not touched in the
    cuDNN branch.

    Args:
        inputs: Tuple ``(x, gamma, beta, mean, var)`` of arrays.

    Returns:
        tuple: One-element tuple holding the normalized output ``y``.
    """
    # Inputs 0, 1, 3, 4 (x, gamma, mean, var) are retained; beta is not.
    self.retain_inputs((0, 1, 3, 4))
    x, gamma, beta, mean, var = inputs
    xp = cuda.get_array_module(x)

    self.axis = _compute_axis(x.ndim, gamma.ndim, self.axis)
    self.key_axis = _compute_key_axis(x.ndim, gamma.ndim, self.axis)

    # expander inserts singleton dimensions to gamma and beta so that they
    # can be broadcasted with x.
    expander = [None for _ in range(x.ndim)]
    for i in self.key_axis:
        expander[i] = slice(None)
    expander = tuple(expander)
    self.expander = expander

    mode = _BNMode(x, gamma, self.key_axis, inference=True)
    if mode.can_use_ideep():
        # TODO(niboshi): Refactor iDeep part into a separate method
        expand_dim = False
        if x.ndim == 2:
            # Temporarily append two singleton axes; they are squeezed
            # out of the result below.
            expand_dim = True
            x = x[:, :, None, None]

        gamma = gamma[expander]
        beta = beta[expander]
        # gamma and beta are packed into one (2, -1) array for iDeep.
        W = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1))

        y, = intel64.ideep.batchNormalization.Forward(
            intel64.ideep.array(x),
            intel64.ideep.array(W),
            intel64.ideep.array(mean),
            intel64.ideep.array(var),
            self.eps)

        if expand_dim:
            y = numpy.squeeze(y, axis=(2, 3))

        # lazy
        # NOTE(review): presumably these are computed on demand elsewhere
        # when needed -- confirm against the backward implementation.
        self.inv_var = None
        self.inv_std = None

    elif mode.can_use_cudnn(xp):
        # TODO(niboshi): Refactor cuDNN part into a separate method
        x = cuda.cupy.ascontiguousarray(x)

        gamma = cuda.cupy.ascontiguousarray(gamma)
        beta = cuda.cupy.ascontiguousarray(beta)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x, mode))
        cudnn_mode = mode.get_cudnn_mode()
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc)
        if dtype_param is not dtype:
            # The derived descriptor uses a different dtype than x, so the
            # parameter arrays are cast to match it.
            gamma = gamma.astype(dtype_param)
            beta = beta.astype(dtype_param)
            mean = mean.astype(dtype_param)
            var = var.astype(dtype_param)
        # Host-side scalars 1 and 0 passed to cuDNN by pointer
        # (double for float64 input, float otherwise).
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        y = cuda.cupy.empty_like(x)

        libcudnn.batchNormalizationForwardInference(
            handle, cudnn_mode, one.data, zero.data,
            x_desc.value, x.data.ptr, x_desc.value, y.data.ptr,
            derivedBnDesc.value, gamma.data.ptr, beta.data.ptr,
            mean.data.ptr, var.data.ptr, self.eps)
    else:
        # Generic CPU and GPU implementation
        gamma = gamma[expander]
        beta = beta[expander]
        var = var + self.eps
        self.inv_var = xp.reciprocal(var)
        self.inv_std = xp.sqrt(self.inv_var, dtype=self.inv_var.dtype)
        y = _apply_bn_fwd(xp, x, mean[expander],
                          self.inv_std[expander], gamma, beta)

    return y,
def forward(self, inputs):
    """Compute the batch-normalization gradients ``gx``, ``ggamma``, ``gbeta``.

    ``inputs`` is unpacked as ``(x, gamma, gy)``. The computation is
    dispatched to iDeep when ``self.use_ideep`` is set, to cuDNN
    (``batchNormalizationBackward``) when ``self.use_cudnn`` is set, and to
    the generic array-module implementation otherwise. The method reads
    ``self.expander``, ``self.mean``, ``self.inv_std``, ``self.eps`` and,
    depending on the branch, ``self.var``, ``self.mode`` and ``self.axis``.

    Args:
        inputs: Tuple ``(x, gamma, gy)`` of arrays.

    Returns:
        tuple: ``(gx, ggamma, gbeta)``.

    Warns:
        UserWarning: In debug mode with the cuDNN persistent spatial mode,
            when ``queryRuntimeError`` reports a non-success status.
    """
    self.retain_inputs((0, 1, 2))
    x, gamma, gy = inputs
    expander = self.expander
    # Reciprocal of the number of x elements per gamma element.
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    xp = cuda.get_array_module(x)

    if self.use_ideep:
        # TODO(niboshi): Refactor iDeep part into a separate method
        expand_dim = False
        if x.ndim == 2:
            # Temporarily append two singleton axes; squeezed out of gx
            # below.
            expand_dim = True
            x = x[:, :, None, None]
            gy = gy[:, :, None, None]

        gamma = gamma[expander]
        beta = numpy.zeros_like(gamma)
        # gamma and a zero beta are packed into one (2, -1) array.
        W = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1))

        gx, gW = intel64.ideep.batchNormalization.Backward(
            intel64.ideep.array(x),
            intel64.ideep.array(gy),
            self.mean,
            self.var,
            intel64.ideep.array(W),
            self.eps)

        ggamma, gbeta = gW[:2]

        if expand_dim:
            gx = numpy.squeeze(gx, axis=(2, 3))

    elif self.use_cudnn:
        # TODO(niboshi): Refactor cuDNN part into a separate method
        x = cuda.cupy.ascontiguousarray(x)
        gamma = cuda.cupy.ascontiguousarray(gamma)
        gy = cuda.cupy.ascontiguousarray(gy)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x, self.mode))
        cudnn_mode = self.mode.get_cudnn_mode()
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc)
        if dtype_param is not dtype:
            gamma = gamma.astype(dtype_param)
        # Host-side scalars 1 and 0 passed to cuDNN by pointer
        # (double for float64 input, float otherwise).
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)
        ggamma = cuda.cupy.empty_like(gamma)
        gbeta = cuda.cupy.empty_like(gamma)
        libcudnn.batchNormalizationBackward(
            handle, cudnn_mode, one.data, zero.data,
            one.data, zero.data, x_desc.value, x.data.ptr,
            x_desc.value, gy.data.ptr, x_desc.value, gx.data.ptr,
            derivedBnDesc.value, gamma.data.ptr,
            ggamma.data.ptr, gbeta.data.ptr,
            self.eps, self.mean.data.ptr, self.inv_std.data.ptr)

        # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used,
        # there is a possibility of numerical overflow. You can use
        # queryRuntimeError() to make sure whether the overflow actually
        # occurred or not during the batch normalization.
        # The cuDNN constants are plain integers, so they are compared by
        # value rather than by identity.
        if (cudnn_mode == libcudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT and
                configuration.config.debug):
            query_mode = libcudnn.CUDNN_ERRQUERY_BLOCKING
            rstatus = libcudnn.queryRuntimeError(handle, query_mode)
            if rstatus != libcudnn.CUDNN_STATUS_SUCCESS:
                warnings.warn(
                    'A numerical overflow might have happened in cuDNN '
                    'batch normalization (status:{})'.format(rstatus))

        if dtype_param is not dtype:
            # Cast the parameter gradients back to the dtype of x.
            ggamma = ggamma.astype(dtype)
            gbeta = gbeta.astype(dtype)
    else:
        # CPU and GPU implementation
        gbeta = gy.sum(axis=self.axis)
        x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])
        ggamma = (gy * x_hat).sum(axis=self.axis)
        if xp is numpy:
            gx = (gamma * self.inv_std)[expander] * (
                gy - (x_hat * ggamma[expander] + gbeta[expander]) * inv_m)
        else:
            # Same expression as the NumPy branch, fused into one kernel.
            gx = cuda.elementwise(
                '''
                T gy, T x_hat, T gamma, T inv_std, T ggamma, T gbeta,
                T inv_m
                ''',
                'T gx',
                '''
                gx = (gamma * inv_std) * (
                    gy - (x_hat * ggamma + gbeta) * inv_m)
                ''',
                'bn_bwd')(gy, x_hat, gamma[expander],
                          self.inv_std[expander], ggamma[expander],
                          gbeta[expander], inv_m)
    self.retain_outputs((0, 1))
    return gx, ggamma, gbeta
def _to_fcontiguous(arrays):
    """Return a list with each array converted via ``asfortranarray``.

    The array module is looked up once from all of ``arrays``.
    """
    xp = cuda.get_array_module(*arrays)
    converted = []
    for array in arrays:
        converted.append(xp.asfortranarray(array))
    return converted
def forward(self, inputs):
    """Roll axis ``self.axis`` of the first input to ``self.start``.

    No inputs are retained; the input's ``ndim`` is stored in
    ``self._in_ndim``.

    Returns:
        tuple: One-element tuple holding the rolled array.
    """
    self.retain_inputs(())
    x = inputs[0]
    self._in_ndim = x.ndim
    xp = cuda.get_array_module(*inputs)
    rolled = xp.rollaxis(x, self.axis, self.start)
    return rolled,
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    When the array module of ``hx`` is not NumPy and cuDNN may be used, the
    computation is delegated to ``NStepGRU`` / ``NStepBiGRU``; otherwise it
    falls back to ``n_step_rnn.n_step_rnn_impl``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. Because of bi-direction, the
            first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices have
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each vector is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequences have different lengths, sort sequences in
            descending order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    Returns:
        tuple: ``(hy, ys)`` as produced by the selected implementation.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    if kwargs:
        argument.check_unexpected_kwargs(
            kwargs, train='train argument is not supported anymore. '
            'Use chainer.using_config',
            use_cudnn='use_cudnn argument is not supported anymore. '
            'Use chainer.using_config')
        argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    # Fallback path: NumPy arrays, or cuDNN not to be used.
    if xp is numpy or not chainer.should_use_cudnn('>=auto', 5000):
        hy, _, ys = n_step_rnn.n_step_rnn_impl(
            _gru, n_layers, dropout_ratio, hx, None, ws, bs, xs,
            use_bi_direction)
        return hy, ys

    # cuDNN path.
    cudnn_handle = cudnn.get_handle()
    dropout_states = cuda.get_cudnn_dropout_states()
    cudnn.set_dropout_descriptor(
        dropout_states._desc, cudnn_handle, dropout_ratio)

    # Remember the per-step batch sizes before the steps are concatenated.
    step_sizes = [len(x) for x in xs]
    xs = chainer.functions.concat(xs, axis=0)

    w = n_step_rnn.cudnn_rnn_weight_concat(
        n_layers, dropout_states, use_bi_direction, 'gru', ws, bs)

    rnn = NStepBiGRU if use_bi_direction else NStepGRU
    hy, ys = rnn(n_layers, dropout_states, step_sizes)(hx, w, xs)

    # Split the concatenated output back into one chunk per time step.
    boundaries = numpy.cumsum(step_sizes[:-1])
    ys = chainer.functions.split_axis(ys, boundaries, 0)
    return hy, ys
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    activation, use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for Stack RNN/BiRNN functions.

    This function is used at :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices have
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each vector is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequences have different lengths, sort sequences in
            descending order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
        ``hy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is same as ``hx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding
          to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
          mini-batch size for time ``t``, and ``N`` is size of hidden
          units. Note that ``B_t`` is the same value as ``xs[t]``.

    Raises:
        ValueError: If ``activation`` is neither ``'tanh'`` nor ``'relu'``.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError('Invalid activation: "%s". Please select from [%s]'
                         % (activation, candidate))

    xp = cuda.get_array_module(hx)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # Per-step batch sizes, recorded before the steps are concatenated.
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, ),
                            itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs),
                            (xs, )))
        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh
            elif activation == 'relu':
                rnn = NStepBiRNNReLU
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh
            elif activation == 'relu':
                rnn = NStepRNNReLU

        hy, ys = rnn(n_layers, states, lengths)(*inputs)
        # Split the concatenated output back into one chunk per time step.
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, ys

    else:

        direction = 2 if use_bi_direction else 1
        # One hidden state per (layer, direction), with the leading
        # length-1 axis removed.
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        # Each ws[i] / bs[i] is unpacked as an (input, hidden) pair.
        xws = [xw for xw, _ in ws]
        hws = [hw for _, hw in ws]
        xbs = [xb for xb, _ in bs]
        hbs = [hb for _, hb in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            # The closure reads ``layer`` and ``xs_next`` from the enclosing
            # loop at call time, so it is redefined and called per layer.
            def _one_directional_loop(di):
                # di=0, forward RNN
                # di=1, backward RNN
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        # Only the first ``batch`` rows are updated at this
                        # step; the rest is carried over unchanged.
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    rnn_in = (
                        linear.linear(x, xws[layer_idx], xbs[layer_idx]) +
                        linear.linear(h, hws[layer_idx], hbs[layer_idx]))
                    if activation == 'tanh':
                        h_bar = tanh.tanh(rnn_in)
                    elif activation == 'relu':
                        h_bar = relu.relu(rnn_in)

                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward RNN
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward RNN
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [
                    concat.concat([hfi, hbi], axis=1)
                    for (hfi, hbi) in six.moves.zip(h_forward, h_backward)
                ]
                hy.append(h)
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
def _backward_main(self, retain_grad, loss_scale):
    """Propagate gradients from this variable back through its creators.

    Function nodes reachable from ``self.creator_node`` are processed from
    a heap ordered by descending ``rank``. For each node, the gradients of
    its outputs are collected, ``backward_accumulate`` is called, and the
    resulting input gradients are stored in a local ``grads`` mapping and
    on the corresponding variables (when those still exist).

    If this variable has a single element and no gradient yet, its gradient
    is initialized to ones (scaled by ``loss_scale`` when that is given).
    Returns immediately when the variable has no creator node.

    Args:
        retain_grad (bool): When false, gradients of each processed node's
            outputs (other than this variable's own node) are cleared after
            the node has been processed.
        loss_scale: Optional factor multiplied into the initial gradient
            and recorded as ``_loss_scale`` on updated input variables.

    Raises:
        RuntimeError: In debug mode, when a floating-point gradient
            produced by a function contains NaN.
    """
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return
    # Remember the current CUDA device so it can be restored after each
    # function is processed (functions may switch devices).
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()
    grads = {}

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self._grad_var is None:
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)
            if loss_scale is not None:
                self.grad *= loss_scale
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            # len(seen_set) serves as a tie-breaker so that ``cand`` itself
            # is never compared.
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)

    def get_grad(node):
        # Prefer the gradient tracked locally in ``grads``; fall back to
        # the one stored on the node.
        if node is None:
            return None
        if node in grads:
            return grads[node]
        return node.grad_var

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = [
            i for i, x in enumerate(inputs) if x.requires_grad
        ]
        if not target_input_indexes:
            continue
        outputs = [y() for y in func.outputs]  # access via weak ref

        in_data = tuple([x.data for x in inputs])
        out_grad = tuple([get_grad(y) for y in outputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        cuda.get_device_from_array(*in_data).use()
        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad_data)

        # Collect the current input gradients.
        #
        # Note (Tokui): When the same variable is passed to multiple input
        # slots (e.g. an expression like ``f(x, x)``), it makes the
        # gradient accumulation complicated since the back-propagated
        # gradients w.r.t. the first and second argument should be
        # accumulated to the current gradient w.r.t. the same variable.
        # In this case, the current implementation passes the current
        # gradient only to the first occurrence of the variable in the
        # input tuple and passes ``None`` to the rest of the occurrences.
        # For example, when the input variables are ``(x, x)``, the
        # input gradient passed to the ``backward_accumulate`` method is
        # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
        # See also the docstring of ``FunctionNode.backward_accumulate``.
        target_inputs = [inputs[i] for i in target_input_indexes]
        in_grad = []
        for i, index_i in enumerate(target_input_indexes):
            x = inputs[index_i]
            if x in target_inputs[:i]:
                # Pass ``None`` for duplicated input variables except for
                # the first occurrence (see the comment above).
                gx = None
            elif x in grads:
                gx = grads[x]
            elif x.creator_node is None:
                x._check_old_style_gradient()
                # accumulate the gradient only if the node is a leaf
                gx = x.grad_var
            else:
                gx = None
            in_grad.append(gx)

        gxs = func.backward_accumulate(
            target_input_indexes, out_grad, in_grad)

        assert len(gxs) == len(in_grad)
        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad_data)

        if is_debug:
            # NaN check on floating-point gradients.
            for gx in gxs:
                if gx is None:
                    continue
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on backward computation of '
                            '{}'.format(func.label))

        if not retain_grad:
            # Drop the gradients of this function's outputs, except for the
            # variable on which backward was started.
            for y in outputs:
                if y is not None and y is not self.node:
                    grads[y] = None
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = None

        for i, gx in enumerate(gxs):
            if gx is None:
                continue

            x = target_inputs[i]
            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx.data)

            if x in target_inputs[:i]:
                # Accumulate the duplicated gradients here. See the comment
                # above the code that builds ``in_grad``.
                cur_gx = grads[x]
                grads[x] = gx if cur_gx is None else gx + cur_gx
            else:
                grads[x] = gx

            x_var = x.get_variable_or_none()
            if x_var is not None:
                x_var._grad_var = grads[x]
                x_var._loss_scale = loss_scale

            if x.creator_node is not None:
                add_cand(x.creator_node)

        del gxs  # to reduce memory usage
        if initial_device is not None:
            initial_device.use()
def __call__(self, imgs, bboxes, labels, scales):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    The computed losses are also reported through
    ``chainer.reporter.report`` under the keys ``rpn_loc_loss``,
    ``rpn_cls_loss``, ``roi_loc_loss``, ``roi_cls_loss`` and ``loss``.

    Args:
        imgs (~chainer.Variable): A variable with a batch of images.
        bboxes (~chainer.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~chainer.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scales (float or ~chainer.Variable): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        chainer.Variable:
        Scalar loss variable.
        This is the sum of losses for Region Proposal Network and
        the head module.

    """
    # Work on raw arrays; scales is additionally moved to the CPU.
    if isinstance(bboxes, chainer.Variable):
        bboxes = bboxes.array
    if isinstance(labels, chainer.Variable):
        labels = labels.array
    if isinstance(scales, chainer.Variable):
        scales = scales.array
    scales = cuda.to_cpu(scales)

    batch_size, _, H, W = imgs.shape
    img_size = (H, W)

    rpn_features, roi_features = self.light_head_rcnn.extractor(imgs)
    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.light_head_rcnn.rpn(rpn_features, img_size, scales)
    # Merge the first two axes so that the RPN outputs become 2-D.
    rpn_locs = rpn_locs.reshape((-1, rpn_locs.shape[2]))
    rpn_scores = rpn_scores.reshape((-1, rpn_scores.shape[2]))

    # RPN targets, one entry per image.
    gt_rpn_locs = []
    gt_rpn_labels = []
    for bbox in bboxes:
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            bbox, anchor, img_size)
        if cuda.get_array_module(rpn_locs.array) != np:
            # Move the targets to the GPU when the RPN output is not a
            # NumPy array.
            gt_rpn_loc = cuda.to_gpu(gt_rpn_loc)
            gt_rpn_label = cuda.to_gpu(gt_rpn_label)
        gt_rpn_locs.append(gt_rpn_loc)
        gt_rpn_labels.append(gt_rpn_label)
        del gt_rpn_loc, gt_rpn_label
    gt_rpn_locs = self.xp.concatenate(gt_rpn_locs, axis=0)
    gt_rpn_labels = self.xp.concatenate(gt_rpn_labels, axis=0)

    # Head targets: sample RoIs per image and tag them with the image index.
    batch_indices = range(batch_size)
    sample_rois = []
    sample_roi_indices = []
    gt_roi_locs = []
    gt_roi_labels = []

    for batch_index, bbox, label in \
            zip(batch_indices, bboxes, labels):
        roi = rois[roi_indices == batch_index]
        sample_roi, gt_roi_loc, gt_roi_label = \
            self.proposal_target_creator(
                roi, bbox, label,
                self.loc_normalize_mean, self.loc_normalize_std)
        del roi
        sample_roi_index = self.xp.full((len(sample_roi), ), batch_index,
                                        dtype=np.int32)
        sample_rois.append(sample_roi)
        sample_roi_indices.append(sample_roi_index)
        del sample_roi, sample_roi_index
        gt_roi_locs.append(gt_roi_loc)
        gt_roi_labels.append(gt_roi_label)
        del gt_roi_loc, gt_roi_label
    sample_rois = self.xp.concatenate(sample_rois, axis=0)
    sample_roi_indices = self.xp.concatenate(sample_roi_indices, axis=0)
    gt_roi_locs = self.xp.concatenate(gt_roi_locs, axis=0)
    gt_roi_labels = self.xp.concatenate(gt_roi_labels, axis=0)

    roi_cls_locs, roi_scores = self.light_head_rcnn.head(
        roi_features, sample_rois, sample_roi_indices)

    # RPN losses
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_locs, gt_rpn_locs, gt_rpn_labels, self.rpn_sigma)
    rpn_cls_loss = F.softmax_cross_entropy(rpn_scores, gt_rpn_labels)

    # Losses for outputs of the head.
    roi_loc_loss, roi_cls_loss = _ohem_loss(
        roi_cls_locs, roi_scores, gt_roi_locs, gt_roi_labels,
        self.n_ohem_sample * batch_size, self.roi_sigma)
    # The head localization loss is weighted by a factor of 2.
    roi_loc_loss = 2 * roi_loc_loss

    loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss
    chainer.reporter.report(
        {
            'rpn_loc_loss': rpn_loc_loss,
            'rpn_cls_loss': rpn_cls_loss,
            'roi_loc_loss': roi_loc_loss,
            'roi_cls_loss': roi_cls_loss,
            'loss': loss
        },
        self)
    return loss
def init_state(self, param):
    """Create zero-filled ``'m'`` and ``'v'`` state entries for ``param``.

    Both arrays are allocated with ``zeros_like(param.data)`` under the
    device that holds ``param.data``.
    """
    data = param.data
    xp = cuda.get_array_module(data)
    with cuda.get_device_from_array(data):
        for key in ('m', 'v'):
            self.state[key] = xp.zeros_like(data)
def mean_dice_coefficient(dice_coefficients, ret_nan=True):
    """Return the mean of ``dice_coefficients`` with ``keepdims=True``.

    When ``ret_nan`` is truthy, entries whose data is NaN are removed
    (via ``F.get_item`` with a boolean mask) before averaging.
    """
    if not ret_nan:
        return F.mean(dice_coefficients, keepdims=True)

    xp = cuda.get_array_module(dice_coefficients)
    not_nan = ~xp.isnan(dice_coefficients.data)
    kept = F.get_item(dice_coefficients, not_nan)
    return F.mean(kept, keepdims=True)
def add_noise(h, sigma=0.2):
    """Add ``sigma``-scaled ``randn`` noise to ``h`` in training mode.

    When ``chainer.config.train`` is false, ``h`` is returned unchanged.
    """
    xp = cuda.get_array_module(h.data)
    if not chainer.config.train:
        return h
    noise = xp.random.randn(*h.shape)
    return h + sigma * noise
def head_loss_pre(rois, roi_indices, std, bboxes, labels):
    """Assign localization and label targets to RoIs and subsample them.

    The per-level RoIs are stacked, matched per image against ``bboxes``
    by IoU (``utils.bbox_iou``), converted to offset targets, labelled, and
    subsampled so that each image keeps at most 512 RoIs of which at most
    a quarter are foreground. RoIs dropped by the subsampling are removed
    and the survivors are split back into per-level lists.

    Args:
        rois: Sequence of 2-D RoI arrays, one per level. Columns are split
            as ``[:, :2]`` and ``[:, 2:]`` (the in-code comment names the
            layout "tlbr").
        roi_indices: Sequence of arrays, one per level, holding the image
            index of each RoI.
        std: Indexable pair; ``std[0]`` divides the center offsets and
            ``std[1]`` divides the log-size offsets.
        bboxes: Per-image ground-truth boxes, indexed by image index.
        labels: Per-image ground-truth labels, indexed by image index.

    Returns:
        tuple: ``(rois, roi_indices, gt_locs, gt_labels)``, each a list
        with one array per level. In ``gt_labels`` the value ``0`` marks
        background and foreground labels are shifted by ``+1``.
    """
    thresh = 0.5
    batchsize_per_image = 512
    fg_ratio = 0.25

    xp = cuda.get_array_module(*rois)

    n_level = len(rois)
    # A list (not a generator) is passed to hstack: NumPy requires a
    # sequence here and rejects generators in recent versions.
    roi_levels = xp.hstack(
        [xp.array((level,) * len(rois[level]))
         for level in range(n_level)]).astype(np.int32)
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)

    rois_yx = (rois[:, 2:] + rois[:, :2]) / 2
    rois_hw = rois[:, 2:] - rois[:, :2]
    indices = np.unique(cuda.to_cpu(roi_indices))

    gt_locs = xp.empty_like(rois)
    gt_labels = xp.empty_like(roi_indices)
    for i in indices:
        mask = roi_indices == i

        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(rois[mask], bboxes[i])
            gt_index = iou.argmax(axis=1)
            gt_loc = bboxes[i][gt_index].copy()
        else:
            # No ground truth for this image: the values are placeholders,
            # and every RoI is labelled background below.
            gt_loc = xp.empty_like(rois[mask])
        # tlbr -> yxhw
        gt_loc[:, 2:] -= gt_loc[:, :2]
        gt_loc[:, :2] += gt_loc[:, 2:] / 2
        # offset
        gt_loc[:, :2] = (gt_loc[:, :2] - rois_yx[mask]) / \
            rois_hw[mask] / std[0]
        gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / rois_hw[mask]) / std[1]

        if len(bboxes[i]) > 0:
            gt_label = labels[i][gt_index] + 1
            gt_label[iou.max(axis=1) < thresh] = 0
        else:
            gt_label = xp.zeros(int(mask.sum()), dtype=np.int32)

        # Mark surplus foreground / background RoIs with -1 so they are
        # filtered out after the loop.
        fg_index = xp.where(gt_label > 0)[0]
        n_fg = int(batchsize_per_image * fg_ratio)
        if len(fg_index) > n_fg:
            gt_label[_choice(fg_index, size=len(fg_index) - n_fg)] = -1

        bg_index = xp.where(gt_label == 0)[0]
        n_bg = batchsize_per_image - int((gt_label > 0).sum())
        if len(bg_index) > n_bg:
            gt_label[_choice(bg_index, size=len(bg_index) - n_bg)] = -1

        gt_locs[mask] = gt_loc
        gt_labels[mask] = gt_label

    # Keep only the RoIs that survived subsampling.
    mask = gt_labels >= 0
    rois = rois[mask]
    roi_indices = roi_indices[mask]
    roi_levels = roi_levels[mask]
    gt_locs = gt_locs[mask]
    gt_labels = gt_labels[mask]

    # Split everything back into per-level lists.
    masks = [roi_levels == level for level in range(n_level)]
    rois = [rois[mask] for mask in masks]
    roi_indices = [roi_indices[mask] for mask in masks]
    gt_locs = [gt_locs[mask] for mask in masks]
    gt_labels = [gt_labels[mask] for mask in masks]

    return rois, roi_indices, gt_locs, gt_labels
def forward(self, inputs):
    """Sum the single input over ``self.axis``.

    On the NumPy path the result is passed through ``numpy.asarray``.

    Returns:
        tuple: One-element tuple holding the sum.
    """
    (x,) = inputs
    total = x.sum(axis=self.axis, keepdims=self.keepdims)
    on_cpu = cuda.get_array_module(x) is numpy
    return (numpy.asarray(total) if on_cpu else total),
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attribute of input variables exist on
       a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of device
       selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable`, :class:`numpy.ndarray`,
            or :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    Raises:
        TypeError: If the input arrays or the output arrays mix
            incompatible array types, or if :meth:`forward` does not
            return a tuple.
        RuntimeError: In debug mode, if a floating-point output contains
            NaN.

    """
    input_vars = [chainer.as_variable(x) for x in inputs]
    in_data = tuple([x.data for x in input_vars])
    # Outputs require gradients if any of the inputs does.
    requires_grad = any([x.requires_grad for x in input_vars])

    # Check for input array types
    if not chainer.is_arrays_compatible(in_data):
        raise TypeError(
            'incompatible array types are mixed in the forward input '
            '({}).\n'
            'Actual: {}'.format(
                self.label, ', '.join(str(type(x)) for x in in_data)))

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    # Global hooks first, then hooks registered locally on this node.
    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        # Reset the retain requests before calling forward(); they are
        # consulted after the graph edges are set up below.
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label, ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(out.dtype.kind == 'f' and
               cuda.get_array_module(out).isnan(out).any()
               for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    ret = tuple([variable.Variable(y, requires_grad=requires_grad)
                 for y in outputs])

    if configuration.config.enable_backprop:
        # Topological ordering
        self.rank = max([x.rank for x in input_vars]) if input_vars else 0
        # Add backward edges
        for y in ret:
            y.creator_node = self
        self.inputs = tuple([x.node for x in input_vars])
        # Add forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y.node) for y in ret])

        if self._input_indexes_to_retain is not None:
            for index in self._input_indexes_to_retain:
                input_vars[index].retain_data()

        if self._output_indexes_to_retain is not None:
            # Keep the raw output arrays alongside the retained variables.
            retained_data = []
            for index in self._output_indexes_to_retain:
                ret[index].retain_data()
                retained_data.append(outputs[index])
            self._retained_output_data = tuple(retained_data)

        self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on the argument ``use_bi_direction``.

    When the array module of ``hx`` is not NumPy and cuDNN may be used, the
    computation is delegated to ``NStepLSTM`` / ``NStepBiLSTM``; otherwise
    it falls back to ``n_step_rnn.n_step_rnn_impl``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as
            they are multiplied with input variables, where ``I`` is the size
            of the input and ``N`` is the dimension of the hidden units. All
            other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each vector is ``(N,)``.
        xs (list of :class:`~chainer.Variable`):
            A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable` objects each
            representing a sequence.
            When sequences have different lengths, they must be
            sorted in descending order of their lengths before transposing.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional LSTM.

    Returns:
        tuple: This function returns a tuple containing three elements,
        ``hy``, ``cy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is the same as
          ``hx``.
        - ``cy`` is an updated cell states whose shape is the same as
          ``cx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding
          to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
          the mini-batch size for time ``t``. Note that ``B_t`` is the same
          value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    # Fallback path: NumPy arrays, or cuDNN not to be used.
    if xp is numpy or not chainer.should_use_cudnn('>=auto', 5000):
        return n_step_rnn.n_step_rnn_impl(
            _lstm, n_layers, dropout_ratio, hx, cx, ws, bs, xs,
            use_bi_direction)

    # cuDNN path.
    dropout_states = get_random_state().create_dropout_states(dropout_ratio)

    # Remember the per-step batch sizes before the steps are concatenated.
    step_sizes = [len(x) for x in xs]
    xs = chainer.functions.concat(xs, axis=0)

    # flatten all input variables
    flat_inputs = tuple(itertools.chain(
        (hx, cx),
        itertools.chain.from_iterable(ws),
        itertools.chain.from_iterable(bs),
        (xs,)))

    rnn = NStepBiLSTM if use_bi_direction else NStepLSTM
    hy, cy, ys = rnn(n_layers, dropout_states, step_sizes)(*flat_inputs)

    # Split the concatenated output back into one chunk per time step.
    boundaries = numpy.cumsum(step_sizes[:-1])
    ys = chainer.functions.split_axis(ys, boundaries, 0)
    return hy, cy, ys
def forward(self, inputs):
    """Pad the first input array.

    The array module is chosen from ``inputs`` and its ``pad`` function is
    called with ``self.pad_width``, ``self.mode`` and the extra keyword
    arguments stored in ``self.keywords``.

    Args:
        inputs: Input arrays; only ``inputs[0]`` is padded.

    Returns:
        tuple: One-element tuple holding the padded array.

    """
    array_module = cuda.get_array_module(*inputs)
    padded = array_module.pad(
        inputs[0], self.pad_width, mode=self.mode, **self.keywords)
    return padded,
def compute_inv_2factors(self, cov_ema, damping):
    """Add a damping term to ``cov_ema`` and invert the result.

    The damping term is the identity matrix of size ``cov_ema.shape[0]``
    scaled by ``sqrt(damping)``. The damped matrix is passed to
    ``self.compute_inv_core`` together with the array module.

    Args:
        cov_ema: Square array to be damped and inverted.
        damping: Damping value; its square root scales the identity.

    Returns:
        Whatever ``self.compute_inv_core`` returns for the damped matrix.

    """
    xp = cuda.get_array_module(cov_ema)
    # Run on the device that owns ``cov_ema``.
    with cuda.get_device_from_array(cov_ema):
        eye = xp.identity(cov_ema.shape[0])
        scale = xp.sqrt(damping)
        damped = cov_ema + eye * scale
        return self.compute_inv_core(xp, damped)
def forward(self, inputs):
    """Compute the element-wise base-2 logarithm of the first input.

    Input 0 is marked as retained via ``retain_inputs`` before the
    computation (presumably for use in the backward pass).

    Args:
        inputs: Input arrays; only ``inputs[0]`` is used.

    Returns:
        tuple: One-element tuple holding ``log2`` of the input, passed
        through ``utils.force_array``.

    """
    self.retain_inputs((0, ))
    x = inputs[0]
    log2_x = cuda.get_array_module(x).log2(x)
    return utils.force_array(log2_x),
def backward(self, indexes, grad_outputs):
    """Propagate the output gradient to both inputs.

    The gradient for the first input is the output gradient multiplied by
    the sign of ``self.diff``. The gradient for the second input is its
    negation.

    Args:
        indexes: Unused.
        grad_outputs: One-element sequence holding the output gradient.

    Returns:
        tuple: ``(gx, -gx)``.

    """
    grad_y, = grad_outputs
    xp = cuda.get_array_module(grad_y)
    # NOTE(review): ``self.diff`` is presumably stored by the forward pass.
    grad_first = grad_y * xp.sign(self.diff)
    return grad_first, -grad_first
def compute_kfgrads(self, param_W, param_b, invs):
    """Dispatch to ``compute_kfgrads_core`` on the matching device.

    The array module and the device are looked up from the weight
    gradient, the bias gradient (when ``param_b`` is given) and ``invs``.

    Args:
        param_W: Weight parameter; its ``grad`` is used for the lookup.
        param_b: Bias parameter, or ``None`` when there is no bias.
        invs: Passed to the lookups and forwarded to the core routine.

    Returns:
        Whatever ``self.compute_kfgrads_core`` returns.

    """
    if param_b is None:
        arrays = (param_W.grad, invs)
    else:
        arrays = (param_W.grad, param_b.grad, invs)

    xp = cuda.get_array_module(*arrays)
    with cuda.get_device_from_array(*arrays):
        return self.compute_kfgrads_core(xp, param_W, param_b, invs)
def forward(self, inputs):
    """Flip the first input array along ``self.axis``.

    The array module's own ``flip`` is used when it exists. Otherwise the
    module-level ``_flip`` helper is used.

    Args:
        inputs: Input arrays; only ``inputs[0]`` is flipped.

    Returns:
        tuple: One-element tuple holding the flipped array.

    """
    xp = cuda.get_array_module(*inputs)
    if not hasattr(xp, 'flip'):
        # numpy.flip is supported from version 1.12.0
        return _flip(inputs[0], self.axis),
    return xp.flip(inputs[0], self.axis),
def compute_covs(self):
    """Dispatch to ``compute_covs_core`` on the matching device.

    The array module and the device are looked up from ``self.in_acts``
    and ``self.out_grads``, which are then forwarded to the core routine.

    Returns:
        Whatever ``self.compute_covs_core`` returns.

    """
    acts, grads = self.in_acts, self.out_grads
    xp = cuda.get_array_module(acts, grads)
    with cuda.get_device_from_array(acts, grads):
        return self.compute_covs_core(xp, acts, grads)
def exponential_map(x, v):
    """Evaluate ``cosh(|v|) * x + sinh(|v|) * v / |v|``.

    ``|v|`` is the square root of ``lorentzian_product(v, keepdims=True)``,
    clamped from below by the module-level ``eps`` before the root is
    taken. The clamp keeps the divisor away from zero.

    Args:
        x: Base point array.
        v: Direction array.

    Returns:
        The array computed by the formula above.

    """
    xp = cuda.get_array_module(x, v)
    squared_norm = lorentzian_product(v, keepdims=True)
    norm = xp.sqrt(xp.maximum(squared_norm, eps))
    return xp.cosh(norm) * x + xp.sinh(norm) * v / norm
def connectionist_temporal_classification(
        x, t, blank_symbol, input_length=None, label_length=None,
        reduce='mean'):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification(CTC) [Graves2006]_ is a loss
    function of sequence labeling where the alignment between the inputs
    and target is unknown. See also [Graves2012]_

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the samplewise
    loss values. If it is ``'mean'``, it takes the mean of loss values.

    Args:
        x (list or tuple of :class:`~chainer.Variable`):
            A list of unnormalized probabilities for labels.
            Each element of ``x``, ``x[i]`` is a :class:`~chainer.Variable`
            object, which has shape ``(B, V)``, where ``B``
            is the batch size and ``V`` is the number of labels.
            The softmax of ``x[i]`` represents the probabilities of the
            labels at time ``i``.
        t (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`):
            A matrix including expected label sequences.
            Its shape is ``(B, M)``, where ``B`` is the batch size and
            ``M`` is the maximum length of the label sequences.
            All elements in ``t`` must be less than ``V``, the number of
            labels.
        blank_symbol (int): Index of blank_symbol.
            This value must be non-negative.
        input_length (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray` or ``None``):
            Length of sequence for each of mini batch ``x`` (optional).
            Its shape must be ``(B,)``.
            If the ``input_length`` is omitted or ``None``, it assumes that
            all of ``x`` is valid input.
        label_length (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray` or ``None``):
            Length of sequence for each of mini batch ``t`` (optional).
            Its shape must be ``(B,)``.
            If the ``label_length`` is omitted or ``None``, it assumes that
            all of ``t`` is valid input.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable holding a scalar value of the CTC loss.
            If ``reduce`` is ``'no'``, the output variable holds array
            whose shape is ``(B,)`` where ``B`` is the number of samples.
            If it is ``'mean'``, it holds a scalar.

    Raises:
        TypeError: If ``x`` is not a sequence or ``blank_symbol`` is not an
            ``int``.

    .. note::
       You need to input ``x`` without applying to activation functions(e.g.
       softmax function), because this function applies softmax functions
       to ``x`` before calculating CTC loss to avoid numerical limitations.
       You also need to apply softmax function to forwarded values before you
       decode it.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez,\
    Faustino Gomez, Jurgen Schmidhuber,\
    `Connectionist Temporal Classification: Labelling Unsegmented\
    Sequence Data with Recurrent Neural Networks\
    <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves,\
    `Supervised Sequence Labelling with Recurrent Neural Networks\
    <https://www.cs.toronto.edu/~graves/preprint.pdf>`_

    """
    # The ``collections.Sequence`` alias was removed in Python 3.10; the ABC
    # lives in ``collections.abc`` on Python 3. Python 2 has no such
    # submodule, so fall back to ``collections`` itself there.
    try:
        from collections import abc as collections_abc
    except ImportError:
        import collections as collections_abc

    if not isinstance(x, collections_abc.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert 0 <= blank_symbol < x[0].shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert x[0].ndim == 2

    xp = cuda.get_array_module(x[0])
    if input_length is None:
        # Every sample uses all ``len(x)`` time steps.
        input_length = xp.full(len(x[0]), len(x), dtype=numpy.int32)
    if label_length is None:
        # Every label sequence uses the full width of ``t``.
        label_length = xp.full(len(t), t.shape[1], dtype=numpy.int32)

    return ConnectionistTemporalClassification(blank_symbol, reduce)(
        input_length, label_length, t, chainer.functions.stack(x))