def forward(self, inputs): self.retain_inputs((0, 1)) c_prev, x = inputs a, i, f, o = _extract_gates(x) batch = len(x) if isinstance(x, chainer.get_cpu_array_types()): if intel64.should_use_ideep('>=auto'): xp = intel64.ideep.get_array_module(x) else: xp = numpy a = xp.tanh(a) i = _sigmoid(i, xp) f = _sigmoid(f, xp) o = _sigmoid(o, xp) c_next = numpy.empty_like(c_prev) c_next[:batch] = a * i + f * c_prev[:batch] h = o * xp.tanh(c_next[:batch]) else: c_next = cuda.cupy.empty_like(c_prev) h = cuda.cupy.empty_like(c_next[:batch]) cuda.elementwise( 'T c_prev, T a, T i_, T f, T o', 'T c, T h', ''' COMMON_ROUTINE; c = aa * ai + af * c_prev; h = ao * tanh(c); ''', 'lstm_fwd', preamble=_preamble)( c_prev[:batch], a, i, f, o, c_next[:batch], h) c_next[batch:] = c_prev[batch:] self.retain_outputs((0,)) return c_next, h
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False, dy=1, dx=1, out_h=None, out_w=None): n, c, h, w = img.shape if out_h is None: out_h = get_conv_outsize(h, kh, sy, ph, cover_all, dy) assert out_h > 0, 'Height in the output should be positive.' if out_w is None: out_w = get_conv_outsize(w, kw, sx, pw, cover_all, dx) assert out_w > 0, 'Width in the output should be positive.' col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype) cuda.elementwise( 'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,' 'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,' 'int32 dy, int32 dx', 'T col', ''' int c0 = i / (kh * kw * out_h * out_w); int ky = i / (kw * out_h * out_w) % kh; int kx = i / (out_h * out_w) % kw; int out_y = i / out_w % out_h; int out_x = i % out_w; int in_y = ky * dy + out_y * sy - ph; int in_x = kx * dx + out_x * sx - pw; if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) { col = img[in_x + w * (in_y + h * c0)]; } else { col = 0; } ''', 'im2col')(img.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col) return col
def forward(self, inputs): self.retain_inputs((0, 1)) c_prev, x = inputs a, i, f, o = _extract_gates(x) batch = len(x) if isinstance(x, numpy.ndarray): a = numpy.tanh(a) i = _sigmoid(i) f = _sigmoid(f) o = _sigmoid(o) c_next = numpy.empty_like(c_prev) c_next[:batch] = a * i + f * c_prev[:batch] h = o * numpy.tanh(c_next[:batch]) else: c_next = cuda.cupy.empty_like(c_prev) h = cuda.cupy.empty_like(c_next[:batch]) cuda.elementwise( 'T c_prev, T a, T i_, T f, T o', 'T c, T h', ''' COMMON_ROUTINE; c = aa * ai + af * c_prev; h = ao * tanh(c); ''', 'lstm_fwd', preamble=_preamble)( c_prev[:batch], a, i, f, o, c_next[:batch], h) c_next[batch:] = c_prev[batch:] self.retain_outputs((0,)) return c_next, h
def forward_gpu(self, x): if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3: # With cuDNN v3 or greater, use cuDNN implementation for inputs # with spatial dimensions of two or more. return super(MaxPoolingND, self).forward_gpu(x) self._in_shape = x[0].shape self._in_dtype = x[0].dtype n, c = x[0].shape[:2] dims = x[0].shape[2:] ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all) for (d, k, s, p) in six.moves.zip( dims, self.ksize, self.stride, self.pad)) # (n, c, y_1, y_2, ..., y_N) y_shape = (n, c) + ys y = cuda.cupy.empty(y_shape, dtype=x[0].dtype) self.indexes = cuda.cupy.empty(y_shape, dtype=numpy.int32) in_params, out_params, operation, name = \ max_pooling_nd_kernel.MaxPoolingNDKernelForward.generate(self.ndim) cuda.elementwise(in_params, out_params, operation, name)( x[0].reduced_view(), *(dims + ys + self.ksize + self.stride + self.pad + (y, self.indexes))) return y,
def forward_gpu(self, gys): if self._used_cudnn: x, = self.apoolnd.get_retained_inputs() return self.apoolnd.backward_gpu((x.data,), gys) is_pad_value_none = self.pad_value is None gy, = gys n, c = self._in_shape[:2] idims = self._in_shape[2:] odims = gy.shape[2:] if is_pad_value_none: coeff = self.apoolnd.coeff # This conversion from chainerx to cupy exists here for # double backward of chainerx on cuda. coeff = backend.from_chx(coeff) gy *= coeff gx = cuda.cupy.empty(self._in_shape, self._in_dtype) in_params, out_params, operation, name = \ average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate( self.ndim) cuda.elementwise(in_params, out_params, operation, name)( gy.reduced_view(), *(idims + odims + self.ksize + self.stride + self.pad + (gx,))) if not is_pad_value_none: gx /= functools.reduce(operator.mul, self.ksize) return gx,
def col2im_gpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1): n, c, kh, kw, out_h, out_w = col.shape img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype) cuda.elementwise( 'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,' 'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,' 'int32 dx, int32 dy', 'T img', ''' int c0 = i / (h * w); int y = i / w % h; int x = i % w; T val = 0; for (int ky = 0; ky < kh; ++ky) { int out_y = (y + ph - ky * dy); if (0 > out_y || out_y >= out_h * sy) continue; if (out_y % sy != 0) continue; out_y /= sy; for (int kx = 0; kx < kw; ++kx) { int out_x = (x + pw - kx * dx); if (0 > out_x || out_x >= out_w * sx) continue; if (out_x % sx != 0) continue; out_x /= sx; int k = out_y + out_h * (kx + kw * (ky + kh * c0)); val = val + col[out_x + out_w * k]; } } img = val; ''', 'col2im')(col.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dx, dy, img) return img
def forward_gpu(self, gy): if self._used_cudnn: x, = self.apool2d.get_retained_inputs() return self.apool2d.backward_gpu((x.data,), gy) n, c, h, w = self._in_shape y_h, y_w = gy[0].shape[2:] gx = cuda.cupy.empty(self._in_shape, self._in_dtype) coeff = 1. / (self.kh * self.kw) cuda.elementwise( 'raw T gy, int32 h, int32 w,' 'int32 out_h, int32 out_w, int32 kh, int32 kw,' 'int32 sy, int32 sx, int32 ph, int32 pw, T coeff', 'T gx', ''' int c0 = i / (h * w); int y = i / w % h + ph; int x = i % w + pw; int out_y_0 = max(0, (y - kh + sy) / sy); int out_y_1 = min(out_h, (y + sy) / sy); int out_x_0 = max(0, (x - kw + sx) / sx); int out_x_1 = min(out_w, (x + sx) / sx); int hc0 = out_h * c0; T val = 0; for (int out_y = out_y_0; out_y < out_y_1; ++out_y) { for (int out_x = out_x_0; out_x < out_x_1; ++out_x) { val = val + gy[out_x + out_w * (out_y + hc0)]; } } gx = val * coeff; ''', 'avg_pool_bwd')(gy[0].reduced_view(), h, w, y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, coeff, gx) return gx,
def forward_gpu(self, inputs): utils.nondeterministic('atomicAdd') self.retain_inputs((0, 1, 2)) x, W, gy = inputs if self.reduce == 'no': gy = gy[:, None] samples = self.samples wx = self.wx.astype(x.dtype, copy=False) g = cuda.elementwise( 'T wx, T gy, int32 m', 'T g', ''' T y; if (i % m == 0) { y = 1; } else { y = -1; } g = -y * gy / (1.0f + __expf(wx * y)); ''', 'negative_sampling_calculate_g' )(wx, gy, self.sample_size + 1) cupy = cuda.cupy gx = cupy.zeros_like(x) n_in = x.shape[1] cuda.elementwise( 'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx', ''' int d = i / c; T w = 0; if (mask == 1){ for (int j = 0; j < m; ++j) { w += g[d * m + j] * W[k[d * m + j] * c + i % c]; } } gx = w; ''', 'negative_sampling_calculate_gx' )(g, W, self.ignore_mask[:, None], samples, n_in, self.sample_size + 1, gx) gW = cupy.zeros_like(W) cuda.elementwise( 'T g, raw T x, S k, bool mask, int32 c, int32 m', 'raw T gW', ''' T gi = g; if (mask == 1) { for (int j = 0; j < c; ++j) { atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]); } } ''', 'negative_sampling_calculate_gw' )(g, x, samples, self.ignore_mask[:, None], n_in, self.sample_size + 1, gW) return gx, None, gW
def forward_gpu(self, inputs): if self._used_cudnn: x, = self.mpoolnd._cudnn_inputs return self._forward_gpu_compute_indexes_again((x, inputs[0])) x, = inputs self._in_shape = x.shape self._in_dtype = x.dtype n, c = x.shape[:2] dims = x.shape[2:] ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all) for (d, k, s, p) in six.moves.zip( dims, self.ksize, self.stride, self.pad)) # (n, c, y_1, y_2, ..., y_N) y_shape = (n, c) + ys y = cuda.cupy.empty(y_shape, dtype=x.dtype) cls = max_pooling_nd_kernel.MaxPoolingNDKernelForwardWithIndexes in_params, out_params, operation, name = cls.generate(self.ndim) cuda.elementwise(in_params, out_params, operation, name)( x.reduced_view(), *(dims + ys + self.ksize + self.stride + self.pad + (self.indexes.reduced_view(), y))) return y,
def forward_cudnn(self, inputs): if self.eps < libcudnn.CUDNN_BN_MIN_EPSILON: raise RuntimeError( 'cuDNN does not allow an eps value ' 'less than {}.'.format(libcudnn.CUDNN_BN_MIN_EPSILON)) self.retain_inputs((0, 1)) x, gamma, beta = inputs xp = cuda.cupy orig_shape = x.shape batch_size, channels = orig_shape[:2] groups = self.groups cudnn_shape = (1, batch_size * groups, -1, 1) x = x.reshape(cudnn_shape) with cuda.get_device_from_array(x): dummy_beta = xp.zeros(batch_size * groups, dtype=x.dtype) self.dummy_gamma = xp.ones_like(dummy_beta) x_hat, self.mean, self.inv_std = \ cudnn.batch_normalization_forward_training( x, self.dummy_gamma, dummy_beta, dummy_beta, dummy_beta, None, None, self.eps, 1.0, True, libcudnn.CUDNN_BATCHNORM_SPATIAL, configuration.config.debug) y = x_hat.reshape((batch_size, channels, -1)) cuda.elementwise( 'T gamma, T beta', 'T y', 'y = y * gamma + beta', 'groupnorm_y')(gamma[:, None], beta[:, None], y) y = y.reshape(orig_shape) return y,
def forward_gpu(self, gys): if self._used_cudnn: x, = self.apoolnd.get_retained_inputs() return self.apoolnd.backward_gpu((x.data,), gys) gy, = gys n, c = self._in_shape[:2] idims = self._in_shape[2:] odims = gy.shape[2:] gx = cuda.cupy.empty(self._in_shape, self._in_dtype) if self.pad_value is None: coeff = self._get_pooling_width(cuda.cupy, odims, gy.dtype) coeff = cuda.cupy.reciprocal(coeff, out=coeff) else: coeff = 1. / functools.reduce(operator.mul, self.ksize) in_params, out_params, operation, name = \ average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate( self.ndim) cuda.elementwise(in_params, out_params, operation, name)( gy.reduced_view(), *(idims + odims + self.ksize + self.stride + self.pad + (coeff, gx))) return gx,
def update_core_gpu(self, param): grad = param.grad if grad is None: return cuda.elementwise('T grad, T lr', 'T param', 'param -= lr * grad', 'sgd')(grad, self.hyperparam.lr, param.data)
def label_probability(self, label_size, path, path_length, multiply_seq, xp): seq_length = len(multiply_seq) n_batch = len(path) dtype = multiply_seq.dtype ret = xp.zeros((seq_length, n_batch, label_size), dtype) if xp == numpy: for b in six.moves.range(len(path)): target_path = path[b, :path_length[b]] chars = {c for c in target_path} for c in chars: ret[:, b, c] = xp.sum( multiply_seq[:, b, 0:path_length[b]] [:, target_path == c], axis=1) else: cuda.elementwise( 'T prob, I path, I path_length, I max_path_length', 'raw T cum_prob', ''' I t = i % max_path_length; if (t < path_length) { int n_batch = cum_prob.shape()[1]; I s = i / (max_path_length * n_batch); I b = (i - s * (max_path_length * n_batch)) / max_path_length; int ind[] = {s, b, path}; atomicAdd(&cum_prob[ind], prob); } ''', 'ctc_label_prob_sum' )(multiply_seq, path, path_length[:, None], path.shape[1], ret) return ret
def forward_gpu(self, inputs): if self._used_cudnn: x, = self.mpool2d.get_retained_inputs() return self._forward_gpu_compute_indexes_again((x.data, inputs[0])) else: x, = inputs n, c, h, w = x.shape y_h = conv.get_conv_outsize( h, self.kh, self.sy, self.ph, self.cover_all) assert y_h > 0, 'Height in the output should be positive.' y_w = conv.get_conv_outsize( w, self.kw, self.sx, self.pw, self.cover_all) assert y_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype) cuda.elementwise( 'raw T in, raw S indexes, int32 h, int32 w, int32 out_h,' 'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,' 'int32 ph, int32 pw', 'T out', ''' int c0 = i / (out_h * out_w); int out_y = i / out_w % out_h; int out_x = i % out_w; int index = indexes[i]; int max_y = max(0, out_y * sy - ph + index / kw); int max_x = max(0, out_x * sx - pw + index % kw); out = in[max_x + w * (max_y + h * c0)]; ''', 'max_pool_grad_fwd')( x.reduced_view(), self.indexes.reduced_view(), h, w, y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, y) return y,
def forward(self, inputs): xp = backend.get_array_module(*inputs) c_prev, x, c_next, gc, gh = inputs batch = len(x) gx = xp.empty_like(x) ga, gi, gf, go = _extract_gates(gx) # Consider the case that either gradient is not given if gc is None: gc_update = 0 gc_rest = 0 else: gc_update = gc[:batch] gc_rest = gc[batch:] if gh is None: gh = 0 a, i, f, o = _extract_gates(x) if xp is numpy: if intel64.should_use_ideep('>=auto'): xp = intel64.ideep.get_array_module(x) tanh_a = xp.tanh(a) sig_i = _sigmoid(i, xp) sig_f = _sigmoid(f, xp) sig_o = _sigmoid(o, xp) co = xp.tanh(c_next[:batch]) gc_prev = numpy.empty_like(c_prev) # multiply f later gc_prev[:batch] = gh * sig_o * _grad_tanh(co) + gc_update gc = gc_prev[:batch] ga[:] = gc * sig_i * _grad_tanh(tanh_a) gi[:] = gc * tanh_a * _grad_sigmoid(sig_i) gf[:] = gc * c_prev[:batch] * _grad_sigmoid(sig_f) go[:] = gh * co * _grad_sigmoid(sig_o) gc_prev[:batch] *= sig_f # multiply f here gc_prev[batch:] = gc_rest else: gc_prev = xp.empty_like(c_prev) cuda.elementwise( 'T c_prev, T c, T gc, T gh, T a, T i_, T f, T o', 'T gc_prev, T ga, T gi, T gf, T go', ''' COMMON_ROUTINE; T co = tanh(c); T temp = gh * ao * grad_tanh(co) + gc; ga = temp * ai * grad_tanh(aa); gi = temp * aa * grad_sigmoid(ai); gf = temp * c_prev * grad_sigmoid(af); go = gh * co * grad_sigmoid(ao); gc_prev = temp * af; ''', 'lstm_bwd', preamble=_preamble)( c_prev[:batch], c_next[:batch], gc_update, gh, a, i, f, o, gc_prev[:batch], ga, gi, gf, go) gc_prev[batch:] = gc_rest return gc_prev, gx
def _transpose(xs, length): if length == 0: return () xp = backend.get_array_module(*xs) lengths = numpy.empty(length, dtype=numpy.int32) end = length for i, x in enumerate(xs): len_x = len(x) if len_x == end: continue lengths[len_x:end] = i end = len_x lengths[0:end] = len(xs) if xp is numpy: dtype = xs[0].dtype unit = xs[0].shape[1:] outs = tuple([xp.empty((l,) + unit, dtype=dtype) for l in lengths]) for i, x in enumerate(xs): for p, xi in enumerate(x): outs[p][i] = xi else: offsets1 = numpy.empty(len(xs) + 1, dtype=numpy.int32) offsets1[0] = 0 numpy.cumsum([len(x) for x in xs], out=offsets1[1:]) offsets2 = numpy.empty(length + 1, dtype=numpy.int32) offsets2[0] = 0 numpy.cumsum(lengths, dtype=numpy.int32, out=offsets2[1:]) x = xp.concatenate(xs, axis=0) o = xp.empty_like(x) unit = xs[0].size // len(xs[0]) size = length * len(xs) * unit cuda.elementwise( 'int32 len, int32 unit, raw int32 off1, raw int32 off2, raw T vs', 'raw T hs', ''' int ind = i / unit; int off = i - ind * unit; int y = ind / len; int x = ind - y * len; if (off2[x] + y < off2[x + 1]) { hs[(off2[x] + y) * unit + off] = vs[(off1[y] + x) * unit + off]; } ''', 'transpose_sequence' )(length, unit, cuda.to_gpu(offsets1), cuda.to_gpu(offsets2), x, o, size=size) outs = tuple(xp.split(o, offsets2[1:-1])) return outs
def _computes_transition( self, prev_prob, path, path_length, cum_prob, y): xp = cuda.get_array_module(prev_prob) if xp == numpy: n_batch, max_path_length = path.shape mat = xp.full( (3, n_batch, max_path_length), self.zero_padding, 'f') mat[0, :, :] = prev_prob mat[1, :, 1:] = prev_prob[:, :-1] mat[2, :, 2:] = prev_prob[:, :-2] # disable transition between the same symbols # (including blank-to-blank) same_transition = (path[:, :-2] == path[:, 2:]) mat[2, :, 2:][same_transition] = self.zero_padding prob = _logsumexp(mat, xp, axis=0) outside = xp.arange(max_path_length) >= path_length[:, None] prob[outside] = self.zero_padding cum_prob += prob batch_index = xp.arange(n_batch, dtype='i') prob += y[batch_index[:, None], path] else: prob = xp.empty_like(prev_prob) cuda.elementwise( 'raw T prob, raw I path, I path_length, T zero, raw T y', 'T z, T cum_prob', ''' int length = prob.shape()[1]; int b = i / length; int t = i - b * length; if (t >= path_length) { z = zero; cum_prob += zero; return; } int ind1[] = {b, t}; int ind2[] = {b, t - 1}; int ind3[] = {b, t - 2}; float f1 = prob[ind1]; float f2 = (0 <= t - 1) ? prob[ind2] : zero; float f3 = (0 <= t - 2 && path[ind3] != path[ind1]) ? prob[ind3] : zero; // calculates log-sum-exp float m = max(f1, max(f2, f3)); z = m + log(exp(f1 - m) + exp(f2 - m) + exp(f3 - m)); cum_prob += z; int y_ind[] = {b, path[ind1]}; z += y[y_ind]; ''', 'ctc_transition' )(prev_prob, path, path_length[:, None], self.zero_padding, y, prob, cum_prob) return prob
def forward_gpu(self, x): if chainer.should_use_cudnn('>=auto'): self.retain_inputs((0,)) return super(MaxPooling2D, self).forward_gpu(x) self._in_shape = x[0].shape self._in_dtype = x[0].dtype n, c, h, w = x[0].shape y_h = conv.get_conv_outsize( h, self.kh, self.sy, self.ph, self.cover_all) assert y_h > 0, 'Height in the output should be positive.' y_w = conv.get_conv_outsize( w, self.kw, self.sx, self.pw, self.cover_all) assert y_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x[0].dtype) self.indexes = cuda.cupy.empty((n, c, y_h, y_w), dtype=numpy.int32) cuda.elementwise( 'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,' 'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw', 'T out, S indexes', ''' int c0 = i / (out_h * out_w); int out_y = i / out_w % out_h; int out_x = i % out_w; int in_y_0 = max(0, out_y * sy - ph); int in_y_1 = min(h, out_y * sy + kh - ph); int in_x_0 = max(0, out_x * sx - pw); int in_x_1 = min(w, out_x * sx + kw - pw); T maxval = in[in_x_0 + w * (in_y_0 + h * c0)]; int argmax_y = in_y_0; int argmax_x = in_x_0; for (int y = in_y_0; y < in_y_1; ++y) { int offset_y = w * (y + h * c0); for (int x = in_x_0; x < in_x_1; ++x) { float v = in[x + offset_y]; if (maxval < v) { maxval = v; argmax_y = y; argmax_x = x; } } } out = maxval; int argmax_ky = argmax_y + ph - out_y * sy; int argmax_kx = argmax_x + pw - out_x * sx; indexes = argmax_kx + kw * argmax_ky; ''', 'max_pool_fwd')(x[0].reduced_view(), h, w, y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, y, self.indexes) return y,
def forward_gpu(self, inputs_and_grad_outputs): class_weight = cuda.to_gpu(self.class_weight) cupy = cuda.cupy x, t, gloss = inputs_and_grad_outputs if x.size == 0: return cupy.zeros(x.shape, dtype=x.dtype), None if self.y is not None: y = self.y else: y = log_softmax._log_softmax(x) cupy.exp(y, out=y) n_unit = t.size // len(t) if self.coeff is not None: coeff = self.coeff else: gloss = gloss[:, None, ...] coeff = cupy.array(1, dtype=gloss.dtype) # dtype does not matter if self.class_weight is None: gx = cuda.elementwise( 'T y, S t, T gloss, U coeff, S n_channel, S n_unit, ' 'S ignore_label', 'T gx', ''' const int c = (i / n_unit % n_channel); if (t == ignore_label) { gx = T(0); } else { gx = static_cast<T>(gloss * coeff * (y - (c == t))); } ''', 'softmax_crossent_bwd')( y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1], n_unit, self.ignore_label) else: gx = cuda.elementwise( 'T y, raw T w, S t, T gloss, U coeff, ' 'S n_channel, S n_unit, S ignore_label', 'T gx', ''' const int c = (i / n_unit % n_channel); if (t == ignore_label) { gx = T(0); } else { gx = static_cast<T>( gloss * coeff * (y - (c == t)) * w[t]); } ''', 'softmax_crossent_weight_bwd')( y, class_weight, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1], n_unit, self.ignore_label) return gx,
def forward_gpu(self, inputs): gy, = inputs x1, x2 = self.x1, self.x2 gx1 = cuda.elementwise( 'T x1, T x2, T gy', 'T gx1', 'gx1 = (x1 <= x2) ? gy : (T)0.0', 'minimum_bwd1')(x1, x2, gy) gx2 = cuda.elementwise( 'T x1, T x2, T gy', 'T gx1', 'gx1 = (x1 > x2) ? gy : (T)0.0', 'minimum_bwd2')(x1, x2, gy) return utils.sum_to(gx1, x1.shape), utils.sum_to(gx2, x2.shape)
def update_core_gpu(self, param): grad = param.grad if grad is None: return cuda.elementwise( 'T grad, T lr, T momentum', 'T param, T v', '''v = momentum * v - lr * grad; param += v;''', 'momentum_sgd')( grad, self.hyperparam.lr, self.hyperparam.momentum, param.data, self.state['v'])
def forward_gpu(self, x): self.y = cuda.cupy.square(x[0]) # temporary self.scale = cuda.cupy.empty_like(self.y) _cu_conv_sum(self.scale, self.y, self.n) cuda.elementwise( 'T x, T k, T alpha, T beta', 'T y, T scale', '''scale = k + alpha * scale; y = x * pow(scale, -beta);''', 'lrn_fwd')(x[0], self.k, self.alpha, self.beta, self.y, self.scale) return self.y,
def _inverse_indices(indices): xp = cuda.get_array_module(indices) r = xp.empty_like(indices) if xp is numpy: r[indices] = numpy.arange(len(indices)) else: cuda.elementwise( 'S ind', 'raw S r', 'r[ind] = i', 'inverse_indices' )(indices, r) return r
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) x, t, W = inputs self.ignore_mask = (t != self.ignore_label) samples = self._make_samples(t) n_in = x.shape[1] self.wx = cuda.elementwise( 'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx', ''' T f = 0; if (mask == 1) { for (int j = 0; j < c; ++j) { int x_ind[] = {(i / m), j}; int w_ind[] = {k, j}; f += x[x_ind] * W[w_ind]; } } wx = f; ''', 'negative_sampling_wx' )(W, x, self.ignore_mask[:, None], samples, n_in, self.sample_size + 1) loss = cuda.elementwise( 'T wx, int32 c, int32 m, bool mask', 'T y', ''' if (mask) { T f = wx; if (i % m == 0) { f = -f; } if (f < 0) { y = __logf(1 + __expf(f)); } else { y = f + __logf(1 + __expf(-f)); } } else { y = 0; } ''', 'negative_sampling_forward' )(self.wx, n_in, self.sample_size + 1, self.ignore_mask[:, None]) if self.reduce == 'sum': loss = loss.sum() else: # 'no': loss = loss.sum(axis=1) self.samples = samples return loss,
def backward_gpu(self, x, gy): summand = cuda.elementwise( 'T scale, T y, T gy', 'T summand', 'summand = y * gy / scale', 'lrn_bwd_summand')(self.scale, self.y, gy[0]) gx = cuda.cupy.empty_like(x[0]) _cu_conv_sum(gx, summand, self.n) cuda.elementwise( ' T x, T gy, T scale, T beta, T coeff', 'T gx', 'gx = pow(scale, -beta) * gy - coeff * x * gx', 'lrn_bwd')(x[0], gy[0], self.scale, self.beta, 2 * self.alpha * self.beta, gx) return gx,
def forward_gpu(self, inputs): x, y, gy = inputs summand = cuda.elementwise( 'T scale, T y, T gy', 'T summand', 'summand = y * gy / scale', 'lrn_bwd_summand')(self.scale, y, gy) gx = cuda.cupy.empty_like(x) _cu_conv_sum(gx, summand, self.n) cuda.elementwise( ' T x, T gy, T scale, T beta, T coeff', 'T gx', 'gx = pow(scale, -beta) * gy - coeff * x * gx', 'lrn_bwd')(x, gy, self.scale, self.beta, 2 * self.alpha * self.beta, gx) return gx,
def update_core_gpu(self, param): grad = param.grad if grad is None: return cuda.elementwise( 'T grad, T one_minus_rho, T eps', 'T param, T msg, T msdx', '''msg = msg + one_minus_rho * (grad * grad - msg); T dx = sqrt((msdx + eps) / (msg + eps)) * grad; msdx += one_minus_rho * (dx * dx - msdx); param -= dx;''', 'adadelta')(grad, 1 - self.hyperparam.rho, self.hyperparam.eps, param.data, self.state['msg'], self.state['msdx'])
def update_core_gpu(self, param): grad = param.grad if grad is None: return cuda.elementwise( 'T grad, T lr, T momentum', 'T param, T v', ''' v = v * momentum - lr * grad; param += momentum * momentum * v - (1 + momentum) * lr * grad; ''', 'nesterov_ag')( grad, self.hyperparam.lr, self.hyperparam.momentum, param.data, self.state['v'])
def backward(self, inputs, grad_outputs): x, gamma, _ = inputs gy = grad_outputs[0] head_ndim = gamma.ndim + 1 expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim) m = gamma.dtype.type(x.size // gamma.size) axis = (0,) + tuple(range(head_ndim, x.ndim)) xp = backend.get_array_module(x) # Note: we must be in train mode. assert configuration.config.train # NOTE(tommi): cuDNN is not used since it does not support # batch renormalization gbeta = gy.sum(axis=axis) ggamma = (gy * self.x_hat_renorm).sum(axis=axis) gsigma_batch = (gy * self.x_hat).sum(axis=axis) if xp is numpy: scale = (self.r * gamma / self.std)[expander] gx = scale * (gy - (self.x_hat * gsigma_batch[expander] + gbeta[expander]) / m) else: inv_m = numpy.float32(1) / m gx = cuda.elementwise( 'T gy, T x_hat, T gamma, T std, T gsigma_batch, T gbeta, \ T inv_m, T r', 'T gx', 'gx = (r * gamma / std) * (gy - (x_hat * gsigma_batch + gbeta) * \ inv_m)', 'bn_bwd')(gy, self.x_hat, gamma[expander], self.std[expander], gsigma_batch[expander], gbeta[expander], inv_m, self.r[expander]) return gx, ggamma, gbeta
def forward_gpu(self, x): self.retain_inputs((0,)) return cuda.elementwise( 'T x', 'T y', 'y = erfc(x)', 'elementwise_erfc', )(x[0]),
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs # TODO(beam2d): Make it not use the input value = _preprocess_const(x, self.value) gx = cuda.elementwise('T x, T gy, T value', 'T gx', 'gx = -value * gy / (x * x)', 'div_from_const_bwd')(x, gy, value) return gx,
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) p, x, y = inputs return cuda.elementwise( 'T p, T x, T y', 'T z', 'z = p * x + (1 - p) * y', 'linear_interpolate_fwd', )(p, x, y),
def forward(self, inputs): xp = backend.get_array_module(*inputs) x, gamma, beta = inputs # Note: we must be in train mode. assert configuration.config.train head_ndim = gamma.ndim + 1 expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim) # NOTE(tommi): cuDNN is not used since it does not support # batch renormalization axis = (0, ) + tuple(range(head_ndim, x.ndim)) mean = x.mean(axis=axis, dtype=gamma.dtype) var = x.var(axis=axis, dtype=gamma.dtype) self.std = xp.sqrt(var + self.eps, dtype=var.dtype) running_sigma = xp.sqrt(self._running_var + self.eps, dtype=self._running_mean.dtype) self.r = xp.clip(self.std / running_sigma, 1.0 / self.rmax, self.rmax) d = xp.clip((mean - self._running_mean) / running_sigma, -self.dmax, self.dmax) gamma = gamma[expander] beta = beta[expander] if xp is numpy: self.x_hat = _xhat(x, mean, self.std, expander) self.x_hat_renorm = self.x_hat * self.r[expander] + d[expander] y = gamma * self.x_hat_renorm y += beta y = y.astype(dtype=x.dtype) else: self.x_hat, self.x_hat_renorm, y = cuda.elementwise( 'T x, U mean, U std, U gamma, U beta, U r, U d', 'U x_hat, U x_hat_renorm, T y', ''' x_hat = (x - mean) / std; x_hat_renorm = x_hat * r + d; y = gamma * x_hat_renorm + beta; ''', 'brn_fwd')(x, mean[expander], self.std[expander], gamma, beta, self.r[expander], d[expander]) if self.update_statistics: m = x.size // gamma[expander].size self._running_mean *= self.decay adjust = m / max(m - 1., 1.) # unbiased estimation temp_ar = xp.array(mean) temp_ar *= (1 - self.decay) self._running_mean += temp_ar del temp_ar self._running_var *= self.decay temp_ar = xp.array(var) temp_ar *= (1 - self.decay) * adjust self._running_var += temp_ar del temp_ar return y,
def forward_gpu(self, x): self.retain_inputs((0, )) self.retain_outputs((0, )) return cuda.elementwise( 'T x', 'T y', 'y = erfcx(x)', 'elementwise_erfcx', )(x[0]),
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2, 3)) p, x, y, gz = inputs return cuda.elementwise( 'T p, T x, T y, T gz', 'T gp, T gx, T gy', ''' gp = (x - y) * gz; gx = gz * p; gy = gz * (1 - p); ''', 'linear_interpolate_bwd')(p, x, y, gz)
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) y, gy = inputs value = _preprocess_const(y, self.value) gx = cuda.elementwise('T y, T gy, T value', 'T gx', 'gx = log(value) * y * gy', 'pow_const_var_bwd')(y, gy, value) return gx,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs self.val = _preprocess_const(x, self.value) gx = cuda.elementwise('T x, T gy, T value', 'T gx', 'gx = value * pow(x, value - 1) * gy', 'pow_var_const_bwd')(x, gy, self.val) return gx,
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) x1, x2, gy = inputs gx1, gx2 = cuda.elementwise('T x1, T x2, T gy', 'T gx1, T gx2', ('T sqnorm = x1 * x1 + x2 * x2;' 'gx1 = x2 / sqnorm * gy;' 'gx2 = -x1 / sqnorm * gy;'), 'arctan2_bwd')(x1, x2, gy) return gx1, gx2
def entropy_gpu(x): vec = cuda.elementwise( 'T x', 'T y', ''' y = (x == 0) ? 0 : -x*log(x); ''', 'entropy')(x.data) return cuda.cupy.sum(vec, 1)
def log_matrix(self, x, xp): if xp == numpy: res = numpy.ma.log(x).filled(fill_value=self.zero_padding) else: create_recurrence_relation = cuda.elementwise( 'T x, T e', 'T y', 'y = x == 0 ? e : log(x)', 'create_recurrence_relation') res = create_recurrence_relation(x, self.zero_padding) return res.astype(numpy.float32)
def forward_gpu(self, inputs): self.retain_inputs((0, )) x = inputs[0] y = cuda.elementwise( 'T x, T beta, T beta_inv', 'T y', ''' T bx = beta * x; y = (max(bx, (T)0) + log1p(exp(-fabs(bx)))) * beta_inv; ''', 'softplus_fwd')(x, self.beta, self.beta_inv) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) x0, x1, gy = inputs gx0, gx1 = cuda.elementwise( 'T x0, T x1, T gy', 'T gx0, T gx1', ''' gx0 = gy / x1; gx1 = -gx0 * x0 / x1; ''', 'div_bwd')(x0, x1, gy) return utils.sum_to(gx0, x0.shape), utils.sum_to(gx1, x1.shape)
def forward_gpu(self, inputs): self.retain_inputs((0, )) x, = inputs x_hat = cuda.elementwise('T x, T mean, T inv_std', 'T x_hat', 'x_hat = (x - mean) * inv_std', 'groupnorm_x_hat')(x, self.mean[:, None], self.inv_std[:, None]) self.retain_outputs((0, )) return x_hat,
def update_core_gpu(self, param): grad = param.grad if grad is None: return hp = self.hyperparam eps = grad.dtype.type(hp.eps) if hp.eps != 0 and eps == 0: raise ValueError( 'eps of Adam optimizer is too small for {} ({})'.format( grad.dtype.name, hp.eps)) if hp.amsgrad: if AdamRule._amsgrad_kernel is None: AdamRule._amsgrad_kernel = cuda.elementwise( 'T grad, T alpha_t, T one_minus_beta1, T one_minus_beta2, ' 'T eps, T eta, T weight_decay_rate', 'T param, T m, T v, T vhat', '''m += one_minus_beta1 * (grad - m); v += one_minus_beta2 * (grad * grad - v); vhat = max(vhat, v); param -= eta * (alpha_t * m / (sqrt(vhat) + eps) + weight_decay_rate * param);''', 'adam') AdamRule._amsgrad_kernel( grad, self.alpha_t, 1 - hp.beta1, 1 - hp.beta2, hp.eps, hp.eta, hp.weight_decay_rate, param.data, self.state['m'], self.state['v'], self.state['vhat']) else: if AdamRule._kernel is None: AdamRule._kernel = cuda.elementwise( 'T grad, T alpha_t, T one_minus_beta1, T one_minus_beta2, ' 'T eps, T eta, T weight_decay_rate', 'T param, T m, T v', '''m += one_minus_beta1 * (grad - m); v += one_minus_beta2 * (grad * grad - v); param -= eta * (alpha_t * m / (sqrt(v) + eps) + weight_decay_rate * param);''', 'adam') AdamRule._kernel(grad, self.alpha_t, 1 - hp.beta1, 1 - hp.beta2, hp.eps, hp.eta, hp.weight_decay_rate, param.data, self.state['m'], self.state['v'])
def forward(self, xs): xp = backend.get_array_module(*xs) if self.length is None: length = max(len(x) for x in xs) else: length = self.length shape = (len(xs), length) + xs[0].shape[1:] y = xp.empty(shape, xs[0].dtype) if length == 0: return y, # y is an empty array if xp is numpy or any(not x._c_contiguous for x in xs): for i, x in enumerate(xs): l = len(x) if l == length: y[i] = x else: y[i, 0:l] = x y[i, l:] = self.padding else: # This code assumes that all arrays are c_contiguous ptr_shape = (Ellipsis,) + (None,) * xs[0].ndim ptrs = cuda.cupy.array( [x.data for x in xs], numpy.uintp)[ptr_shape] lengths = cuda.cupy.array( [len(x) for x in xs], numpy.int32)[ptr_shape] base = utils.size_of_shape(xs[0].shape[1:]) cuda.elementwise( 'P ptr, int32 length, T pad, int32 base, int32 max_length', 'T y', ''' int d = i / base % max_length; if (d < length) { y = reinterpret_cast<const T*>(ptr)[i % (base * max_length)]; } else { y = pad; } ''', 'pad_sequence_fwd' )(ptrs, lengths, self.padding, base, length, y) return y,
def forward_gpu(self, inputs): x, t, W = inputs self.ignore_mask = (t != self.ignore_label) n_in = x.shape[1] self._make_samples(t) self.wx = cuda.elementwise( 'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx', ''' T f = 0; if (mask == 1) { for (int j = 0; j < c; ++j) { int x_ind[] = {(i / m), j}; int w_ind[] = {k, j}; f += x[x_ind] * W[w_ind]; } } wx = f; ''', 'negative_sampling_wx')(W, x, self.ignore_mask[:, None], self.samples, n_in, self.sample_size + 1) loss = cuda.elementwise( 'T wx, int32 c, int32 m, bool mask', 'T y', ''' if (mask) { T f = wx; if (i % m == 0) { f = -f; } if (f < 0) { y = __logf(1 + __expf(f)); } else { y = f + __logf(1 + __expf(-f)); } } else { y = 0; } ''', 'negative_sampling_forward')(self.wx, n_in, self.sample_size + 1, self.ignore_mask[:, None]) if self.reduce == 'sum': loss = loss.sum() else: # 'no': loss = loss.sum(axis=1) return loss,
def forward_gpu(self, inputs): gy, = inputs gx1, gx2 = cuda.elementwise( 'S cond, T gy', 'T gx1, T gx2', ''' gx1 = cond ? gy : (T)0.0; gx2 = cond ? (T)0.0 : gy; ''', 'maximum_bwd1')(self.cond, gy) return gx1, gx2
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) x0, x1, gy = inputs gx0, gx1 = cuda.elementwise( 'T x0, T x1, T gy, T y', 'T gx0, T gx1', ''' gx0 = x1 * pow(x0, x1 - 1) * gy; gx1 = log(x0) * y * gy; ''', 'pow_var_var_bwd')(x0, x1, gy, self.y) return gx0, gx1
def update_core_gpu(self, param): grad = param.grad if grad is None: return if AdaGradRule._kernel is None: AdaGradRule._kernel = cuda.elementwise( 'T grad, T lr, T eps', 'T param, T h', '''h += grad * grad; param -= lr * grad / (sqrt(h) + eps);''', 'adagrad') AdaGradRule._kernel(grad, self.hyperparam.lr, self.hyperparam.eps, param.data, self.state['h'])
def forward_gpu(self, gy): if self._used_cudnn: x, = self.apoolnd.get_retained_inputs() return self.apoolnd.backward_gpu((x.data, ), gy) n, c = self._in_shape[:2] dims = self._in_shape[2:] ys = gy[0].shape[2:] gx = cuda.cupy.empty(self._in_shape, self._in_dtype) coeff = 1. / functools.reduce(operator.mul, self.ksize) in_params, out_params, operation, name = \ average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate( self.ndim) cuda.elementwise(in_params, out_params, operation, name)( gy[0].reduced_view(), *(dims + ys + self.ksize + self.stride + self.pad + (coeff, gx))) return gx,
def forward_gpu(self, inputs): x, = inputs gx = cuda.elementwise( 'T x, T alpha', 'T gx', 'gx = x >= 0 ? (T)1 : (T)(alpha * exp(x))', 'elu_bwd')( x, self.alpha) self.retain_inputs((0,)) self.retain_outputs((0,)) return gx,
def forward_gpu(self, inputs): self.retain_inputs((0, 1, 2)) x0, x1, gy = inputs return cuda.elementwise( 'T x0, T x1, T gy', 'T gx0, T gx1', ''' gx0 = gy / x1; gx1 = -gx0 * x0 / x1; ''', 'div_bwd')(x0, x1, gy)
def _forward_gpu_compute_indexes_again(self, inputs): x, ggx = inputs n, c, h, w = ggx.shape y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph, self.cover_all) assert y_h > 0, 'Height in the output should be positive.' y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw, self.cover_all) assert y_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype) cuda.elementwise( 'raw T in, raw T ggx, int32 h, int32 w, int32 out_h,' 'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,' 'int32 ph, int32 pw', 'T out', ''' int c0 = i / (out_h * out_w); int out_y = i / out_w % out_h; int out_x = i % out_w; int in_y_0 = max(0, out_y * sy - ph); int in_y_1 = min(h, out_y * sy + kh - ph); int in_x_0 = max(0, out_x * sx - pw); int in_x_1 = min(w, out_x * sx + kw - pw); T maxval = in[in_x_0 + w * (in_y_0 + h * c0)]; int argmax_y = in_y_0; int argmax_x = in_x_0; for (int y = in_y_0; y < in_y_1; ++y) { int offset_y = w * (y + h * c0); for (int x = in_x_0; x < in_x_1; ++x) { float v = in[x + offset_y]; if (maxval < v) { argmax_y = y; argmax_x = x; } } } out = ggx[argmax_x + w * (argmax_y + h * c0)] ''', 'max_pool_grad_fwd_calc_indexes')(x.reduced_view(), ggx.reduced_view(), h, w, y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, y) return y,
def forward_gpu(self, inputs): x, = inputs gx = cuda.elementwise( 'T x, T r, T ir, T r2', 'T gx', 'gx = x >= 0 ? (T) r*pow((r2*x+1),(ir-1)) : (T)(r * exp(r*x))', 'eru_bwd')( x, self.r, self.ir, self.r2) self.retain_inputs((0,)) self.retain_outputs((0,)) return gx,
def forward_gpu(self, inputs): x, = inputs gx = cuda.elementwise( 'T x, T r, T ir, T r2', 'T gx', 'gx = r * pow((r2*abs(x) + 1),(ir - 1))', 'oru_bwd')( x, self.r, self.ir, self.r2) self.retain_inputs((0,)) self.retain_outputs((0,)) return gx,
def forward_gpu(self, inputs): x = inputs[0] b, c, h, w = x.shape y = cuda.cupy.empty_like(x) cuda.elementwise( 'raw T x, int32 c, int32 h, int32 w,' 'int32 kh, int32 kw,' 'int32 dy, int32 dx', 'T y', ''' int b0 = i / (c * h * w); int rest = i % (c * h * w); int c0 = rest / (h * w); rest %= h * w; int out_row = rest / w; int out_col = rest % w; int n_groups = kh * kw; int group_size = c / n_groups; int group_idx = c0 / group_size; // Make sure that center group is last if (group_idx == (n_groups - 1) / 2) { group_idx = n_groups - 1; } else if (group_idx == n_groups - 1) { group_idx = (n_groups - 1) / 2; } int ky = (group_idx / kw) - kh / 2; int kx = (group_idx % kw) - kw / 2; if (group_idx >= n_groups) { ky = 0; kx = 0; } int in_row = -ky * dy + out_row; int in_col = -kx * dx + out_col; if (in_row >= 0 && in_row < h && in_col >= 0 && in_col < w) { y = x[b0 * c * h * w + c0 * h * w + in_row * w + in_col]; } else { y = 0; } ''', 'shift_gpu')(x, c, h, w, self.kh, self.kw, self.dy, self.dx, y) return y,
def col2im_nd_gpu(col, stride, pad, dims): n, c = col.shape[:2] # (n, c, k_1, ..., k_N, out_1, ..., out_N) mid = (len(col.shape) - 2) // 2 + 2 ksize = col.shape[2:mid] outs = col.shape[mid:] ndim = len(dims) assert len(outs) == len(ksize) == len(stride) == len(pad) == ndim img_shape = (n, c) + dims # (n, c, d_1, d_2, ..., d_N) img = cuda.cupy.empty(img_shape, dtype=col.dtype) in_params, out_params, operation, name = \ conv_nd_kernel.Col2imNDKernel.generate(ndim) cuda.elementwise(in_params, out_params, operation, name)(col.reduced_view(), *(dims + outs + ksize + stride + pad + (img, ))) return img
def forward_gpu(self, inputs): if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3: # With cuDNN v3 or greater, use cuDNN implementation for inputs # with spatial dimensions of two or more. return self.forward_cudnn(inputs) ndim = self.ndim ksize = self.ksize stride = self.stride pad = self.pad pad_value = self.pad_value cover_all = self.cover_all x, = inputs in_shape = x.shape in_dtype = x.dtype n, c = in_shape[:2] idims = in_shape[2:] odims = tuple( conv.get_conv_outsize(d, k, s, p, cover_all=cover_all) for (d, k, s, p) in six.moves.zip(idims, ksize, stride, pad)) # (n, c, y_1, y_2, ..., y_N) y_shape = (n, c) + odims y = cuda.cupy.empty(y_shape, dtype=x.dtype) if pad_value is None: coeff = self._get_pooling_width(cuda.cupy, idims, x.dtype) coeff = cuda.cupy.reciprocal(coeff, out=coeff) else: assert pad_value == 0 coeff = 1. / functools.reduce(operator.mul, ksize) in_params, out_params, operation, name = \ average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate( ndim) cuda.elementwise(in_params, out_params, operation, name)(x.reduced_view(), *(idims + odims + ksize + stride + pad + (coeff, y))) self.coeff = coeff self._in_shape = in_shape self._in_dtype = in_dtype return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) y, gy = inputs if (chainer.should_use_cudnn('==always') and self.x is not None and gy.flags.c_contiguous): gx = cudnn.activation_backward(self.x, y, gy, _mode) else: gx = cuda.elementwise('T y, T gy', 'T gx', 'gx = gy * (1 - y * y)', 'tanh_bwd')(y, gy) return gx,
def backward_gpu(self, inputs, grad_outputs): cupy = cuda.cupy x, t = inputs if x.size == 0: return cupy.zeros(x.shape, dtype=x.dtype), None if self.y is not None: y = self.y else: y = log_softmax._log_softmax(x) cupy.exp(y, out=y) gloss = grad_outputs[0] n_unit = t.size // len(t) if self.reduce == 'mean': coeff = gloss * self._coeff else: coeff = gloss[:, None, ...] if self.class_weight is None: gx = cuda.elementwise( 'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label', 'T gx', ''' const int c = (i / n_unit % n_channel); gx = t == ignore_label ? 0 : coeff * (y - (c == t)); ''', 'softmax_crossent_bwd')( y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit, self.ignore_label) else: gx = cuda.elementwise( 'T y, raw T w, S t, T coeff, S n_channel, S n_unit, ' 'S ignore_label', 'T gx', ''' const int c = (i / n_unit % n_channel); gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t]; ''', 'softmax_crossent_weight_bwd')( y, self.class_weight, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit, self.ignore_label) return gx, None