def update_core_gpu(self, param):
    grad = param.grad
    if grad is None:
        return
    cuda.elementwise('T grad, T lr', 'T param',
                     'param -= lr * grad',
                     'sgd')(grad, self.hyperparam.lr, param.data)
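# A minimal sketch of the elementwise update the 'sgd' kernel above performs,
# written for NumPy arrays on the CPU. The standalone function and its
# argument names are illustrative, not part of the original optimizer class.
def sgd_update_cpu(param, grad, lr):
    # param -= lr * grad, applied in place, exactly as in the kernel body.
    param -= lr * grad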
def forward_gpu(self, x):
    if (cuda.cudnn_enabled and self.use_cudnn and
            pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
        # With cuDNN v3 or greater, use cuDNN implementation for inputs
        # with spatial dimensions of two or more.
        if _cudnn_version >= 3000 and self.ndim >= 2:
            return super(AveragePoolingND, self).forward_gpu(x)
        # With cuDNN v2, use cuDNN implementation only for inputs with
        # spatial dimensions of two.
        elif self.ndim == 2:
            return super(AveragePoolingND, self).forward_gpu(x)

    n, c = x[0].shape[:2]
    dims = x[0].shape[2:]
    ys = tuple(conv_nd.get_conv_outsize(d, k, s, p,
                                        cover_all=self.cover_all)
               for (d, k, s, p) in six.moves.zip(
                   dims, self.ksize, self.stride, self.pad))
    # (n, c, y_1, y_2, ..., y_N)
    y_shape = (n, c) + ys
    y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
    coeff = 1. / functools.reduce(operator.mul, self.ksize)

    in_params, out_params, operation, name = \
        average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate(
            self.ndim)
    cuda.elementwise(in_params, out_params, operation, name)(
        x[0].reduced_view(),
        *(dims + ys + self.ksize + self.stride + self.pad + (coeff, y)))

    return y,
def backward_gpu(self, x, gy):
    if (cuda.cudnn_enabled and self.use_cudnn and
            pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
        # With cuDNN v3 or greater, use cuDNN implementation for inputs
        # with spatial dimensions of two or more.
        if _cudnn_version >= 3000 and self.ndim >= 2:
            return super(AveragePoolingND, self).backward_gpu(x, gy)
        # With cuDNN v2, use cuDNN implementation only for inputs with
        # spatial dimensions of two.
        elif self.ndim == 2:
            return super(AveragePoolingND, self).backward_gpu(x, gy)

    n, c = x[0].shape[:2]
    dims = x[0].shape[2:]
    ys = gy[0].shape[2:]
    gx = cuda.cupy.empty_like(x[0])
    coeff = 1. / functools.reduce(operator.mul, self.ksize)

    in_params, out_params, operation, name = \
        average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate(
            self.ndim)
    cuda.elementwise(in_params, out_params, operation, name)(
        gy[0].reduced_view(),
        *(dims + ys + self.ksize + self.stride + self.pad + (coeff, gx)))

    return gx,
def update_one_gpu(self, param, grad, v):
    cuda.elementwise(
        '''float* param, const float* grad, float* v,
           float lr, float momentum''',
        '''v[i] = momentum * v[i] - lr * grad[i];
           param[i] += v[i];''',
        'momentum_sgd')(param, grad, v, self.lr, self.momentum)
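# A hedged NumPy sketch of the classical-momentum rule computed by the
# 'momentum_sgd' kernel above; the free function is illustrative only and
# assumes param, grad and v are NumPy arrays.
def momentum_sgd_update_cpu(param, grad, v, lr, momentum):
    # v = momentum * v - lr * grad; param += v (both applied in place).
    v *= momentum
    v -= lr * grad
    param += v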
def backward_gpu(self, x, gy):
    if self._used_cudnn:
        return super(AveragePooling2D, self).backward_gpu(x, gy)
    n, c, h, w = self._in_shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.cupy.empty(self._in_shape, self._in_dtype)
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        'raw T gy, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
        'T gx',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        int hc0 = out_h * c0;
        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            val = val + gy[out_x + out_w * (out_y + hc0)];
          }
        }
        gx = val * coeff;
        ''', 'avg_pool_bwd')(gy[0].reduced_view(), h, w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw, coeff, gx)
    return gx,
def _cu_conv_sum(y, x, n):
    # Convolutional sum
    # TODO(beam2d): Use scan computation
    rdim = x.size // (x.shape[0] * x.shape[1])
    cuda.elementwise(
        'float* y, const float* x, int rdim, int N, int n_',
        '''
        int half_n = n_ / 2;
        int offset = i / rdim * N * rdim + i % rdim;
        const float* xi = x + offset;
        float* yi = y + offset;
        float sum_part = 0;
        for (int j = 0; j < N + half_n; ++j) {
          if (j < N) {
            sum_part += xi[j * rdim];
          }
          if (j >= n_) {
            sum_part -= xi[(j - n_) * rdim];
          }
          if (j >= half_n) {
            yi[(j - half_n) * rdim] = sum_part;
          }
        }
        ''', 'lrn_conv_sum')(y, x, rdim, x.shape[1], n,
                             range=slice(0, x.shape[0] * rdim, 1))
def forward_gpu(self, x):
    y = x[1].copy()
    cuda.elementwise(
        'float* y, float* b, int nc',
        'y[i] = b[i / nc] - y[i];',
        'sub_bias')(y, x[0], x[1].shape[1])
    return y,
def backward_gpu(self, x, gy):
    cuda.elementwise(
        'T gy, int32 x, int32 n_out', 'raw T gW',
        'atomicAdd(&gW[x * n_out + i % n_out], gy)',
        'embed_id_bwd')(
            gy[0], x[0][:, numpy.newaxis], self.gW.shape[1], self.gW)
    return None,
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        '''float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw''',
        '''
        int c0 = i / (kh * kw * out_h * out_w);
        int ky = i / (kw * out_h * out_w) % kh;
        int kx = i / (out_h * out_w) % kw;
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y = ky + out_y * sy - ph;
        int in_x = kx + out_x * sx - pw;
        if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
          col[i] = img[in_x + w * (in_y + h * c0)];
        } else {
          col[i] = 0;
        }
        ''', 'im2col')(col, img, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return col
def col2im_gpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1):
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
        'int32 dx, int32 dy',
        'T img',
        '''
        int c0 = i / (h * w);
        int y = i / w % h;
        int x = i % w;
        T val = 0;
        for (int ky = 0; ky < kh; ++ky) {
          int out_y = (y + ph - ky * dy);
          if (0 > out_y || out_y >= out_h * sy) continue;
          if (out_y % sy != 0) continue;
          out_y /= sy;
          for (int kx = 0; kx < kw; ++kx) {
            int out_x = (x + pw - kx * dx);
            if (0 > out_x || out_x >= out_w * sx) continue;
            if (out_x % sx != 0) continue;
            out_x /= sx;
            int k = out_y + out_h * (kx + kw * (ky + kh * c0));
            val = val + col[out_x + out_w * k];
          }
        }
        img = val;
        ''',
        'col2im')(col.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dx, dy, img)
    return img
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False, dy=1, dx=1):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all, dy)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all, dx)
    assert out_w > 0, 'Width in the output should be positive.'
    col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
        'int32 dy, int32 dx',
        'T col',
        '''
        int c0 = i / (kh * kw * out_h * out_w);
        int ky = i / (kw * out_h * out_w) % kh;
        int kx = i / (out_h * out_w) % kw;
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y = ky * dy + out_y * sy - ph;
        int in_x = kx * dx + out_x * sx - pw;
        if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
          col = img[in_x + w * (in_y + h * c0)];
        } else {
          col = 0;
        }
        ''',
        'im2col')(img.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
    return col
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T col',
        '''
        int c0 = i / (kh * kw * out_h * out_w);
        int ky = i / (kw * out_h * out_w) % kh;
        int kx = i / (out_h * out_w) % kw;
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y = ky + out_y * sy - ph;
        int in_x = kx + out_x * sx - pw;
        if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
          col = img[in_x + w * (in_y + h * c0)];
        } else {
          col = 0;
        }
        ''', 'im2col')(img.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, col)
    return col
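# A hedged NumPy reference for the non-dilated 'im2col' kernel above. The
# nested-loop helper mirrors the kernel's index arithmetic and assumes the
# usual output-size formula for cover_all=False; it is a readability aid,
# not the library's CPU path.
import numpy

def im2col_cpu_reference(img, kh, kw, sy, sx, ph, pw):
    n, c, h, w = img.shape
    out_h = (h + 2 * ph - kh) // sy + 1
    out_w = (w + 2 * pw - kw) // sx + 1
    col = numpy.zeros((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    for ky in range(kh):
        for kx in range(kw):
            for oy in range(out_h):
                for ox in range(out_w):
                    iy = ky + oy * sy - ph  # same in_y as the kernel
                    ix = kx + ox * sx - pw  # same in_x as the kernel
                    if 0 <= iy < h and 0 <= ix < w:
                        col[:, :, ky, kx, oy, ox] = img[:, :, iy, ix]
    return col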
def forward(self, inputs):
    c_prev, x = inputs
    a, i, f, o = _extract_gates(x)
    batch = len(x)

    if isinstance(x, numpy.ndarray):
        self.a = numpy.tanh(a)
        self.i = _sigmoid(i)
        self.f = _sigmoid(f)
        self.o = _sigmoid(o)

        c_next = numpy.empty_like(c_prev)
        c_next[:batch] = self.a * self.i + self.f * c_prev[:batch]
        h = self.o * numpy.tanh(c_next[:batch])
    else:
        c_next = cuda.cupy.empty_like(c_prev)
        h = cuda.cupy.empty_like(c_next[:batch])
        cuda.elementwise(
            'T c_prev, T a, T i_, T f, T o', 'T c, T h',
            '''
            COMMON_ROUTINE;
            c = aa * ai + af * c_prev;
            h = ao * tanh(c);
            ''',
            'lstm_fwd', preamble=_preamble)(
                c_prev[:batch], a, i, f, o, c_next[:batch], h)

    c_next[batch:] = c_prev[batch:]
    self.c = c_next[:batch]
    return c_next, h
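# A hedged NumPy sketch of the per-element recurrence that 'lstm_fwd'
# evaluates (COMMON_ROUTINE computes aa = tanh(a) and ai/af/ao = sigmoids of
# the gates). The helper is illustrative and covers only the full-batch case.
import numpy

def lstm_cell_cpu(c_prev, a, i, f, o):
    def sig(z):
        return 1.0 / (1.0 + numpy.exp(-z))
    c = numpy.tanh(a) * sig(i) + sig(f) * c_prev
    h = sig(o) * numpy.tanh(c)
    return c, h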
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        '''float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw''',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        float val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          int ky = y - out_y * sy;
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            int kx = x - out_x * sx;
            val += col[out_x + out_w * (out_y + out_h *
                       (kx + kw * (ky + kh * c0)))];
          }
        }
        img[i] = val;
        ''', 'col2im')(img, col, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return img
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T img',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          int ky = y - out_y * sy;
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            int kx = x - out_x * sx;
            int k = out_y + out_h * (kx + kw * (ky + kh * c0));
            val = val + col[out_x + out_w * k];
          }
        }
        img = val;
        ''', 'col2im')(col.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, img)
    return img
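# A hedged NumPy reference for the non-dilated 'col2im' kernel above: each
# input pixel accumulates every patch entry that covers it (the transpose of
# im2col). The helper is illustrative only.
import numpy

def col2im_cpu_reference(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape
    img = numpy.zeros((n, c, h, w), dtype=col.dtype)
    for ky in range(kh):
        for kx in range(kw):
            for oy in range(out_h):
                for ox in range(out_w):
                    iy = ky + oy * sy - ph
                    ix = kx + ox * sx - pw
                    if 0 <= iy < h and 0 <= ix < w:
                        img[:, :, iy, ix] += col[:, :, ky, kx, oy, ox]
    return img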
def backward_gpu(self, inputs, grad_outputs):
    gx = cuda.empty_like(inputs[0])
    cuda.elementwise(
        'float* y, const float* b, const int n_channel',
        'y[i] = b[i / n_channel]',
        'sum_axis_bwd')(gx, grad_outputs[0], gx.shape[1])
    return gx,
def forward_gpu(self, x):
    if cudnn.enabled and self.use_cudnn:
        return super(AveragePooling2D, self).forward_gpu(x)

    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        '''float* out, const float* in, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw, float coeff''',
        '''
        int c0 = i / (out_h * out_w);
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y_0 = max(0, out_y * sy - ph);
        int in_y_1 = min(h, out_y * sy + kh - ph);
        int in_x_0 = max(0, out_x * sx - pw);
        int in_x_1 = min(w, out_x * sx + kw - pw);

        float val = 0;
        for (int y = in_y_0; y < in_y_1; ++y) {
          int offset_y = w * (y + h * c0);
          for (int x = in_x_0; x < in_x_1; ++x) {
            val += in[x + offset_y];
          }
        }
        out[i] = val * coeff;
        ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, coeff)
    return y,
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        return super(AveragePooling2D, self).backward_gpu(x, gy)

    n, c, h, w = x[0].shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.empty_like(x[0])
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        '''float* gx, const float* gy, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw, float coeff''',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        int hc0 = out_h * c0;

        float val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            val += gy[out_x + out_w * (out_y + hc0)];
          }
        }
        gx[i] = val * coeff;
        ''', 'avg_pool_bwd')(gx, gy[0], h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, coeff)
    return gx,
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value,
            cudnn.get_ptr(self.y), desc.value, cudnn.get_ptr(gy[0]),
            0, desc.value, cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]
        c = gx.shape[1]
        sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
            const float* row = ydy + i * c;
            float sum = 0;
            for (int j = 0; j < c; ++j) {
              sum += row[j];
            }
            sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
def backward_gpu(self, x, gy):
    gx0 = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx0, const float* x0, const float* gy',
        'gx0[i] = ((x0[i] > 0) - (x0[i] < 0)) * gy[i]',
        'abs_bwd')(gx0, x[0], gy[0])
    return gx0,
def backward(self, inputs, grad_outputs):
    gy = grad_outputs[0]
    x = _as_mat(inputs[0])
    W = inputs[1]
    xp = cuda.get_array_module(*inputs)

    # gradient of z = xW + b
    gz = xp.zeros((gy.shape[0], W.shape[1], gy.shape[1]), x.dtype)
    if xp is numpy:
        idx0 = xp.arange(len(gy))[:, None]
        idx1 = xp.arange(gy.shape[1])
        gz[idx0, self.argmax, idx1] = gy
    else:
        gz_r = xp.rollaxis(gz, 1)
        cuda.elementwise(
            'T gy, S argmax, int32 n', 'raw T gz',
            'gz[argmax * n + i] = gy',
            'maxout_bwd'
        )(gy, self.argmax, gz_r.size // len(gz_r), gz_r)

    gx = xp.tensordot(gz, W, ((1, 2), (1, 2))).reshape(inputs[0].shape)
    gW = xp.tensordot(x, gz, (0, 0))
    if len(inputs) == 3:
        gb = gz.sum(axis=0)
        return gx, gW, gb
    else:
        return gx, gW
def forward_gpu(self, x):
    if (chainer.should_use_cudnn('>=auto') and
            pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
        # With cuDNN v3 or greater, use cuDNN implementation for inputs
        # with spatial dimensions of two or more.
        if _cudnn_version >= 3000 and self.ndim >= 2:
            return super(MaxPoolingND, self).forward_gpu(x)
        # With cuDNN v2, use cuDNN implementation only for inputs with
        # spatial dimensions of two.
        elif self.ndim == 2:
            return super(MaxPoolingND, self).forward_gpu(x)

    self.retain_inputs(())
    self._in_shape = x[0].shape
    self._in_dtype = x[0].dtype

    n, c = x[0].shape[:2]
    dims = x[0].shape[2:]
    ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all)
               for (d, k, s, p) in six.moves.zip(
                   dims, self.ksize, self.stride, self.pad))
    # (n, c, y_1, y_2, ..., y_N)
    y_shape = (n, c) + ys
    y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
    self.indexes = cuda.cupy.empty(y_shape, dtype=numpy.int32)

    in_params, out_params, operation, name = \
        max_pooling_nd_kernel.MaxPoolingNDKernelForward.generate(self.ndim)
    cuda.elementwise(in_params, out_params, operation, name)(
        x[0].reduced_view(),
        *(dims + ys + self.ksize + self.stride + self.pad +
          (y, self.indexes)))

    return y,
def forward_gpu(self, inputs):
    if self._used_cudnn:
        x, = self.mpool2d.get_retained_inputs()
        return self._forward_gpu_compute_indexes_again((x.data, inputs[0]))
    else:
        x, = inputs
        n, c, h, w = x.shape
        y_h = conv.get_conv_outsize(
            h, self.kh, self.sy, self.ph, self.cover_all)
        assert y_h > 0, 'Height in the output should be positive.'
        y_w = conv.get_conv_outsize(
            w, self.kw, self.sx, self.pw, self.cover_all)
        assert y_w > 0, 'Width in the output should be positive.'
        y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype)

        cuda.elementwise(
            'raw T in, raw S indexes, int32 h, int32 w, int32 out_h,'
            'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,'
            'int32 ph, int32 pw',
            'T out',
            '''
            int c0 = i / (out_h * out_w);
            int out_y = i / out_w % out_h;
            int out_x = i % out_w;
            int index = indexes[i];
            int max_y = max(0, out_y * sy - ph + index / kw);
            int max_x = max(0, out_x * sx - pw + index % kw);
            out = in[max_x + w * (max_y + h * c0)];
            ''', 'max_pool_grad_fwd')(
                x.reduced_view(), self.indexes.reduced_view(), h, w,
                y_h, y_w, self.kh, self.kw, self.sy, self.sx,
                self.ph, self.pw, y)
        return y,
def backward(self, inputs, grad_outputs):
    xp = cuda.get_array_module(*inputs)
    x, W = inputs
    gy = grad_outputs[0]
    gW = xp.zeros_like(W)

    if xp is numpy:
        # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
        # too slow.
        for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
            if ix == self.ignore_label:
                continue
            gW[ix] += igy
    else:
        if self.ignore_label is None:
            cuda.elementwise(
                'T gy, int32 x, int32 n_out', 'raw T gW',
                'int w_ind[] = {x, i % n_out}; atomicAdd(&gW[w_ind], gy)',
                'embed_id_bwd')(
                    gy, xp.expand_dims(x, -1), gW.shape[1], gW)
        else:
            cuda.elementwise(
                'T gy, int32 x, int32 n_out, int32 ignore', 'raw T gW',
                '''
                if (x != ignore) {
                  int w_ind[] = {x, i % n_out};
                  atomicAdd(&gW[w_ind], gy);
                }
                ''',
                'embed_id_bwd_ignore_label')(
                    gy, xp.expand_dims(x, -1), gW.shape[1],
                    self.ignore_label, gW)
    return None, gW
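# What the atomicAdd scatter in 'embed_id_bwd' accumulates, expressed with
# numpy.add.at for clarity; as the comment above notes, ufunc.at is slow,
# which is why the library code avoids it. The helper and its arguments are
# assumptions made for illustration.
import numpy

def embed_id_grad_cpu(x, gy, n_vocab, n_out, ignore_label=None):
    # x: (batch,) integer ids, gy: (batch, n_out) upstream gradient.
    gW = numpy.zeros((n_vocab, n_out), dtype=gy.dtype)
    mask = numpy.ones(len(x), dtype=bool) if ignore_label is None \
        else (x != ignore_label)
    numpy.add.at(gW, x[mask], gy[mask])
    return gW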
def forward_gpu(self, x):
    if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3:
        # Use the cuDNN implementation for inputs with spatial
        # dimensions of two or three.
        return super(AveragePoolingND, self).forward_gpu(x)

    self.retain_inputs(())
    self._in_shape = x[0].shape
    self._in_dtype = x[0].dtype

    n, c = x[0].shape[:2]
    dims = x[0].shape[2:]
    ys = tuple(conv_nd.get_conv_outsize(d, k, s, p,
                                        cover_all=self.cover_all)
               for (d, k, s, p) in six.moves.zip(
                   dims, self.ksize, self.stride, self.pad))
    # (n, c, y_1, y_2, ..., y_N)
    y_shape = (n, c) + ys
    y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
    coeff = 1. / functools.reduce(operator.mul, self.ksize)

    in_params, out_params, operation, name = \
        average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate(
            self.ndim)
    cuda.elementwise(in_params, out_params, operation, name)(
        x[0].reduced_view(),
        *(dims + ys + self.ksize + self.stride + self.pad + (coeff, y)))

    return y,
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, float value',
        'gx[i] = value * __powf(x[i], value - 1) * gy[i]',
        'pow_var_const_bwd')(gx, x[0], gy[0], self.value)
    return gx,
def forward_gpu(self, x):
    y = cuda.empty((x[0].size, self.W.shape[1]), dtype=numpy.float32)
    cuda.elementwise(
        'float* y, const float* W, const int* x, int n_out',
        'y[i] = W[x[i / n_out] * n_out + i % n_out]',
        'embed_id_fwd')(y, self.W, x[0], self.W.shape[1])
    return y,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
            const int c = (i / n_unit % n_channel);
            gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
            const int c = (i / n_unit % n_channel);
            gx = t == -1 ? 0 : coeff[0] * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit)
    return gx, None
def update_one_gpu(self, param, grad, ms):
    cuda.elementwise(
        '''float* param, const float* grad, float* ms,
           float lr, float alpha, float eps''',
        '''ms[i] = alpha * ms[i] + (1 - alpha) * grad[i] * grad[i];
           param[i] -= lr * grad[i] / (sqrtf(ms[i]) + eps);''',
        'rmsprop')(param, grad, ms, self.lr, self.alpha, self.eps)
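# A hedged NumPy sketch of the 'rmsprop' kernel update above; the free
# function is illustrative only.
import numpy

def rmsprop_update_cpu(param, grad, ms, lr, alpha, eps):
    # ms = alpha*ms + (1-alpha)*grad^2; param -= lr*grad / (sqrt(ms) + eps)
    ms *= alpha
    ms += (1 - alpha) * grad * grad
    param -= lr * grad / (numpy.sqrt(ms) + eps)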
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, float value',
        'gx[i] = -value * gy[i] / (x[i] * x[i])',
        'div_from_const_bwd')(gx, x[0], gy[0], self.value)
    return gx,
def backward_gpu(self, x, gy):
    if cuda.cudnn_enabled and self.use_cudnn:
        return super(MaxPooling2D, self).backward_gpu(x, gy)

    n, c, h, w = x[0].shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.cupy.empty_like(x[0])
    cuda.elementwise(
        'raw T gy, raw S indexes, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw',
        'T gx',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          int ky = y - out_y * sy;
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            int kx = x - out_x * sx;
            int offset = out_x + out_w * (out_y + out_h * c0);
            if (indexes[offset] == kx + kw * ky) {
              val += gy[offset];
            }
          }
        }
        gx = val;
        ''', 'max_pool_bwd')(gy[0].reduced_view(),
                             self.indexes.reduced_view(),
                             h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, gx)
    return gx,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t, W = inputs
    gloss, = grad_outputs
    n_in = x.shape[1]
    gx = cupy.zeros_like(x)
    gW = cupy.zeros_like(W)
    cuda.elementwise(
        '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
           raw T codes, raw int32 begins, raw T gloss, int32 c,
           int32 max_length''',
        'raw T gx, raw T gw',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];
          T code = codes[p];

          T g = -gloss[0] * code / (1.0 + exp(wxy));
          for (int j = 0; j < c; ++j) {
            int w_ind[] = {node, j};
            int x_ind[] = {ind, j};
            atomicAdd(&gx[x_ind], g * w[w_ind]);
            atomicAdd(&gw[w_ind], g * x[x_ind]);
          }
        }
        ''',
        'binary_hierarchical_softmax_bwd')(
            self.wxy, x, W, t, self.paths, self.codes, self.begins,
            gloss, n_in, self.max_length, gx, gW)
    return gx, None, gW
def forward_gpu(self, inputs):
    x, t = inputs
    n_in = x.shape[1]
    self._make_samples(t)

    self.wx = cuda.elementwise(
        'raw T W, raw T x, S k, int32 c, int32 m', 'T wx',
        '''
        T f = 0;
        for (int j = 0; j < c; ++j) {
          f += x[(i / m) * c + j] * W[k * c + j];
        }
        wx = f;
        ''',
        'negative_sampling_wx'
    )(self.W, x, self.samples, n_in, self.sample_size + 1)

    y = cuda.elementwise(
        'T wx, int32 c, int32 m', 'T y',
        '''
        T f = wx;
        if (i % m == 0) {
          f = -f;
        }
        T loss;
        if (f < 0) {
          loss = __logf(1 + __expf(f));
        } else {
          loss = f + __logf(1 + __expf(-f));
        }
        y = loss;
        ''',
        'negative_sampling_forward'
    )(self.wx, n_in, self.sample_size + 1)

    # TODO(okuta): merge elementwise
    loss = cuda.cupy.sum(y)
    return loss,
def backward_gpu(self, inputs, loss):
    x, t = inputs
    gloss, = loss
    n_in = x.shape[1]
    gx = cuda.zeros_like(x)
    cuda.elementwise(
        '''const float* wxy, float* gx, float* gw, const float* x,
           const float* w, const int* ts, const int* paths,
           const float* codes, const int* begins, const float* gloss,
           int c, int max_length''',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];
          float code = codes[p];
          gx = &gx[ind * c];
          x = &x[ind * c];

          float g = -*gloss * code / (1.0 + exp(wxy[i]));
          for (int j = 0; j < c; ++j) {
            atomicAdd(gx + j, g * w[node * c + j]);
            atomicAdd(gw + node * c + j, g * x[j]);
          }
        }
        ''',
        'binary_hierarchical_softmax_bwd'
    )(self.wxy, gx, self.gW, x, self.W, t, self.paths, self.codes,
      self.begins, gloss, n_in, self.max_length)
    return gx, None
def forward(self, inputs):
    self.retain_inputs((0, 1, 2))
    x, gamma, gy = inputs
    expander = self.expander
    inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
    xp = cuda.get_array_module(x)

    if self.use_cudnn:
        cudnn_mode = self.mode.get_cudnn_mode()
        x = cuda.cupy.ascontiguousarray(x)
        gamma = cuda.cupy.ascontiguousarray(gamma)
        gy = cuda.cupy.ascontiguousarray(gy)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x))
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes
        gx = cuda.cupy.empty_like(x)
        ggamma = cuda.cupy.empty_like(gamma)
        gbeta = cuda.cupy.empty_like(gamma)
        libcudnn.batchNormalizationBackward(
            handle, cudnn_mode, one.data, zero.data,
            one.data, zero.data, x_desc.value, x.data.ptr,
            x_desc.value, gy.data.ptr, x_desc.value, gx.data.ptr,
            derivedBnDesc.value, gamma.data.ptr, ggamma.data.ptr,
            gbeta.data.ptr, self.eps, self.mean.data.ptr,
            self.inv_std.data.ptr)
    else:
        gbeta = gy.sum(axis=self.axis)
        x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])
        ggamma = (gy * x_hat).sum(axis=self.axis)
        if xp is numpy:
            gx = (gamma * self.inv_std)[expander] * (
                gy - (x_hat * ggamma[expander] + gbeta[expander]) * inv_m)
        else:
            gx = cuda.elementwise(
                '''
                T gy, T x_hat, T gamma, T inv_std, T ggamma, T gbeta,
                T inv_m
                ''',
                'T gx',
                '''
                gx = (gamma * inv_std) * (
                    gy - (x_hat * ggamma + gbeta) * inv_m)
                ''', 'bn_bwd')(gy, x_hat, gamma[expander],
                               self.inv_std[expander], ggamma[expander],
                               gbeta[expander], inv_m)
    self.retain_outputs((0, 1))
    return gx, ggamma, gbeta
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
          self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs):
    x, W, gy = inputs
    masked = cuda.elementwise(
        'T x, T cond, T gy', 'T masked',
        'masked = cond >= 0 ? (T)0 : (T)(x * gy)',
        'prelu_masked')(x, self.cond, gy)
    if self.reduce_axes is None:
        gW = masked.copy()
    else:
        gW = masked.sum(axis=self.reduce_axes)

    gx = masked  # reuse buffer
    _fwd_kern()(gy, self.cond, W.reshape(self.extended_shape), gx)

    self.retain_inputs((0, 1, 2))
    return gx, gW
def backward(self, inputs, grads):
    c_prev, x, u = inputs
    gc, gh = grads
    if gc is None:
        gc = 0
    if gh is None:
        gh = 0

    n_unit = x.shape[1]
    x_bar = u[:, 0:n_unit]
    f_in = u[:, n_unit:n_unit * 2]
    r_in = u[:, n_unit * 2:]

    gc_prev = cuda.cupy.empty_like(c_prev)
    gx = cuda.cupy.empty_like(x)
    gu = cuda.cupy.empty_like(u)
    gx_bar = gu[:, 0:n_unit]
    gf_in = gu[:, n_unit:n_unit * 2]
    gr_in = gu[:, n_unit * 2:]

    cuda.elementwise(
        '''T c_prev, T x, T x_bar, T f_in, T r_in, int32 n_unit,
           T gc, T gh, T c''',
        'T gc_prev, T gx, T gx_bar, T gf_in, T gr_in',
        '''
        float f = sigmoid(f_in);
        float r = sigmoid(r_in);
        gx = gh * (1 - r);
        float tanh_c = tanh(c);
        float g = gh * r * grad_tanh(tanh_c) + gc;
        gc_prev = g * f;
        gx_bar = g * (1 - f);
        gf_in = g * grad_sigmoid(f) * (c_prev - x_bar);
        gr_in = gh * grad_sigmoid(r) * (tanh_c - x);
        ''',
        'sru_backward', preamble=_preamble)(
            c_prev, x, x_bar, f_in, r_in, n_unit, gc, gh, self.c,
            gc_prev, gx, gx_bar, gf_in, gr_in)
    return gc_prev, gx, gu
def backward_gpu(self, x, gy):
    if (cuda.cudnn_enabled and self.use_cudnn and
            pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
        # With cuDNN v3 or greater, use cuDNN implementation for inputs
        # with spatial dimensions of two or more.
        if _cudnn_version >= 3000 and self.ndim >= 2:
            return super(MaxPoolingND, self).backward_gpu(x, gy)
        # With cuDNN v2, use cuDNN implementation only for inputs with
        # spatial dimensions of two.
        elif self.ndim == 2:
            return super(MaxPoolingND, self).backward_gpu(x, gy)

    n, c = x[0].shape[:2]
    dims = x[0].shape[2:]
    ys = gy[0].shape[2:]
    gx = cuda.cupy.empty_like(x[0])

    ndim = self.ndim
    in_params, out_params, operation, name = \
        max_pooling_nd_kernel.MaxPoolingNDKernelBackward.generate(ndim)
    cuda.elementwise(in_params, out_params, operation, name)(
        gy[0].reduced_view(), self.indexes.reduced_view(),
        *(dims + ys + self.ksize + self.stride + self.pad + (gx,)))

    return gx,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit, raw T weights',
        'T gx',
        '''
        const int c = (i / n_unit % n_channel);
        gx = ((t == -1) || (c != t)) ?
            0 : ((weights[t] * coeff[0]) / max(y, 1e-5));
        ''',
        'crossent_bwd')(self.y, cupy.expand_dims(t, 1), -coeff,
                        x.shape[1], n_unit, self.weights.reduced_view())
    return gx, None
def _cu_conv_sum(y, x, n):
    # Convolutional sum
    # TODO(beam2d): Use scan computation
    rdim = x.size // (x.shape[0] * x.shape[1])
    cuda.elementwise(
        'raw T x, int32 rdim, int32 N, int32 n_', 'raw T y',
        '''
        int half_n = n_ / 2;
        int offset = i / rdim * N * rdim + i % rdim;
        float sum_part = 0;
        for (int j = 0; j < N + half_n; ++j) {
          if (j < N) {
            sum_part += x[offset + j * rdim];
          }
          if (j >= n_) {
            sum_part -= x[offset + (j - n_) * rdim];
          }
          if (j >= half_n) {
            y[offset + (j - half_n) * rdim] = sum_part;
          }
        }
        ''', 'lrn_conv_sum')(x, rdim, x.shape[1], n, y,
                             size=x.shape[0] * rdim)
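# A hedged NumPy sketch of the windowed channel sum that 'lrn_conv_sum'
# produces: for each channel k it sums x over channels
# [k + n//2 - n + 1, k + n//2], clipped to the valid range. The helper is
# illustrative only.
import numpy

def conv_sum_cpu_reference(x, n):
    # x: (batch, channels, ...); returns an array of the same shape.
    half_n = n // 2
    N = x.shape[1]
    y = numpy.zeros_like(x)
    for ch in range(N):
        lo = max(0, ch + half_n - n + 1)
        hi = min(N, ch + half_n + 1)
        y[:, ch] = x[:, lo:hi].sum(axis=1)
    return y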
def backward(self, inputs, grads):
    xp = cuda.get_array_module(*inputs)
    _, indices, _ = inputs
    g = grads[0]

    if xp is numpy:
        gv = g[range(len(indices)), indices]
        g[range(len(indices)), indices] = 0
    else:
        dim = g.shape[2]
        shape = (indices.shape[0], dim)
        gv = cuda.cupy.empty(shape, g.dtype)
        cuda.elementwise(
            'S t, int32 d', 'raw T s, T y',
            '''
            int b = i / d;
            int k = i - b * d;
            int ind[] = {b, t, k};
            y = s[ind];
            s[ind] = 0;
            ''',
            'thin_stack_set_bwd'
        )(indices[:, None], dim, g, gv)
    return g, None, gv
def sample_gpu(self, shape):
    ps = cuda.cupy.random.uniform(size=shape, dtype=numpy.float32)
    vs = cuda.elementwise(
        'T ps, raw T threshold, raw S values, int32 b',
        'int32 vs',
        '''
        T pb = ps * b;
        int index = __float2int_rd(pb);
        // fill_uniform sometimes returns 1.0, so we need to check index
        if (index >= b) {
          index = 0;
        }
        int lr = threshold[index] < pb - index;
        vs = values[index * 2 + lr];
        ''',
        'walker_alias_sample'
    )(ps, self.threshold, self.values, len(self.threshold))
    return vs
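# A hedged NumPy sketch of the same Walker alias lookup, assuming `threshold`
# (length K) and a flat `values` table of length 2*K prepared the same way as
# for the GPU kernel; the function name is illustrative.
import numpy

def walker_alias_sample_cpu(threshold, values, shape):
    b = len(threshold)
    ps = numpy.random.uniform(size=shape).astype(numpy.float32)
    pb = ps * b
    index = pb.astype(numpy.int32)
    index[index >= b] = 0  # uniform() can return exactly 1.0
    left_right = threshold[index] < (pb - index)
    return values[index * 2 + left_right]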
def forward_gpu(self, x):
    if cuda.cudnn_enabled and self.use_cudnn:
        return super(AveragePooling2D, self).forward_gpu(x)

    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    y = cuda.cupy.empty((n, c, y_h, y_w), dtype=numpy.float32)
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        'raw T in, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
        'T out',
        '''
        int c0 = i / (out_h * out_w);
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y_0 = max(0, out_y * sy - ph);
        int in_y_1 = min(h, out_y * sy + kh - ph);
        int in_x_0 = max(0, out_x * sx - pw);
        int in_x_1 = min(w, out_x * sx + kw - pw);

        float val = 0;
        for (int y = in_y_0; y < in_y_1; ++y) {
          int offset_y = w * (y + h * c0);
          for (int x = in_x_0; x < in_x_1; ++x) {
            val += in[x + offset_y];
          }
        }
        out = val * coeff;
        ''', 'avg_pool_fwd')(x[0].reduced_view(), h, w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw, coeff, y)
    return y,
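# A hedged NumPy reference for the 'avg_pool_fwd' kernel above: the window is
# clipped at the image border, but the divisor is always kh*kw, so padded
# positions effectively count as zeros. The helper is illustrative only.
import numpy

def avg_pool_cpu_reference(x, kh, kw, sy, sx, ph, pw):
    n, c, h, w = x.shape
    out_h = (h + 2 * ph - kh) // sy + 1
    out_w = (w + 2 * pw - kw) // sx + 1
    y = numpy.zeros((n, c, out_h, out_w), dtype=x.dtype)
    for oy in range(out_h):
        for ox in range(out_w):
            y0, y1 = max(0, oy * sy - ph), min(h, oy * sy + kh - ph)
            x0, x1 = max(0, ox * sx - pw), min(w, ox * sx + kw - pw)
            y[:, :, oy, ox] = x[:, :, y0:y1, x0:x1].sum(axis=(2, 3))
    return y / (kh * kw)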
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T s, T decay', 'T g', 'g += decay * s', 'lasso')

    rate = self.rate
    for param in opt.target.params():
        p, g = param.data, param.grad
        xp = cuda.get_array_module(p)
        sign = xp.sign(p)
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                g += rate * sign
            else:
                kernel(sign, rate, g)
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]

    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
            const int c = (i / n_unit % n_channel);
            gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
            const int c = (i / n_unit % n_channel);
            gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)

    return gx, None
def backward_gpu(self, x, gy):
    if (cuda.cudnn_enabled and self.use_cudnn and
            pooling_2d._check_cudnn_acceptable_type(x[0].dtype)):
        return super(AveragePooling2D, self).backward_gpu(x, gy)

    n, c, h, w = x[0].shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.cupy.empty_like(x[0])
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        'raw T gy, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
        'T gx',
        '''
        int c0 = i / (h * w);
        int y = i / w % h + ph;
        int x = i % w + pw;
        int out_y_0 = max(0, (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y + sy) / sy);
        int out_x_0 = max(0, (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x + sx) / sx);
        int hc0 = out_h * c0;
        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            val = val + gy[out_x + out_w * (out_y + hc0)];
          }
        }
        gx = val * coeff;
        ''', 'avg_pool_bwd')(gy[0].reduced_view(), h, w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw, coeff, gx)
    return gx,
def backward_gpu(self, x, gys):
    gx = cuda.zeros_like(x[0])
    coffset = 0
    kernel = cuda.elementwise(
        _args, 'COPY(x[idx] = y[i])', 'split_bwd', preamble=_preamble)
    for gy in gys:
        if gy is None:
            continue
        cdimy = gy.shape[self.axis]
        if cdimy != 0:
            kernel(gy, gx, cdimy, self.cdimx, self.rdim, coffset)
        coffset += cdimy
    return gx,
def backward_gpu(self, inputs, grad_outputs):
    x, y, z = inputs
    gw, = grad_outputs
    gx, gy = cuda.elementwise(
        'T x, T y, T gw',
        'T gx, T gy',
        '''
        gx = y * gw;
        gy = x * gw;
        ''',
        'muladd_bwd')(x, y, gw)
    gz = gw
    return gx, gy, gz
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T low, T high',
            'T p',
            'p = (p < low) ? low : (p > high) ? high : p',
            'weight_clip')

    for param in opt.target.params():
        p = param.data
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                # Clip in place; without out=p the clipped copy returned
                # by numpy.clip would be discarded.
                numpy.clip(p, self.low, self.high, out=p)
            else:
                kernel(self.low, self.high, p)
def forward_gpu(self, inputs):
    x, gamma, beta = inputs
    mean = x.mean(axis=(0, 1), keepdims=True)
    var = x.var(axis=(0, 1), keepdims=True) + self.eps
    normalize = cuda.elementwise(
        'T x, T var, T mean, T gamma, T beta',
        'T std, T x_hat, T y',
        'std = sqrt(var);'
        'x_hat = (x - mean) / std;'
        'y = gamma * x_hat + beta;',
        'normalize')
    self.std, self.x_hat, y = normalize(x, var, mean, gamma, beta)
    return y,
def backward_gpu(self, x, gy):
    if cuda.cudnn_enabled and self.use_cudnn:
        gx = cuda.empty_like(x[0])
        handle = cudnn.get_handle()
        desc = cudnn.create_tensor_descriptor(_as4darray(self.y))
        libcudnn.activationBackward(
            handle, _mode, ctypes.c_float(1), desc.value,
            self.y.data.ptr, desc.value, gy[0].data.ptr, desc.value,
            x[0].data.ptr, ctypes.c_float(0), desc.value, gx.data.ptr)
    else:
        gx = cuda.elementwise(
            'T x, T gy', 'T gx',
            'gx = x > 0 ? gy : 0',
            'relu_bwd')(x[0], gy[0])
    return gx,
def __call__(self, opt):
    if cuda.available:
        kernel = cuda.elementwise(
            'T p, T decay', 'T g', 'g += decay * p', 'weight_decay')

    rate = self.rate
    for name, param in opt.target.namedparams():
        if name == 'b' or name.endswith('/b'):
            continue
        p, g = param.data, param.grad
        with cuda.get_device(p) as dev:
            if int(dev) == -1:
                g += rate * p
            else:
                kernel(p, rate, g)
def backward_gpu(self, x, gy):
    df0 = x[0]
    df1 = x[1]
    v = x[2]
    gx0, gx1 = cuda.elementwise(
        'T df_plus, T df_minus, T v, T g',
        'T gx0, T gx1',
        '''
        if (v > 0) {
          gx0 = g;
          gx1 = 0;
        } else {
          gx0 = 0;
          gx1 = g;
        }
        ''',
        'upwind_b')(df0, df1, v, gy[0])
    return gx0, gx1, None
def backward_gpu(self, inputs, gy):
    W, log_sigma2 = inputs
    gy = gy[0]
    gW, gs = cuda.elementwise(
        'T W, T ls, T gy, T eps, T lo_th, T up_th',
        'T gW, T gs',
        '''
        T square_W = W * W + eps;
        T y = ls - log(square_W);
        gs = ((y > lo_th) & (y < up_th)) ? gy : (T)0;
        gW = - gs / square_W * 2 * W;
        ''',
        'log_alpha_bwd')(
            W, log_sigma2, gy, self.eps, self.lower_threshold,
            self.upper_threshold)
    return gW, gs
def _create_reduction_kernel(shape0, expr1, expr2):
    return cuda.elementwise(
        '''float* ret1, float* ret2,
           const float* x, const float* y,
           float alpha, int shape12''',
        '''
        float sum1 = 0, sum2 = 0;
        for (int j = 0; j < {0}; j++) {{
          int I = j * shape12 + i;
          sum1 += {1};
          sum2 += {2};
        }}
        ret1[i] = sum1 * alpha;
        ret2[i] = sum2 * alpha;
        '''.format(shape0, expr1, expr2),
        'bn_axis02')
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value,
            cudnn.get_ptr(x[0]), 0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
        c = x[0].shape[1]
        cuda.elementwise(
            'float* maxes, const float* x, int c',
            '''
            const float* row = x + i * c;
            float maxval = row[0];
            for (int j = 1; j < c; ++j) {
              if (maxval < row[j]) {
                maxval = row[j];
              }
            }
            maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c)
        cuda.elementwise(
            'float* y, const float* x, const float* maxes, int c',
            'y[i] = __expf(x[i] - maxes[i / c])',
            'softmax_exp')(y, x[0], maxes, c)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int c',
            '''
            const float* row = y + i * c;
            float sum = 0;
            for (int j = 0; j < c; ++j) {
              sum += row[j];
            }
            coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c)
        cuda.elementwise(
            'float* y, const float* coeff, int c',
            'y[i] *= coeff[i / c]',
            'softmax_rowmul')(y, coeff, c)
        self.y = y
    return y,
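# A hedged NumPy reference collapsing the four float32 kernels above
# (row max, exp, inverse row sum, row scale) into one numerically stable
# softmax; the helper is illustrative only.
import numpy

def softmax_cpu_reference(x):
    m = x.max(axis=1, keepdims=True)
    y = numpy.exp(x - m)
    y /= y.sum(axis=1, keepdims=True)
    return y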
def backward_gpu(self, inputs, grad_outputs):
    xp = cuda.get_array_module(*inputs)
    t, gloss = inputs[1], grad_outputs[0]
    self.bottom_diff = cuda.elementwise(
        'S t, int32 dim', 'raw T bottom_diff',
        'bottom_diff[i * dim + t] *= -1',
        'hinge_bwd')(t, inputs[0].shape[1], self.bottom_diff)
    if self.norm == 'L1':
        gx = (gloss / t.shape[0]) * xp.sign(self.bottom_diff)
    elif self.norm == 'L2':
        gx = (2 * gloss / t.shape[0]) * self.bottom_diff
    else:
        raise NotImplementedError()
    return gx, None
def forward_gpu(self, inputs):
    log_alpha = inputs[0]
    reg = cuda.elementwise(
        'T la, T clip', 'T reg',
        '''
        const T half = 0.5;
        const T c063576 = 0.63576;
        reg = (c063576 * (tanh(((T)1.87320 + (T)1.48695 * la) * half)
                          * half + half)
               - half * log1p(exp(-la)) - c063576) * ((T)1.0 - clip);
        ''',
        'kl_fwd')(log_alpha, self.clip_mask)
    reg = utils.force_array(-reg.sum() / log_alpha.size, log_alpha.dtype)
    return reg,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    gloss = grad_outputs[0]
    n_unit = x.size // (x.shape[0] * x.shape[1])
    if getattr(self, 'normalize', True):
        count = x.shape[0] * n_unit
    else:
        count = x.shape[0]
    coeff = cuda.cupy.divide(gloss, count, dtype=gloss.dtype)
    gx = cuda.elementwise(
        'T y, S t, raw T coeff, S n_channel, S n_unit',
        'T gx',
        'gx = coeff[0] * (y - (t == (i / n_unit % n_channel)))',
        'softmax_crossent_bwd')(
            self.y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    return gx, None
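# A hedged NumPy sketch of the gradient 'softmax_crossent_bwd' computes for
# plain 2-D inputs: coeff * (y - onehot(t)), with y the cached softmax output.
# The helper and its scalar coeff argument are illustrative only.
import numpy

def softmax_crossent_grad_cpu(y, t, coeff):
    gx = y.copy()
    gx[numpy.arange(len(t)), t] -= 1
    return coeff * gx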