def forward_gpu(self, x):
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]
    y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(self.W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                self.b[None, :, None, None])

        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo)
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=self.dtype)

        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        libcudnn.convolutionForward(
            handle, one, x_desc.value, x[0].data.ptr,
            self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.addTensor(
                handle, libcudnn.CUDNN_ADD_SAME_C, one,
                self.bias_desc.value, self.b.data.ptr, one,
                y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            y += self.b.reshape((1, out_c, 1, 1))
    return y,

def forward(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    xp = cuda.get_array_module(*x)
    if xp is numpy:
        self.col = conv.im2col_cpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    else:
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]  # (D, C, KY, KX)
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))
    # (C, B*IY*IX, KY*KX), (C, KY*KX, D) -> (C, B*IY*IX, D)
    y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)
    # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
    y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
        .reshape((B, C * D, IY, IX))
    if b is not None:
        y += b[None, :, None, None]
    return y,

def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))
    x, gy = inputs
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == self.W_dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
        # cuDNN implementation
        return self._forward_cudnn(x, gy)
    else:
        # Implementation using im2col
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape
        col = conv.im2col_gpu(
            x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        gW = cuda.cupy.tensordot(
            gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                    copy=False)
        return gW,

def backward_gpu(self, x, gy):
    xp = cuda.cupy
    gcol = conv.im2col_gpu(
        gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    gcol = gcol.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = gcol.shape
    gcol = gcol.reshape((n, c, oy, ox, ky * kx))
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    gx = xp.empty((n, c, oy, ox), dtype=x[0].dtype)
    xp.ElementwiseKernel(
        "int32 indexes, raw float32 gcol, int32 n, int32 c, int32 oy,"
        "int32 ox, int32 ky, int32 kx",
        "raw float32 gx",
        """
        int ind_n = i / c / oy / ox;
        int ind_c = (i / oy / ox) % c;
        int ind_oy = (i / ox) % oy;
        int ind_ox = i % ox;
        int gcol_ky = indexes / kx;
        int gcol_kx = indexes % kx;
        float top_gx = gcol[ind_n * c * oy * ox * ky * kx +
                            ind_c * oy * ox * ky * kx +
                            ind_oy * ox * ky * kx +
                            ind_ox * ky * kx +
                            gcol_ky * kx +
                            gcol_kx];
        gx[ind_n * c * oy * ox +
           ind_c * oy * ox +
           ind_oy * ox +
           ind_ox] = top_gx;
        """,
        "upsampling_2d_bwd",
    )(indexes, gcol, n, c, oy, ox, ky, kx, gx)
    return (gx,)

def _forward_gpu_core(self, x, gy):
    col = conv.im2col_gpu(
        x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all, dy=self.dy, dx=self.dx)
    gW = cuda.cupy.tensordot(
        gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype, copy=False)
    return gW,

def forward_gpu(self, x):
    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(
            h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(
            w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
    up_y = conv.im2col_gpu(
        up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        "int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox,"
        "int32 ky, int32 kx",
        "raw float32 up_y",
        """
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        """,
        "upsampling_2d_fwd",
    )(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return (up_y,)

def forward(self, inputs):
    x, = inputs
    xp = cuda.get_array_module(x)
    if xp == numpy:
        y = im2col_cpu(x, self.kh, self.kw, self.sy, self.sx,
                       self.ph, self.pw, cover_all=self.cover_all,
                       dy=self.dy, dx=self.dx)
    else:
        y = im2col_gpu(x, self.kh, self.kw, self.sy, self.sx,
                       self.ph, self.pw, cover_all=self.cover_all,
                       dy=self.dy, dx=self.dx)
    n, c, kh, kw, out_h, out_w = y.shape
    y = y.reshape(n, c * kh * kw, out_h, out_w)
    return y,

def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))
    x, gy = inputs
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape

    if (self.cover_all or not chainer.should_use_cudnn('>=auto') or
            x.dtype != self.W_dtype):
        col = conv.im2col_gpu(
            x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
        gW = cuda.cupy.tensordot(
            gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                    copy=False)
        return gW,

    gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
    x = cuda.cupy.ascontiguousarray(x)
    gy = cuda.cupy.ascontiguousarray(gy)

    handle = cudnn.get_handle()
    x_desc = cudnn.create_tensor_descriptor(x)
    gy_desc = cudnn.create_tensor_descriptor(gy)

    filter_desc = cudnn.create_filter_descriptor(gW)
    conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
    conv_desc = cudnn.create_convolution_descriptor(*conv_param)

    oz_dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=oz_dtype).ctypes
    zero = numpy.array(0, dtype=oz_dtype).ctypes

    workspace_size = cuda.get_max_workspace_size()
    workspace = cuda.cupy.empty((workspace_size,), dtype='b')

    if configuration.config.cudnn_deterministic:
        algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
    elif configuration.config.autotune and _cudnn_version >= 5000:
        algo = _get_algorithm_bwd_filter(
            x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
            filter_desc, workspace)
    else:
        algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
            handle, x_desc.value, gy_desc.value, conv_desc.value,
            filter_desc.value, _bwd_filter_pref, workspace_size)

    libcudnn.convolutionBackwardFilter_v3(
        handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
        gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
        workspace_size, zero.data, filter_desc.value, gW.data.ptr)

    return gW,

def backward(self, x, gy):
    if isinstance(gy[0], cuda.ndarray):
        gcol = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
    else:
        gcol = conv.im2col_cpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
    gx = gcol.sum(axis=(2, 3))
    return (gx,)

def test_im2col_consistency(self):
    col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 2, 2, dy=2, dx=2)
    col_gpu = conv.im2col_gpu(cuda.to_gpu(self.x), 3, 3, 2, 2, 2, 2,
                              dy=2, dx=2)
    testing.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)

def _forward_gpu_core(self, x, W, b):
    kh, kw = W.shape[2:]
    # Implementation using im2col
    col = conv.im2col_gpu(
        x, kh, kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all, dy=self.dy, dx=self.dx)
    y = cuda.cupy.tensordot(
        col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
    # TODO(beam2d): Support unshared bias
    if b is not None:
        y += b
    y = cuda.cupy.rollaxis(y, 3, 1)
    return y,

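# Note (added): a minimal NumPy-only sketch, not part of the original Chainer
# sources, illustrating the im2col -> tensordot pattern used by the GPU paths
# above. The function name conv2d_via_im2col, the stride defaults, and the
# absence of padding/dilation/bias are assumptions made for illustration only.
# col has shape (n, c, kh, kw, out_h, out_w) and W has shape (out_c, c, kh, kw);
# contracting axes (1, 2, 3) of both gives (n, out_h, out_w, out_c), and
# rollaxis moves the channel axis back to NCHW order, mirroring
# cuda.cupy.rollaxis(y, 3, 1) in the snippet above.
import numpy


def conv2d_via_im2col(x, W, sy=1, sx=1):
    n, c, h, w = x.shape
    out_c, _, kh, kw = W.shape
    out_h = (h - kh) // sy + 1
    out_w = (w - kw) // sx + 1
    # im2col: gather every kh x kw patch into a 6-d array
    col = numpy.empty((n, c, kh, kw, out_h, out_w), dtype=x.dtype)
    for j in range(kh):
        for i in range(kw):
            col[:, :, j, i] = x[:, :, j:j + sy * out_h:sy, i:i + sx * out_w:sx]
    # contract (c, kh, kw) against the filter, then restore NCHW layout
    y = numpy.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))
    return numpy.rollaxis(y, 3, 1)  # (n, out_c, out_h, out_w)
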
def forward_gpu(self, x):
    self.retain_inputs(())
    self._in_dtype = x[0].dtype

    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(
            h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(
            w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
    up_y = conv.im2col_gpu(
        up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        'int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox,'
        'int32 ky, int32 kx',
        'raw float32 up_y',
        '''
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        ''',
        'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return up_y,

def check_forward(self, y):
    y = upsampling_2d.upsampling_2d(
        self.pooled_y, self.p.indexes, ksize=(self.p.kh, self.p.kw),
        stride=(self.p.sy, self.p.sx), pad=(self.p.ph, self.p.pw),
        outsize=self.in_shape[2:], cover_all=self.p.cover_all)
    if isinstance(y.data, numpy.ndarray):
        y = conv.im2col_cpu(y.data, self.p.kh, self.p.kw,
                            self.p.sy, self.p.sx, self.p.ph, self.p.pw)
    else:
        y = conv.im2col_gpu(y.data, self.p.kh, self.p.kw,
                            self.p.sy, self.p.sx, self.p.ph, self.p.pw)
    for i in numpy.ndindex(y.shape):
        n, c, ky, kx, oy, ox = i
        up_y = y[n, c, ky, kx, oy, ox]
        if ky * y.shape[3] + kx == self.p.indexes[n, c, oy, ox]:
            in_y = self.pooled_y.data[n, c, oy, ox]
            testing.assert_allclose(in_y, up_y)
        else:
            testing.assert_allclose(up_y, 0)

def check_forward(self, y):
    y = F.upsampling_2d(
        self.pooled_y, self.p.indexes, ksize=(self.p.kh, self.p.kw),
        stride=(self.p.sy, self.p.sx), pad=(self.p.ph, self.p.pw),
        outsize=self.in_shape[2:], cover_all=self.p.cover_all)
    if isinstance(y.data, numpy.ndarray):
        y = conv.im2col_cpu(y.data, self.p.kh, self.p.kw,
                            self.p.sy, self.p.sx, self.p.ph, self.p.pw)
    else:
        y = conv.im2col_gpu(y.data, self.p.kh, self.p.kw,
                            self.p.sy, self.p.sx, self.p.ph, self.p.pw)
    for i in numpy.ndindex(y.shape):
        n, c, ky, kx, oy, ox = i
        up_y = y[n, c, ky, kx, oy, ox]
        if ky * y.shape[3] + kx == self.p.indexes[n, c, oy, ox]:
            in_y = self.pooled_y.data[n, c, oy, ox]
            testing.assert_allclose(in_y, up_y)
        else:
            testing.assert_allclose(up_y, 0)

def forward(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    xp = cuda.get_array_module(*x)

    W = xp.where(W >= 0, 1, -1).astype(numpy.float32, copy=False)

    olen, ilen, hlen, wlen = W.shape
    if self.coeffs is None:
        self.coeffs = numpy.ones(ilen)
    coeffs = numpy.copy(self.coeffs)
    coeffs = numpy.expand_dims(coeffs, 1)
    coeffs = numpy.expand_dims(coeffs, 1)
    coeffs = numpy.expand_dims(coeffs, 0)
    coeffs = numpy.broadcast_to(coeffs, W.shape)
    M = xp.asarray(coeffs, numpy.float32).reshape(W.shape)
    self.M = M
    W = self.M * W

    if xp is numpy:
        self.col = conv.im2col_cpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    else:
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]  # (D, C, KY, KX)
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))
    # (C, B*IY*IX, KY*KX), (C, KY*KX, D) -> (C, B*IY*IX, D)
    y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)
    # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
    y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
        .reshape((B, C * D, IY, IX))
    if b is not None:
        y += b[None, :, None, None]
    return y,

def check_forward(self, y):
    y = F.upsampling_2d(self.pooled_y, self.indices, ksize=self.ksize,
                        stride=self.stride, outsize=self.in_shape[2:])
    if isinstance(y.array, numpy.ndarray):
        y = conv.im2col_cpu(y.array, self.ksize, self.ksize,
                            self.stride, self.stride, 0, 0)
    else:
        y = conv.im2col_gpu(y.array, self.ksize, self.ksize,
                            self.stride, self.stride, 0, 0)
    for i in numpy.ndindex(y.shape):
        n, c, ky, kx, oy, ox = i
        up_y = y[n, c, ky, kx, oy, ox]
        if ky * y.shape[3] + kx == self.indices[n, c, oy, ox]:
            in_y = self.pooled_y.array[n, c, oy, ox]
            testing.assert_allclose(in_y, up_y)
        else:
            testing.assert_allclose(up_y, 0)

def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))  # retain only x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    out_c, _, kh, kw = W.shape
    n, _, h, w = x.shape

    out_h, out_w = self._get_out_size(inputs)
    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
        # cuDNN implementation
        return self._forward_cudnn(x, W, b, y)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        y = cuda.cupy.tensordot(
            col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)
        return y,

def backward(self, x, gy):
    if isinstance(gy[0], cuda.ndarray):
        gcol = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
    else:
        gcol = conv.im2col_cpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
    gx = gcol.sum(axis=(2, 3))
    return gx,

def forward_gpu(self, gy):
    xp = cuda.cupy
    gcol = conv.im2col_gpu(
        gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    gcol = gcol.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = gcol.shape
    gcol = gcol.reshape((n, c, oy, ox, ky * kx))
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    gx = xp.empty((n, c, oy, ox), dtype=self._in_dtype)
    cuda.elementwise(
        'int32 indexes, raw T gcol, int32 n, int32 c, int32 oy,'
        'int32 ox, int32 ky, int32 kx',
        'raw T gx',
        '''
        int ind_n = i / c / oy / ox;
        int ind_c = (i / oy / ox) % c;
        int ind_oy = (i / ox) % oy;
        int ind_ox = i % ox;
        int gcol_ky = indexes / kx;
        int gcol_kx = indexes % kx;
        float top_gx = gcol[ind_n * c * oy * ox * ky * kx +
                            ind_c * oy * ox * ky * kx +
                            ind_oy * ox * ky * kx +
                            ind_ox * ky * kx +
                            gcol_ky * kx +
                            gcol_kx];
        gx[ind_n * c * oy * ox +
           ind_c * oy * ox +
           ind_oy * ox +
           ind_ox] = top_gx;
        ''',
        'upsampling_2d_bwd')(indexes, gcol, n, c, oy, ox, ky, kx, gx)
    return gx,

def forward_gpu(self, x):
    self.retain_inputs(())
    self._in_dtype = x[0].dtype

    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(
            h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(
            w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=self._in_dtype)
    up_y = conv.im2col_gpu(
        up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        'int32 index, T x, int32 n, int32 c, int32 oy, int32 ox,'
        'int32 ky, int32 kx',
        'raw T up_y',
        '''
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        ''',
        'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return up_y,

def forward_grad(self, inputs, rho):
    x, W = inputs[:2]
    xp = cuda.get_array_module(*inputs)
    rho = xp.array(rho, dtype=W.dtype)

    # regenerate input matrix
    _, _, kh, kw = W.shape
    if isinstance(x, cuda.ndarray):
        col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph,
                              self.pw, cover_all=self.cover_all)
    else:
        col = conv.im2col_cpu(x, kh, kw, self.sy, self.sx, self.ph,
                              self.pw, cover_all=self.cover_all)

    if rho.ndim > 0:
        # if rho is vector, prep for broadcasting
        rho = rho[:, xp.newaxis, xp.newaxis, xp.newaxis]
    gW = xp.tensordot(self.mask * rho, -1.0 * col,
                      ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False)

    # calc conscience factors [DeSieno, 1988] as bias of conv layer
    n = self.mask.shape[1]
    v = self.mask.sum(axis=(0, 2, 3), dtype=W.dtype)
    gb = -1.0 * self.conscience_factor * (1.0 / n - v / v.sum())
    return gW, gb

def test_im2col_consistency(self):
    col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
    col_gpu = conv.im2col_gpu(to_gpu(self.x), 3, 3, 2, 2, 1, 1)
    assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                    type(W), type(x), type(b)))
        else:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}'.format(type(W), type(x)))

    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

    if (chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        if _cudnn_version >= 3000:
            if configuration.config.cudnn_deterministic:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
            else:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
                workspace.data.ptr, workspace_size, zero.data,
                self.filter_desc.value, gW.data.ptr)
        else:
            if configuration.config.cudnn_deterministic:
                raise ValueError(
                    "`cudnn_deterministic` option must be False "
                    "if the backpropagation of "
                    "chainer.functions.Deconvolution2D "
                    "uses cuDNN and cuDNN versions < v3. "
                    "Turn off cudnn_deterministic option with "
                    "`chainer.using_config('cudnn_deterministic', False)` "
                    "context.")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)
        gW = cuda.cupy.tensordot(
            x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
        gx = cuda.cupy.tensordot(
            col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
        gx = cuda.cupy.rollaxis(gx, 3, 1)
        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        if _cudnn_version >= 4000:
            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
                workspace.data.ptr, workspace_size, zero.data,
                self.filter_desc.value, gW.data.ptr)
        else:
            if self.deterministic:
                raise ValueError("'deterministic' option not available "
                                 "for cuDNN versions < v4")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

        W_mat = W.reshape(in_c, c * kh * kw)
        col_mats = col.reshape(n, c * kh * kw, in_h * in_w)
        gx_mats = gx.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gx_mats[i] = W_mat.dot(col_mats[i])

        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

        # filter backward
        gW = cuda.cupy.zeros_like(W)
        gW_mat = gW.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gW_mat += x_mats[i].dot(col_mats[i].T)

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all)

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            workspace_size)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        Xb = _kern()(x)
        self.col = conv.im2col_gpu(
            Xb, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
        W_mat = W.reshape(out_c, -1)
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        Wb_mat = _kern()(W_mat)
        y_mats = y.reshape(n, out_c, -1)
        # TODO(beam2d): Use streams or batch gemm
        for i in moves.range(n):
            y_mats[i] = Wb_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b[:, None, None]
    return y,

def test_im2col_consistency(self):
    col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
    col_gpu = conv.im2col_gpu(cuda.to_gpu(self.x), 3, 3, 2, 2, 1, 1)
    gradient_check.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=numpy.float32)

    if cuda.cudnn_enabled and self.use_cudnn:
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        self.max_workspace_size = out_channels * kh * kw * 4
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo)
        workspace = cuda.cupy.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        one = numpy.array(1, dtype=x.dtype).ctypes
        zero = numpy.array(0, dtype=x.dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        libcudnn.convolutionBackwardFilter_v2(
            handle, one.data, gy_desc.value, gy.data.ptr,
            gx_desc.value, x.data.ptr, self.conv_desc.value,
            zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

        W_mat = W.reshape(in_c, c * kh * kw)
        col_mats = col.reshape(n, c * kh * kw, in_h * in_w)
        gx_mats = gx.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gx_mats[i] = W_mat.dot(col_mats[i])

        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

        # filter backward
        gW = cuda.cupy.zeros_like(W)
        gW_mat = gW.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gW_mat += x_mats[i].dot(col_mats[i].T)

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw)

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if cuda.cudnn_enabled and self.use_cudnn:
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        self.max_workspace_size = c * kh * kw * 4
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo)
        workspace = cuda.cupy.empty(
            (max(workspace_size // 4, 1),), dtype=x.dtype)

        dtype = x.dtype
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            libcudnn.addTensor_v2(
                handle, libcudnn.CUDNN_ADD_SAME_C, one.data,
                self.bias_desc.value, b.data.ptr, one.data,
                y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
        W_mat = W.reshape(out_c, -1)
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        y_mats = y.reshape(n, out_c, -1)
        # TODO(beam2d): Use streams or batch gemm
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b[:, None, None]
    return y,

def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))  # retain only x and W
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not all([isinstance(i, cuda.ndarray) for i in inputs]):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all, d=self.dy)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all, d=self.dx)
    assert out_w > 0, 'Width in the output should be positive.'

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=(self.dy, self.dx),
            use_tensor_core=use_tensor_core)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        if configuration.config.autotune and _cudnn_version >= 5000:
            algo = get_algorithm_fwd(
                x, W, y, conv_param, handle, x_desc, filter_desc,
                conv_desc, y_desc, workspace)
        else:
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, filter_desc.value, conv_desc.value,
                y_desc.value, _fwd_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
            # supports Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            filter_desc.value, W.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        y = cuda.cupy.tensordot(
            col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)

    return y,

def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all, d=self.dy)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all, d=self.dx)

    y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                dtype=x.dtype)
        pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

        out_h_s1 = h + 2 * self.ph - dkh + 1
        out_w_s1 = w + 2 * self.pw - dkw + 1

        for j in moves.range(kh):
            for i in moves.range(kw):
                xji = cuda.cupy.ascontiguousarray(
                    pad_x[:, :,
                          j * self.dy:j * self.dy + out_h_s1,
                          i * self.dx:i * self.dx + out_w_s1])
                Wji = cuda.cupy.ascontiguousarray(
                    W[:, :, j:j + 1, i:i + 1])

                if i == 0 and j == 0:
                    handle = cudnn.get_handle()
                    xji_desc = cudnn.create_tensor_descriptor(xji)
                    y_desc = cudnn.create_tensor_descriptor(y)
                    self.filter_desc = cudnn.create_filter_descriptor(Wji)
                    self.conv_desc = cudnn.create_convolution_descriptor(
                        (0, 0), (self.sy, self.sx), xji.dtype)

                    workspace_size = cuda.get_max_workspace_size()
                    workspace = cuda.cupy.empty(
                        (workspace_size,), dtype='b')
                    algo = libcudnn.getConvolutionForwardAlgorithm(
                        handle, xji_desc.value, self.filter_desc.value,
                        self.conv_desc.value, y_desc.value, _fwd_pref,
                        workspace_size)

                    oz_dtype = 'd' if x.dtype == 'd' else 'f'
                    one = numpy.array(1, dtype=oz_dtype).ctypes

                libcudnn.convolutionForward(
                    handle, one.data, xji_desc.value, xji.data.ptr,
                    self.filter_desc.value, Wji.data.ptr,
                    self.conv_desc.value, algo, workspace.data.ptr,
                    workspace_size, one.data, y_desc.value, y.data.ptr)

        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        y = cuda.cupy.tensordot(
            self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                        copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)

    return y,

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=numpy.float32)

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        self.max_workspace_size = out_channels * kh * kw * 4
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo)
        workspace = cuda.cupy.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        one = numpy.array(1, dtype=x.dtype).ctypes
        zero = numpy.array(0, dtype=x.dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if len(inputs) == 3:
            b = inputs[2]
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        libcudnn.convolutionBackwardFilter(
            handle, one.data, gy_desc.value, gy.data.ptr,
            gx_desc.value, x.data.ptr, self.conv_desc.value,
            zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

        W_mat = W.reshape(in_c, c * kh * kw)
        col_mats = col.reshape(n, c * kh * kw, in_h * in_w)
        gx_mats = gx.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gx_mats[i] = W_mat.dot(col_mats[i])

        # bias backward
        if len(inputs) == 3:
            gb = gy.sum(axis=(0, 2, 3))

        # filter backward
        gW = cuda.cupy.zeros_like(W)
        gW_mat = gW.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gW_mat += x_mats[i].dot(col_mats[i].T)

    if len(inputs) == 3:
        return gx, gW, gb
    else:
        return gx, gW

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = None

    if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype:
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, gy_desc.value, gx_desc.value,
                self.conv_desc.value, self.filter_desc.value,
                _bwd_filter_pref, workspace_size)
        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, gy_desc.value, gy.data.ptr,
            gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
            workspace.data.ptr, workspace_size, zero.data,
            self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)
        gW = cuda.cupy.tensordot(
            x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
        if self.requires_x_grad:
            gx = cuda.cupy.tensordot(
                col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype,
                                                       copy=False)
            gx = cuda.cupy.rollaxis(gx, 3, 1)
        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        if _cudnn_version >= 3000:
            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
                workspace.data.ptr, workspace_size, zero.data,
                self.filter_desc.value, gW.data.ptr)
        else:
            if self.deterministic:
                raise ValueError("'deterministic' option not available "
                                 "for cuDNN versions < v3")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)
        gW = cuda.cupy.tensordot(
            x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
        gx = cuda.cupy.tensordot(
            col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
        gx = cuda.cupy.rollaxis(gx, 3, 1)
        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))
    x, gy = inputs
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape

    if (self.cover_all or not chainer.should_use_cudnn('>=auto') or
            x.dtype != self.W_dtype or
            ((self.dy > 1 or self.dx > 1) and _cudnn_version < 6000)):
        col = conv.im2col_gpu(
            x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        gW = cuda.cupy.tensordot(
            gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                    copy=False)
        return gW,

    gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
    x = cuda.cupy.ascontiguousarray(x)
    gy = cuda.cupy.ascontiguousarray(gy)

    use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

    handle = cudnn.get_handle()
    x_desc = cudnn.create_tensor_descriptor(x)
    gy_desc = cudnn.create_tensor_descriptor(gy)

    filter_desc = cudnn.create_filter_descriptor(gW)
    conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
    conv_desc = cudnn.create_convolution_descriptor(
        *conv_param, dilation=(self.dy, self.dx),
        use_tensor_core=use_tensor_core)

    oz_dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=oz_dtype).ctypes
    zero = numpy.array(0, dtype=oz_dtype).ctypes

    workspace_size = cuda.get_max_workspace_size()
    workspace = cuda.cupy.empty((workspace_size,), dtype='b')

    if configuration.config.cudnn_deterministic:
        algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
    elif configuration.config.autotune and _cudnn_version >= 5000:
        algo = get_algorithm_bwd_filter(
            x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
            filter_desc, workspace)
    else:
        algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
            handle, x_desc.value, gy_desc.value, conv_desc.value,
            filter_desc.value, _bwd_filter_pref, workspace_size)

    if use_tensor_core:
        # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports
        # Tensor-Core in cuDNN7.
        algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1

    libcudnn.convolutionBackwardFilter_v3(
        handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
        gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
        workspace_size, zero.data, filter_desc.value, gW.data.ptr)

    return gW,

def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)
        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
        gW = cuda.cupy.empty_like(W)
        # filter backward
        if _cudnn_version >= 4000:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, gy_desc.value, gx_desc.value,
                self.conv_desc.value, self.filter_desc.value,
                _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
                workspace.data.ptr, workspace_size, zero.data,
                self.filter_desc.value, gW.data.ptr)
        else:
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

        W_mat = W.reshape(in_c, c * kh * kw)
        col_mats = col.reshape(n, c * kh * kw, in_h * in_w)
        gx_mats = gx.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gx_mats[i] = W_mat.dot(col_mats[i])

        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

        # filter backward
        gW = cuda.cupy.zeros_like(W)
        gW_mat = gW.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        for i in moves.range(n):
            gW_mat += x_mats[i].dot(col_mats[i].T)

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb

def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all, d=self.dy)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all, d=self.dx)

    y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                dtype=x.dtype)
        pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

        out_h_s1 = h + 2 * self.ph - dkh + 1
        out_w_s1 = w + 2 * self.pw - dkw + 1

        for j in moves.range(kh):
            for i in moves.range(kw):
                xji = cuda.cupy.ascontiguousarray(
                    pad_x[:, :,
                          j * self.dy:j * self.dy + out_h_s1,
                          i * self.dx:i * self.dx + out_w_s1])
                Wji = cuda.cupy.ascontiguousarray(
                    W[:, :, j:j + 1, i:i + 1])

                if i == 0 and j == 0:
                    handle = cudnn.get_handle()
                    x_desc = cudnn.create_tensor_descriptor(xji)
                    y_desc = cudnn.create_tensor_descriptor(y)
                    self.filter_desc = cudnn.create_filter_descriptor(Wji)
                    self.conv_desc = cudnn.create_convolution_descriptor(
                        (0, 0), (self.sy, self.sx), xji.dtype)

                    workspace_size = cuda.get_max_workspace_size()
                    workspace = cuda.cupy.empty(
                        (workspace_size,), dtype='b')
                    algo = libcudnn.getConvolutionForwardAlgorithm(
                        handle, x_desc.value, self.filter_desc.value,
                        self.conv_desc.value, y_desc.value, _fwd_pref,
                        workspace_size)

                    oz_dtype = 'd' if x.dtype == 'd' else 'f'
                    one = numpy.array(1, dtype=oz_dtype).ctypes

                libcudnn.convolutionForward(
                    handle, one.data, x_desc.value, xji.data.ptr,
                    self.filter_desc.value, Wji.data.ptr,
                    self.conv_desc.value, algo, workspace.data.ptr,
                    workspace_size, one.data, y_desc.value, y.data.ptr)

        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        W_mat = W.reshape(out_c, -1)
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        y_mats = y.reshape(n, out_c, -1)
        # TODO(beam2d): Use streams or batch gemm
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b[:, None, None]
    return y,

def forward_gpu(self, x):
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]
    y = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        y_desc = cudnn.get_tensor_desc(y, out_h, out_w)
        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, y_desc.value, cudnn.get_ptr(y))

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i],
                              handle=handle, out=y_mats[i])

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, out_c, out_h * out_w)
    return y,

def _im2col(x, *args, **kwargs):
    if isinstance(x, numpy.ndarray):
        return im2col_cpu(x, *args, **kwargs)
    return im2col_gpu(x, *args, **kwargs)
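# An illustrative NumPy-only sketch (hypothetical names, not Chainer's conv
# module) of the im2col + GEMM formulation that the fallback branches rely
# on: gather every kh x kw patch into a column tensor, then convolve with a
# single matrix product per sample.
import numpy as np

def im2col_simple(x, kh, kw, sy, sx, ph, pw):
    n, c, h, w = x.shape
    out_h = (h + 2 * ph - kh) // sy + 1
    out_w = (w + 2 * pw - kw) // sx + 1
    img = np.zeros((n, c, h + 2 * ph, w + 2 * pw), dtype=x.dtype)
    img[:, :, ph:ph + h, pw:pw + w] = x
    col = np.empty((n, c, kh, kw, out_h, out_w), dtype=x.dtype)
    for j in range(kh):
        for i in range(kw):
            col[:, :, j, i] = img[:, :, j:j + sy * out_h:sy,
                                  i:i + sx * out_w:sx]
    return col

def conv2d_im2col(x, W, b=None, sy=1, sx=1, ph=0, pw=0):
    n = x.shape[0]
    out_c, c, kh, kw = W.shape
    col = im2col_simple(x, kh, kw, sy, sx, ph, pw)
    out_h, out_w = col.shape[4], col.shape[5]
    W_mat = W.reshape(out_c, -1)                   # (out_c, c*kh*kw)
    col_mats = col.reshape(n, -1, out_h * out_w)   # (n, c*kh*kw, oh*ow)
    y = np.einsum('ok,nkp->nop', W_mat, col_mats)  # batched GEMM
    y = y.reshape(n, out_c, out_h, out_w)
    if b is not None:
        y += b[None, :, None, None]
    return y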
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all)

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)

        W_mat = W.reshape(out_c, -1)
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        y_mats = y.reshape(n, out_c, -1)
        # TODO(beam2d): Use streams or batch gemm
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b[:, None, None]
    return y,
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    b = inputs[2] if len(inputs) == 3 else None

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw)

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        self.max_workspace_size = c * kh * kw * 4
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo)
        workspace = cuda.cupy.empty(
            (max(workspace_size // 4, 1),), dtype=x.dtype)

        dtype = x.dtype
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            libcudnn.addTensor(
                handle, libcudnn.CUDNN_ADD_SAME_C, one.data,
                self.bias_desc.value, b.data.ptr, one.data,
                y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)

        W_mat = W.reshape(out_c, -1)
        col_mats = self.col.reshape(n, -1, out_h * out_w)
        y_mats = y.reshape(n, out_c, -1)
        # TODO(beam2d): Use streams or batch gemm
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])

        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b[:, None, None]
    return y,
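# The output sizes used throughout come from conv.get_conv_outsize. A minimal
# re-derivation (an assumption written from the snippets, not copied from
# Chainer): with cover_all the division rounds up so every input pixel is
# covered by at least one window, and dilation only widens the kernel extent.
def conv_outsize(size, k, s, p, cover_all=False, d=1):
    dk = k + (k - 1) * (d - 1)              # effective (dilated) extent
    if cover_all:
        return (size + 2 * p - dk + s - 1) // s + 1
    return (size + 2 * p - dk) // s + 1

assert conv_outsize(6, 3, 2, 0) == 2                    # floor division
assert conv_outsize(6, 3, 2, 0, cover_all=True) == 3    # rounds up
assert conv_outsize(7, 3, 1, 2, d=2) == 7               # extent 5, pad 2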
def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))  # retain only x and W
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not all([isinstance(i, cuda.ndarray) for i in inputs]):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all, d=self.dy)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all, d=self.dx)
    assert out_w > 0, 'Width in the output should be positive.'

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype,
            dilation=(self.dy, self.dx),
            use_tensor_core=use_tensor_core)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, filter_desc.value,
            conv_desc.value, y_desc.value, _fwd_pref, workspace_size)
        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
            # supports Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            filter_desc.value, W.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        y = cuda.cupy.tensordot(
            col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)
    return y,
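# A small hypothetical NumPy check (not Chainer code) that the tensordot
# fallback used above produces the same result as the per-sample GEMM loop
# of the older snippets; only the contraction order differs.
import numpy as np

def _im2col_s1(x, kh, kw):
    # stride-1, no-padding im2col, just enough for this check
    n, c, h, w = x.shape
    oh, ow = h - kh + 1, w - kw + 1
    col = np.empty((n, c, kh, kw, oh, ow), dtype=x.dtype)
    for j in range(kh):
        for i in range(kw):
            col[:, :, j, i] = x[:, :, j:j + oh, i:i + ow]
    return col

rng = np.random.RandomState(0)
x = rng.randn(2, 3, 5, 5).astype(np.float32)
W = rng.randn(4, 3, 3, 3).astype(np.float32)
col = _im2col_s1(x, 3, 3)                       # (n, c, kh, kw, oh, ow)

# per-sample GEMM: (out_c, c*kh*kw) x (c*kh*kw, oh*ow)
y_loop = np.stack([
    W.reshape(4, -1).dot(col[i].reshape(-1, 9)).reshape(4, 3, 3)
    for i in range(2)])

# tensordot over (c, kh, kw), then move channels to axis 1
y_td = np.rollaxis(np.tensordot(col, W, ((1, 2, 3), (1, 2, 3))), 3, 1)
assert np.allclose(y_loop, y_td, atol=1e-5)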
def backward_gpu(self, x, gy):
    n, out_c, out_h, out_w = x[0].shape
    c, h, w = gy[0].shape[1:]
    gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
        gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, gx_desc.value, cudnn.get_ptr(gx))

        # bias backward
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        # filter backward
        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gx_mats = gx.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i],
                              handle=handle, out=gx_mats[i])

        # bias backward
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * c, h * w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                self.gb += tmp

        # filter backward
        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
    return gx,
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all)
    assert out_w > 0, 'Width in the output should be positive.'

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
        y = cuda.cupy.tensordot(
            self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)
    return y,
def test_im2col_consistency(self):
    col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 2, 2, dy=2, dx=2)
    col_gpu = conv.im2col_gpu(
        cuda.to_gpu(self.x), 3, 3, 2, 2, 2, 2, dy=2, dx=2)
    testing.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    n, in_c, in_h, in_w = x.shape
    _, out_channels, kh, kw = W.shape
    c, h, w = gy.shape[1:]
    gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        gy_desc = cudnn.create_tensor_descriptor(gy)
        gx_desc = cudnn.create_tensor_descriptor(gx)

        # chance to choose implicit-precomp-gemm algorithm
        workspace_size = cuda.get_max_workspace_size()
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            workspace_size)
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        libcudnn.convolutionForward(
            handle, one.data, gy_desc.value, gy.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            gx_desc.value, gx.data.ptr)

        # bias backward
        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)

        gW = cuda.cupy.empty_like(W)
        # filter backward
        if _cudnn_version >= 4000:
            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            if self.deterministic:
                raise ValueError("'deterministic' option not available "
                                 "for cuDNN versions < v4")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

        gW = cuda.cupy.tensordot(
            x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
        gx = cuda.cupy.tensordot(
            col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
        gx = cuda.cupy.rollaxis(gx, 3, 1)

        # bias backward
        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all, d=self.dy)
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all, d=self.dx)

    y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                dtype=x.dtype)
        pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

        out_h_s1 = h + 2 * self.ph - dkh + 1
        out_w_s1 = w + 2 * self.pw - dkw + 1

        for j in moves.range(kh):
            for i in moves.range(kw):
                xji = cuda.cupy.ascontiguousarray(
                    pad_x[:, :, j * self.dy:j * self.dy + out_h_s1,
                          i * self.dx:i * self.dx + out_w_s1])
                Wji = cuda.cupy.ascontiguousarray(
                    W[:, :, j:j + 1, i:i + 1])

                if i == 0 and j == 0:
                    handle = cudnn.get_handle()
                    xji_desc = cudnn.create_tensor_descriptor(xji)
                    y_desc = cudnn.create_tensor_descriptor(y)
                    self.filter_desc = cudnn.create_filter_descriptor(Wji)
                    self.conv_desc = cudnn.create_convolution_descriptor(
                        (0, 0), (self.sy, self.sx), xji.dtype)

                    workspace_size = cuda.get_max_workspace_size()
                    workspace = cuda.cupy.empty(
                        (workspace_size,), dtype='b')
                    algo = libcudnn.getConvolutionForwardAlgorithm(
                        handle, xji_desc.value, self.filter_desc.value,
                        self.conv_desc.value, y_desc.value, _fwd_pref,
                        workspace_size)

                    oz_dtype = 'd' if x.dtype == 'd' else 'f'
                    one = numpy.array(1, dtype=oz_dtype).ctypes

                libcudnn.convolutionForward(
                    handle, one.data, xji_desc.value, xji.data.ptr,
                    self.filter_desc.value, Wji.data.ptr,
                    self.conv_desc.value, algo, workspace.data.ptr,
                    workspace_size, one.data, y_desc.value, y.data.ptr)

        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all, dy=self.dy, dx=self.dx)
        y = cuda.cupy.tensordot(
            self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)
    return y,
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape

    out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                  cover_all=self.cover_all)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                  cover_all=self.cover_all)
    assert out_w > 0, 'Width in the output should be positive.'

    y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            self.filter_desc.value, W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
        y = cuda.cupy.tensordot(
            self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        # TODO(beam2d): Support unshared bias
        if b is not None:
            y += b
        y = cuda.cupy.rollaxis(y, 3, 1)
    return y,