def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape if (self.cover_all or not chainer.should_use_cudnn('>=auto') or x.dtype != self.W_dtype): col = conv.im2col_gpu(x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) gW = cuda.cupy.tensordot(gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype, copy=False) return gW, gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype conv_desc = cudnn.create_convolution_descriptor(*conv_param) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 elif configuration.config.autotune and _cudnn_version >= 5000: algo = _get_algorithm_bwd_filter(x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def _forward_cudnn(self, x, W, b): c = W.shape[1] # W: C_I, C_O, k_1, k_2, ..., k_N ksize = W.shape[2:] n, in_c = x.shape[:2] # x: n, C_I, d_1, d_2, ..., d_N dims = x.shape[2:] ndim = self.ndim colon = slice(None) # Make empty array for output. if self.outs is None: self.outs = tuple( conv.get_deconv_outsize(d, k, s, p) for d, k, s, p in zip(dims, ksize, self.stride, self.pad)) assert all(out > 0 for out in self.outs), \ 'Output sizes should be positive.' y_shape = (n, c) + self.outs # (n, c_O, out_1, out_2, ..., out_N) y = cuda.cupy.empty(y_shape, dtype=x.dtype) # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( self.pad, self.stride, x.dtype) if b is not None: b_index = (None, colon) + (None,) * ndim self.bias_desc = cudnn.create_tensor_descriptor(b[b_index]) # cuDNN forward computation. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # Add bias if given. # TODO(takagi) Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def _backward_cudnn(self, x, W, b, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) # Make empty arrays for result. gx = cuda.cupy.empty_like(x) gW = cuda.cupy.empty_like(W) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) # Compute gradients. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') # Compute filter weight gradient. if configuration.config.autotune and _cudnn_version_ >= 5000: algo = convolution_2d.get_algorithm_bwd_filter( x, gy, gW, self.conv_param, handle, x_desc, gy_desc, self.conv_desc, self.filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) # Compute input gradient. algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) # Compute bias gradient if given and return gradients. if b is None: return gx, gW else: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) return gx, gW, gb
def _backward_cudnn(self, x, W, b, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Make empty arrays for results. gx = cuda.cupy.empty_like(x) gW = cuda.cupy.empty_like(W) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # Chance to choose implicit-precom-gemm algorithm. workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') # Compute input gradient. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # Compute bias gradient. if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) # Compute filter gradient. algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if b is None: return gx, gW else: return gx, gW, gb
def _forward_cudnn(self, x, W, b): c = W.shape[1] # W: C_I, C_O, k_1, k_2, ..., k_N ksize = W.shape[2:] n, in_c = x.shape[:2] # x: n, C_I, d_1, d_2, ..., d_N dims = x.shape[2:] ndim = self.ndim colon = slice(None) # Make empty array for output. if self.outs is None: self.outs = tuple( conv.get_deconv_outsize(d, k, s, p) for d, k, s, p in zip(dims, ksize, self.stride, self.pad)) assert all(out > 0 for out in self.outs), \ 'Output sizes should be positive.' y_shape = (n, c) + self.outs # (n, c_O, out_1, out_2, ..., out_N) y = cuda.cupy.empty(y_shape, dtype=x.dtype) # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( self.pad, self.stride, x.dtype) if b is not None: b_index = (None, colon) + (None, ) * ndim self.bias_desc = cudnn.create_tensor_descriptor(b[b_index]) # cuDNN forward computation. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # Add bias if given. # TODO(takagi) Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def _backward_cudnn(self, x, W, b, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Make empty arrays for results. gx = cuda.cupy.empty_like(x) gW = cuda.cupy.empty_like(W) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # Chance to choose implicit-precom-gemm algorithm. workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size, ), dtype='b') # Compute input gradient. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # Compute bias gradient. if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) # Compute filter gradient. algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if b is None: return gx, gW else: return gx, gW, gb
def test_size(self): size = 1024 cuda.set_max_workspace_size(size) self.assertEqual(size, cuda.get_max_workspace_size())
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) if (self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes gx = cuda.cupy.empty_like(x) if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW_mat = gW.reshape(out_c, c * kh * kw) col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w) gy_mats = gy.reshape(n, out_c, out_h * out_w) # TODO(beam2d): Use streams or batch gemm gW_mat[...] = 0 for i in moves.range(n): gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T) W_mat = W.reshape(out_c, -1) Wb_mat = _kern()(W_mat) gcol = cuda.cupy.empty_like(self.col) gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w) for i in moves.range(n): gcol_mats[i] = cuda.cupy.dot(Wb_mat.T, gy_mats[i]) gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # only retain x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph, d=self.dy) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw, d=self.dx) assert self.outw > 0, 'Width in the output should be positive.' self._set_cover_all(x, W) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, filter_desc.value, x_desc.value, conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports # Tensor-Core in cuDNN7 algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 libcudnn.convolutionBackwardData_v3( handle, one.data, filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw, dy=self.dy, dx=self.dx) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def _forward_cudnn(self, x, W, b): out_c = W.shape[0] # (c_O, _, k_1, k_2, ..., k_N) ksize = W.shape[2:] n, c = x.shape[:2] # (n, c_I, d_1, d_2, ..., d_N) dims = x.shape[2:] stride = self.stride pad = self.pad ndim = self.ndim colon = slice(None) # Make empty array for result. outs = tuple( conv.get_conv_outsize(d, k, s, p, cover_all=self.cover_all) for (d, k, s, p) in zip(dims, ksize, stride, pad)) assert all(out > 0 for out in outs), 'Output sizes should be positive.' y_shape = (n, out_c) + outs # (n, c_O, out_1, out_2, ..., out_N) y = cuda.cupy.empty(y_shape, dtype=x.dtype) # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( pad, stride, x.dtype) if b is not None: b_index = (None, colon) + (None,) * ndim self.bias_desc = cudnn.create_tensor_descriptor(b[b_index]) # Find cuDNN algorithm to be used. workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) # cuDNN forward computation. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # Add bias if given. # TODO(takagi) Support unshared bias if b is not None: if _cudnn_version >= 3000 or ndim == 2: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # cuDNN v2 does not seem to support bias addition in spatial # dimensions other than two. b_index = (None, colon) + (None,) * ndim y += b[b_index] return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes gx = cuda.cupy.empty_like(x) if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW_mat = gW.reshape(out_c, c * kh * kw) col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w) gy_mats = gy.reshape(n, out_c, out_h * out_w) # TODO(beam2d): Use streams or batch gemm gW_mat[...] = 0 for i in moves.range(n): gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T) W_mat = W.reshape(out_c, -1) gcol = cuda.cupy.empty_like(self.col) gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w) for i in moves.range(n): gcol_mats[i] = cuda.cupy.dot(W_mat.T, gy_mats[i]) gx = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if _cudnn_version >= 4000: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( gy, kh, kw, self.sy, self.sx, self.ph, self.pw) W_mat = W.reshape(in_c, c * kh * kw) col_mats = col.reshape( n, c * kh * kw, in_h * in_w) gx_mats = gx.reshape(n, in_c, in_h * in_w) for i in moves.range(n): gx_mats[i] = W_mat.dot(col_mats[i]) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) # filter backward gW = cuda.cupy.zeros_like(W) gW_mat = gW.reshape(in_c, c * kh * kw) x_mats = x.reshape(n, in_c, in_h * in_w) for i in moves.range(n): gW_mat += x_mats[i].dot(col_mats[i].T) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] if self.bcoeffs is not None: olen, ilen, hlen, wlen = W.shape if self.coeffs is None: self.coeffs = numpy.ones(ilen) coeffs = numpy.copy(self.bcoeffs) coeffs = numpy.expand_dims(coeffs, 1) coeffs = numpy.expand_dims(coeffs, 1) coeffs = numpy.expand_dims(coeffs, 0) coeffs = numpy.broadcast_to(coeffs, W.shape) self.mW = cuda.cupy.asarray(coeffs,numpy.float32).reshape(W.shape) if self.ocoeffs is not None: coeffs = numpy.copy(self.ocoeffs) self.mb = cuda.cupy.asarray(coeffs,numpy.float32) W = self.M*W b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) gx = None if (not self.cover_all and chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 3000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if self.requires_x_grad: if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) gx = cuda.cupy.empty_like(x) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: if configuration.config.cudnn_deterministic: raise ValueError( "`cudnn_deterministic` option must be False " "if the backpropagation of " "chainer.functions.Convolution2D " "uses cuDNN and cuDNN versions < v3. " "Turn off cudnn_deterministic option with " "`chainer.using_config('cudnn_deterministic', False)` " "context.") libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) if self.requires_x_grad: gx = cuda.cupy.empty_like(x) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot( gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) if self.requires_x_grad: gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) # gW = self.mW * gW if hasattr(self,'mW'): gW = self.mW * gW if hasattr(self,'mb'): xp = cuda.get_array_module(*x) gW = xp.broadcast_to( xp.expand_dims(xp.expand_dims(xp.expand_dims(self.mb,1),1),1) ,gW.shape) * gW if b is None: return gx, gW else: if hasattr(self,'mb'): gb = self.mb * gb return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) gW = cuda.cupy.empty_like(W) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): pad_x = cuda.cupy.zeros( (n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 out_sh = out_h + (out_h - 1) * (self.sy - 1) out_sw = out_w + (out_w - 1) * (self.sx - 1) gy_ph = (h + dkh - out_sh - 1) / 2 gy_pw = (w + dkw - out_sw - 1) / 2 pad_gy = cuda.cupy.zeros( (n, out_c, h + dkh - 1, w + dkw - 1), dtype=x.dtype) pad_gy[:, :, gy_ph:gy_ph + out_sh:self.sy, gy_pw:gy_pw + out_sw:self.sx] = gy for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy:j * self.dy + out_h_s1, i * self.dx:i * self.dx + out_w_s1]) gyji = cuda.cupy.ascontiguousarray( pad_gy[:, :, j * self.dy:j * self.dy + h, i * self.dx:i * self.dx + w]) Wji = cuda.cupy.ascontiguousarray( W[:, :, -1::-1, -1::-1][:, :, j:j + 1, i:i + 1]) if i == 0 and j == 0: x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) xji_desc = cudnn.create_tensor_descriptor(xji) gy_desc = cudnn.create_tensor_descriptor(gy) gyji_desc = cudnn.create_tensor_descriptor(gyji) conv_desc_data = cudnn.create_convolution_descriptor( (0, 0), (1, 1), xji.dtype) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes gx = cuda.cupy.zeros_like(x) gWji = cuda.cupy.empty((out_c, c, 1, 1), dtype=W.dtype) if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty( (workspace_size,), dtype='b') algo_filter = ( libcudnn.getConvolutionBackwardFilterAlgorithm( handle, xji_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size)) algo_data = ( libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gyji_desc.value, conv_desc_data.value, x_desc.value, _bwd_data_pref, workspace_size)) if _cudnn_version >= 4000: libcudnn.convolutionBackwardFilter_v3( handle, one.data, xji_desc.value, xji.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo_filter, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gWji.data.ptr) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, Wji.data.ptr, gyji_desc.value, gyji.data.ptr, conv_desc_data.value, algo_data, workspace.data.ptr, workspace_size, one.data, x_desc.value, gx.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, xji_desc.value, xji.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gWji.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, Wji.data.ptr, gyji_desc.value, gyji.data.ptr, conv_desc_data.value, one.data, x_desc.value, gx.data.ptr) gW[:, :, j:j + 1, i:i + 1] = gWji if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot( gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w, dy=self.dy, dx=self.dx) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) gx = None if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if self.requires_x_grad: if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) gx = cuda.cupy.empty_like(x) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot(gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) if self.requires_x_grad: gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def _backward_cudnn(self, x, W, b, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) # Make empty arrays for result. gx = cuda.cupy.empty_like(x) gW = cuda.cupy.empty_like(W) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) # Compute gradients. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') # Compute filter weight gradient. algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) # Compute input gradient. algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: # Compute input and filter weight gradients. libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) # Compute bias gradient if given and return gradients. if b is None: return gx, gW elif _cudnn_version >= 3000 or self.ndim == 2: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) return gx, gW, gb else: # cuDNN v2 does not seem to support bias backward in spatial # dimensions other than two. # (n, _, out_1, out_2, ..., out_N) axis = (0, ) + tuple(moves.range(2, self.ndim + 2)) gb = gy.sum(axis=axis) return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) if cuda.cudnn_enabled and self.use_cudnn: x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=numpy.float32) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx)) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) one = numpy.array(1, dtype=x.dtype).ctypes zero = numpy.array(0, dtype=x.dtype).ctypes if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) else: libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: W_mat = W.reshape(in_c, c * kh * kw) x_mats = x.reshape(n, in_c, in_h * in_w) gcol = cuda.cupy.empty( (n, c, kh, kw, in_h, in_w), dtype=numpy.float32) gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w) for i in moves.range(n): cuda.cupy.dot(W_mat.T, x_mats[i], gcol_mats[i]) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) if (chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size, ), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if _cudnn_version >= 3000: if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: if configuration.config.cudnn_deterministic: raise ValueError( "`cudnn_deterministic` option must be False " "if the backpropagation of " "chainer.functions.Deconvolution2D " "uses cuDNN and cuDNN versions < v3. " "Turn off cudnn_deterministic option with " "`chainer.using_config('cudnn_deterministic', False)` " "context.") libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu(gy, kh, kw, self.sy, self.sx, self.ph, self.pw) gW = cuda.cupy.tensordot(x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False) gx = cuda.cupy.tensordot(col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False) gx = cuda.cupy.rollaxis(gx, 3, 1) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) assert self.outw > 0, 'Width in the output should be positive.' if (chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 3000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) else: if configuration.config.cudnn_deterministic: raise ValueError( "`cudnn_deterministic` option must be False " "if the forward propagation of " "chainer.functions.Deconvolution2D " "uses cuDNN and cuDNN versions < v3. " "Turn off cudnn_deterministic option with " "`chainer.using_config('cudnn_deterministic', False)` " "context.") libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) assert self.outw > 0, 'Width in the output should be positive.' if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype: x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def setUp(self): self.space = cuda.get_max_workspace_size()
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy:j * self.dy + out_h_s1, i * self.dx:i * self.dx + out_w_s1]) Wji = cuda.cupy.ascontiguousarray(W[:, :, j:j + 1, i:i + 1]) if i == 0 and j == 0: handle = cudnn.get_handle() xji_desc = cudnn.create_tensor_descriptor(xji) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(Wji) self.conv_desc = cudnn.create_convolution_descriptor( (0, 0), (self.sy, self.sx), xji.dtype) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, xji_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, xji_desc.value, xji.data.ptr, self.filter_desc.value, Wji.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, one.data, y_desc.value, y.data.ptr) if b is not None: b = cuda.cupy.ascontiguousarray(b) self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot(self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = None if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype: gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size, ), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu(gy, kh, kw, self.sy, self.sx, self.ph, self.pw) gW = cuda.cupy.tensordot(x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False) if self.requires_x_grad: gx = cuda.cupy.tensordot( col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False) gx = cuda.cupy.rollaxis(gx, 3, 1) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) gx = None if (not self.cover_all and chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 3000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if self.requires_x_grad: if configuration.config.cudnn_deterministic: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) gx = cuda.cupy.empty_like(x) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: if configuration.config.cudnn_deterministic: raise ValueError( "`cudnn_deterministic` option must be False " "if the backpropagation of " "chainer.functions.Convolution2D " "uses cuDNN and cuDNN versions < v3. " "Turn off cudnn_deterministic option with " "`chainer.using_config('cudnn_deterministic', False)` " "context.") libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) if self.requires_x_grad: gx = cuda.cupy.empty_like(x) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot( gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) if self.requires_x_grad: gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) y = cuda.cupy.tensordot( self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def _forward_cudnn(self, x, W, b): out_c = W.shape[0] # (c_O, _, k_1, k_2, ..., k_N) ksize = W.shape[2:] n, c = x.shape[:2] # (n, c_I, d_1, d_2, ..., d_N) dims = x.shape[2:] stride = self.stride pad = self.pad ndim = self.ndim colon = slice(None) # Make empty array for result. outs = tuple( conv.get_conv_outsize(d, k, s, p, cover_all=self.cover_all) for (d, k, s, p) in zip(dims, ksize, stride, pad)) assert all(out > 0 for out in outs), 'Output sizes should be positive.' y_shape = (n, out_c) + outs # (n, c_O, out_1, out_2, ..., out_N) y = cuda.cupy.empty(y_shape, dtype=x.dtype) # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_param = (pad, stride, x.dtype) self.conv_desc = cudnn.create_convolution_descriptor(*self.conv_param) if b is not None: b_index = (None, colon) + (None,) * ndim self.bias_desc = cudnn.create_tensor_descriptor(b[b_index]) # Find cuDNN algorithm to be used. workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.autotune and _cudnn_version_ >= 5000: algo = convolution_2d.get_algorithm_fwd( x, W, y, self.conv_param, handle, x_desc, self.filter_desc, self.conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) # cuDNN forward computation. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # Add bias if given. # TODO(takagi) Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # retain only x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.autotune and _cudnn_version >= 5000: algo = get_algorithm_fwd( x, W, y, conv_param, handle, x_desc, filter_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # supports Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # NOQA oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot( col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy:j * self.dy + out_h_s1, i * self.dx:i * self.dx + out_w_s1]) Wji = cuda.cupy.ascontiguousarray( W[:, :, j:j + 1, i:i + 1]) if i == 0 and j == 0: handle = cudnn.get_handle() xji_desc = cudnn.create_tensor_descriptor(xji) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(Wji) self.conv_desc = cudnn.create_convolution_descriptor( (0, 0), (self.sy, self.sx), xji.dtype) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty( (workspace_size,), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, xji_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, xji_desc.value, xji.data.ptr, self.filter_desc.value, Wji.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, one.data, y_desc.value, y.data.ptr) if b is not None: b = cuda.cupy.ascontiguousarray(b) self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot( self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape if (self.cover_all or not chainer.should_use_cudnn('>=auto') or x.dtype != self.W_dtype or ((self.dy > 1 or self.dx > 1) and _cudnn_version < 6000)): col = conv.im2col_gpu( x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) gW = cuda.cupy.tensordot( gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype, copy=False) return gW, gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 elif configuration.config.autotune and _cudnn_version >= 5000: algo = get_algorithm_bwd_filter( x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports # Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None out_c, _, kh, kw = W.shape n, c, h, w = x.shape dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype) if ( not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype) ): pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph : self.ph + h, self.pw : self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy : j * self.dy + out_h_s1, i * self.dx : i * self.dx + out_w_s1] ) Wji = cuda.cupy.ascontiguousarray(W[:, :, j : j + 1, i : i + 1]) if i == 0 and j == 0: handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(xji) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(Wji) self.conv_desc = cudnn.create_convolution_descriptor((0, 0), (self.sy, self.sx), xji.dtype) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype="b") algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size, ) oz_dtype = "d" if x.dtype == "d" else "f" one = numpy.array(1, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, xji.data.ptr, self.filter_desc.value, Wji.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, one.data, y_desc.value, y.data.ptr, ) if b is not None: b = cuda.cupy.ascontiguousarray(b) self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx ) W_mat = W.reshape(out_c, -1) col_mats = self.col.reshape(n, -1, out_h * out_w) y_mats = y.reshape(n, out_c, -1) # TODO(beam2d): Use streams or batch gemm for i in moves.range(n): y_mats[i] = W_mat.dot(col_mats[i]) # TODO(beam2d): Support unshared bias if b is not None: y += b[:, None, None] return (y,)
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) assert self.outw > 0, 'Width in the output should be positive.' if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) else: libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: W_mat = W.reshape(in_c, c * kh * kw) x_mats = x.reshape(n, in_c, in_h * in_w) gcol = cuda.cupy.empty( (n, c, kh, kw, in_h, in_w), dtype=x.dtype) gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w) for i in moves.range(n): gcol_mats[i] = cuda.cupy.dot(W_mat.T, x_mats[i]) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if _cudnn_version >= 4000: if not self.deterministic: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: if self.deterministic: raise ValueError("'deterministic' option not available " "for cuDNN versions < v4") libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( gy, kh, kw, self.sy, self.sx, self.ph, self.pw) gW = cuda.cupy.tensordot( x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False) gx = cuda.cupy.tensordot( col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False) gx = cuda.cupy.rollaxis(gx, 3, 1) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all) y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx)) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) W_mat = W.reshape(out_c, -1) col_mats = self.col.reshape(n, -1, out_h * out_w) y_mats = y.reshape(n, out_c, -1) # TODO(beam2d): Use streams or batch gemm for i in moves.range(n): y_mats[i] = W_mat.dot(col_mats[i]) # TODO(beam2d): Support unshared bias if b is not None: y += b[:, None, None] return y,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) assert self.outw > 0, 'Width in the output should be positive.' if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if not self.deterministic: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) else: libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # retain only x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # supports Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # NOQA oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot(col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if _cudnn_version >= 4000: if not self.deterministic: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: if self.deterministic: raise ValueError("'deterministic' option not available " "for cuDNN versions < v4") libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( gy, kh, kw, self.sy, self.sx, self.ph, self.pw) W_mat = W.reshape(in_c, c * kh * kw) col_mats = col.reshape( n, c * kh * kw, in_h * in_w) gx_mats = gx.reshape(n, in_c, in_h * in_w) for i in moves.range(n): gx_mats[i] = W_mat.dot(col_mats[i]) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) # filter backward gW = cuda.cupy.zeros_like(W) gW_mat = gW.reshape(in_c, c * kh * kw) x_mats = x.reshape(n, in_c, in_h * in_w) for i in moves.range(n): gW_mat += x_mats[i].dot(col_mats[i].T) if b is None: return gx, gW else: return gx, gW, gb
def _backward_cudnn(self, x, W, b, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Make empty arrays for results. gx = cuda.cupy.empty_like(x) gW = cuda.cupy.empty_like(W) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # Chance to choose implicit-precom-gemm algorithm. workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') # Compute input gradient. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # Compute bias gradient. if b is not None: if _cudnn_version >= 3000 or self.ndim == 2: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: # cuDNN v2 does not seem to support bias backward in spatial # dimensions other than two. # (n, _, out_1, out_2, ..., out_N) axis = (0,) + tuple(six.moves.range(2, self.ndim + 2)) gb = gy.sum(axis=axis) # Compute filter gradient. if _cudnn_version >= 4000: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw) assert self.outw > 0, 'Width in the output should be positive.' if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if not self.deterministic: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, x_desc.value, self.conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) else: libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, self.conv_desc.value, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: W_mat = W.reshape(in_c, c * kh * kw) x_mats = x.reshape(n, in_c, in_h * in_w) gcol = cuda.cupy.empty( (n, c, kh, kw, in_h, in_w), dtype=x.dtype) gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w) for i in moves.range(n): gcol_mats[i] = cuda.cupy.dot(W_mat.T, x_mats[i]) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all) y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx)) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col Xb = _kern()(x) self.col = conv.im2col_gpu(Xb, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) W_mat = W.reshape(out_c, -1) col_mats = self.col.reshape(n, -1, out_h * out_w) Wb_mat = _kern()(W_mat) y_mats = y.reshape(n, out_c, -1) # TODO(beam2d): Use streams or batch gemm for i in moves.range(n): y_mats[i] = Wb_mat.dot(col_mats[i]) # TODO(beam2d): Support unshared bias if b is not None: y += b[:, None, None] return y,
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None out_c, _, kh, kw = W.shape n, c, h, w = x.shape dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy:j * self.dy + out_h_s1, i * self.dx:i * self.dx + out_w_s1]) Wji = cuda.cupy.ascontiguousarray(W[:, :, j:j + 1, i:i + 1]) if i == 0 and j == 0: handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(xji) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(Wji) self.conv_desc = cudnn.create_convolution_descriptor( (0, 0), (self.sy, self.sx), xji.dtype) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, xji.data.ptr, self.filter_desc.value, Wji.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, one.data, y_desc.value, y.data.ptr) if b is not None: b = cuda.cupy.ascontiguousarray(b) self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) W_mat = W.reshape(out_c, -1) col_mats = self.col.reshape(n, -1, out_h * out_w) y_mats = y.reshape(n, out_c, -1) # TODO(beam2d): Use streams or batch gemm for i in moves.range(n): y_mats[i] = W_mat.dot(col_mats[i]) # TODO(beam2d): Support unshared bias if b is not None: y += b[:, None, None] return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) if (cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if _cudnn_version >= 3000: if not self.deterministic: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: if self.deterministic: raise ValueError("'deterministic' option not available " "for cuDNN versions < v3") libcudnn.convolutionBackwardFilter_v2( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( gy, kh, kw, self.sy, self.sx, self.ph, self.pw) gW = cuda.cupy.tensordot( x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False) gx = cuda.cupy.tensordot( col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False) gx = cuda.cupy.rollaxis(gx, 3, 1) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] n, in_c, in_h, in_w = x.shape _, out_channels, kh, kw = W.shape c, h, w = gy.shape[1:] gx = None if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype: gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype) x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() gy_desc = cudnn.create_tensor_descriptor(gy) gx_desc = cudnn.create_tensor_descriptor(gx) # chance to choose implicit-precomp-gemm algorithm workspace_size = cuda.get_max_workspace_size() algo = libcudnn.getConvolutionForwardAlgorithm( handle, gy_desc.value, self.filter_desc.value, self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size) workspace = cuda.cupy.empty((workspace_size,), dtype='b') oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, gy_desc.value, gy.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, gx_desc.value, gx.data.ptr) # bias backward if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) gW = cuda.cupy.empty_like(W) # filter backward if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, gy_desc.value, gx_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, gy_desc.value, gy.data.ptr, gx_desc.value, x.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( gy, kh, kw, self.sy, self.sx, self.ph, self.pw) gW = cuda.cupy.tensordot( x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False) if self.requires_x_grad: gx = cuda.cupy.tensordot( col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False) gx = cuda.cupy.rollaxis(gx, 3, 1) # bias backward if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes gx = cuda.cupy.empty_like(x) if _cudnn_version >= 3000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if not self.deterministic: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if not self.deterministic: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: if self.deterministic: raise ValueError("'deterministic' option not available " "for cuDNN versions < v3") libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot(gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1) gW = cuda.cupy.empty_like(W) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and _check_cudnn_acceptable_type(x.dtype, W.dtype)): pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype) pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x out_h_s1 = h + 2 * self.ph - dkh + 1 out_w_s1 = w + 2 * self.pw - dkw + 1 out_sh = out_h + (out_h - 1) * (self.sy - 1) out_sw = out_w + (out_w - 1) * (self.sx - 1) gy_ph = (h + dkh - out_sh - 1) / 2 gy_pw = (w + dkw - out_sw - 1) / 2 pad_gy = cuda.cupy.zeros((n, out_c, h + dkh - 1, w + dkw - 1), dtype=x.dtype) pad_gy[:, :, gy_ph:gy_ph + out_sh:self.sy, gy_pw:gy_pw + out_sw:self.sx] = gy gx = None for j in moves.range(kh): for i in moves.range(kw): xji = cuda.cupy.ascontiguousarray( pad_x[:, :, j * self.dy:j * self.dy + out_h_s1, i * self.dx:i * self.dx + out_w_s1]) gyji = cuda.cupy.ascontiguousarray( pad_gy[:, :, j * self.dy:j * self.dy + h, i * self.dx:i * self.dx + w]) Wji = cuda.cupy.ascontiguousarray(W[:, :, -1::-1, -1::-1][:, :, j:j + 1, i:i + 1]) if i == 0 and j == 0: x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) xji_desc = cudnn.create_tensor_descriptor(xji) gy_desc = cudnn.create_tensor_descriptor(gy) gyji_desc = cudnn.create_tensor_descriptor(gyji) conv_desc_data = cudnn.create_convolution_descriptor( (0, 0), (1, 1), xji.dtype) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes if self.requires_x_grad: gx = cuda.cupy.zeros_like(x) gWji = cuda.cupy.empty((out_c, c, 1, 1), dtype=W.dtype) if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo_filter = ( libcudnn.getConvolutionBackwardFilterAlgorithm( handle, xji_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size)) algo_data = ( libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gyji_desc.value, conv_desc_data.value, x_desc.value, _bwd_data_pref, workspace_size)) if _cudnn_version >= 4000: libcudnn.convolutionBackwardFilter_v3( handle, one.data, xji_desc.value, xji.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo_filter, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gWji.data.ptr) else: libcudnn.convolutionBackwardFilter_v2( handle, one.data, xji_desc.value, xji.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gWji.data.ptr) if self.requires_x_grad: if _cudnn_version >= 4000: libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, Wji.data.ptr, gyji_desc.value, gyji.data.ptr, conv_desc_data.value, algo_data, workspace.data.ptr, workspace_size, one.data, x_desc.value, gx.data.ptr) else: libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, Wji.data.ptr, gyji_desc.value, gyji.data.ptr, conv_desc_data.value, one.data, x_desc.value, gx.data.ptr) gW[:, :, j:j + 1, i:i + 1] = gWji if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias(handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot(gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) if not self.requires_x_grad: gx = None else: gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w, dy=self.dy, dx=self.dx) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb
def forward_gpu(self, inputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not type_check.same_types(*inputs): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype) if b is not None: self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col self.col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) y = cuda.cupy.tensordot(self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def _forward_cudnn(self, x, W, b): out_c = W.shape[0] # (c_O, _, k_1, k_2, ..., k_N) ksize = W.shape[2:] n, c = x.shape[:2] # (n, c_I, d_1, d_2, ..., d_N) dims = x.shape[2:] stride = self.stride pad = self.pad ndim = self.ndim colon = slice(None) # Make empty array for result. outs = tuple( conv.get_conv_outsize(d, k, s, p, cover_all=self.cover_all) for (d, k, s, p) in zip(dims, ksize, stride, pad)) y_shape = (n, out_c) + outs # (n, c_O, out_1, out_2, ..., out_N) y = cuda.cupy.empty(y_shape, dtype=x.dtype) # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) # Get cuDNN handler and descriptors. handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) self.filter_desc = cudnn.create_filter_descriptor(W) self.conv_desc = cudnn.create_convolution_descriptor( pad, stride, x.dtype) if b is not None: b_index = (None, colon) + (None, ) * ndim self.bias_desc = cudnn.create_tensor_descriptor(b[b_index]) # Find cuDNN algorithm to be used. workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm(handle, x_desc.value, self.filter_desc.value, self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size) # cuDNN forward computation. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, self.filter_desc.value, W.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # Add bias if given. # TODO(takagi) Support unshared bias if b is not None: if _cudnn_version >= 3000 or ndim == 2: cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # cuDNN v2 does not seem to support bias addition in spatial # dimensions other than two. b_index = (None, colon) + (None, ) * ndim y += b[b_index] return y,
def backward_gpu(self, inputs, grad_outputs): x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None gy = grad_outputs[0] _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape kh, kw = W.shape[2:] gW = cuda.cupy.empty_like(W) if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and _check_cudnn_acceptable_type(x.dtype, W.dtype)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) gy = cuda.cupy.ascontiguousarray(gy) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes gx = cuda.cupy.empty_like(x) if _cudnn_version >= 4000: workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if not self.deterministic: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, self.conv_desc.value, self.filter_desc.value, _bwd_filter_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 # NOQA libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, self.filter_desc.value, gW.data.ptr) if not self.deterministic: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, self.filter_desc.value, gy_desc.value, self.conv_desc.value, x_desc.value, _bwd_data_pref, workspace_size) else: algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 # NOQA libcudnn.convolutionBackwardData_v3( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, x_desc.value, gx.data.ptr) else: if self.deterministic: raise ValueError("'deterministic' option not available " "for cuDNN versions < v4") libcudnn.convolutionBackwardFilter_v2( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, self.filter_desc.value, gW.data.ptr) libcudnn.convolutionBackwardData_v2( handle, one.data, self.filter_desc.value, W.data.ptr, gy_desc.value, gy.data.ptr, self.conv_desc.value, zero.data, x_desc.value, gx.data.ptr) if b is not None: gb = cuda.cupy.empty_like(b) libcudnn.convolutionBackwardBias( handle, one.data, gy_desc.value, gy.data.ptr, zero.data, self.bias_desc.value, gb.data.ptr) else: gW = cuda.cupy.tensordot( gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False) gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype, copy=False) gcol = cuda.cupy.rollaxis(gcol, 3) gx = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, h, w) if b is not None: gb = gy.sum(axis=(0, 2, 3)) if b is None: return gx, gW else: return gx, gW, gb