def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        gy_arr = gy[0]
        if not gy_arr.flags.c_contiguous:
            gy_arr = cuda.cupy.ascontiguousarray(gy_arr)
        gy_desc = cudnn.create_tensor_descriptor(gy_arr)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        if self.b is not None:
            libcudnn.convolutionBackwardBias(
                handle, one, gy_desc.value, gy_arr.data.ptr,
                one, self.bias_desc.value, self.gb.data.ptr)

        libcudnn.convolutionBackwardFilter(
            handle, one, x_desc.value, x[0].data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            one, self.filter_desc.value, self.gW.data.ptr)

        gx = cuda.empty_like(x[0])
        libcudnn.convolutionBackwardData(
            handle, one, self.filter_desc.value, self.W.data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            zero, x_desc.value, gx.data.ptr)
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            self.gb += gy[0].sum(axis=(0, 2, 3))

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
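# Hedged reference (added for illustration; not part of the original source):
# a NumPy sketch of what the non-cuDNN branch above computes per sample,
# assuming `col` is the im2col buffer of shape (n, c*kh*kw, out_h*out_w) and
# `gy` has shape (n, out_c, out_h, out_w); all names here are illustrative.
import numpy


def conv2d_backward_reference(W, col, gy):
    n, out_c = gy.shape[:2]
    W_mat = W.reshape(out_c, -1)
    gy_mats = gy.reshape(n, out_c, -1)
    # Weight gradient: accumulate gy_i . col_i^T over the batch.
    gW = sum(gy_mats[i].dot(col[i].T) for i in range(n)).reshape(W.shape)
    # Column gradient: W^T . gy_i per sample; col2im then folds it into gx.
    gcol = numpy.stack([W_mat.T.dot(gy_mats[i]) for i in range(n)])
    # Bias gradient: sum over the batch and spatial axes.
    gb = gy.sum(axis=(0, 2, 3))
    return gW, gcol, gb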
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            0, x_desc.value, cudnn.get_ptr(gx))
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                self.gb += tmp

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def _partial_reduce(x):
    global _one
    out_axis, sum_axis = x.shape
    one = _one
    if one is None or one.size < sum_axis:
        one = cuda.ones(sum_axis)
        _one = one
    one = one[:sum_axis]
    handle = cuda.get_cublas_handle()
    ret = cuda.empty(out_axis)
    cuda.cublas.cublasSgemv(
        handle, 't', sum_axis, out_axis,
        numpy.float32(1.0), x.gpudata, sum_axis, one.gpudata, 1,
        numpy.float32(0.0), ret.gpudata, 1)
    return ret
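# Hedged note (added for illustration): the Sgemv call above multiplies the
# row-major (out_axis, sum_axis) matrix, reinterpreted by cuBLAS as its
# column-major transpose, with a vector of ones, which amounts to a row-wise
# sum.  A NumPy equivalent would be:
import numpy


def partial_reduce_reference(x):
    # x: (out_axis, sum_axis) -> (out_axis,), same as x.sum(axis=1)
    return x.dot(numpy.ones(x.shape[1], dtype=x.dtype))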
def forward_gpu(self, x):
    n, out_c, out_h, out_w = x[0].shape
    c = self.W.shape[1]
    h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
    w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
        y = cuda.empty((n, c, h, w), dtype=numpy.float32)
        y_desc = cudnn.get_tensor_desc(y, h, w)

        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            0, y_desc.value, cudnn.get_ptr(y))
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        handle = cuda.get_cublas_handle()
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w),
                          dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, x_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, c, h * w)

    return y,
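# Hedged sketch (added for illustration; the formula and names below are
# assumptions, not taken from the original source): get_deconv_outsize
# presumably inverts the convolution output-size formula, roughly
# h = sy * (out_h - 1) + kh - 2 * ph.  The non-cuDNN branch then amounts to
# applying W^T per sample followed by col2im, e.g.:
import numpy


def deconv2d_forward_reference(W, x):
    # W: (in_c, c, kh, kw); x: (n, in_c, out_h, out_w)
    n, in_c = x.shape[:2]
    W_mat = W.reshape(in_c, -1)
    x_mats = x.reshape(n, in_c, -1)
    # gcol[i] = W^T . x[i]; a col2im step would scatter-add it into y.
    return numpy.stack([W_mat.T.dot(x_mats[i]) for i in range(n)])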
def _batch_matmul_gpu(a, b, out, transa=False, transb=False, transout=False):
    if transout:
        # (A B)^T = B^T A^T
        a, b, transa, transb = b, a, not transb, not transa
    a = _as_batch_mat(a)
    b = _as_batch_mat(b)
    alpha = numpy.float32(1.0)
    beta = numpy.float32(0.0)
    l, m, k = a.shape
    if transa:
        m, k = k, m
    n = b.shape[1] if transb else b.shape[2]
    return cuda.cublas.cublasSgemmBatched(
        cuda.get_cublas_handle(),
        _as_trans_op(transb), _as_trans_op(transa),
        n, m, k, alpha,
        _mat_ptrs(b).gpudata, k if transb else n,
        _mat_ptrs(a).gpudata, m if transa else k,
        beta, _mat_ptrs(out).gpudata, n, l)
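# Hedged reference (added for illustration): a NumPy equivalent of the batched
# GEMM above, including the (A B)^T = B^T A^T identity used for transout.
import numpy


def batch_matmul_reference(a, b, transa=False, transb=False, transout=False):
    a = a.transpose(0, 2, 1) if transa else a
    b = b.transpose(0, 2, 1) if transb else b
    out = numpy.einsum('bij,bjk->bik', a, b)
    return out.transpose(0, 2, 1) if transout else out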
def backward_gpu(self, x, gy):
    n, out_c, out_h, out_w = x[0].shape
    c, h, w = gy[0].shape[1:]
    gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
        gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, gx_desc.value, cudnn.get_ptr(gx))

        # bias backward
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        # filter backward
        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gx_mats = gx.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i],
                              handle=handle, out=gx_mats[i])

        # bias backward
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * c, h * w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                self.gb += tmp

        # filter backward
        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

    return gx,
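# Hedged note (added for illustration): the data gradient of a deconvolution
# is an ordinary convolution of gy with W, which is why the cuDNN branch above
# reuses cudnnConvolutionForward.  A per-sample NumPy reference, assuming
# `col` is the im2col buffer of gy with shape (n, c*kh*kw, out_h*out_w):
import numpy


def deconv2d_backward_data_reference(W, col):
    n = col.shape[0]
    W_mat = W.reshape(W.shape[0], -1)  # (in_c, c*kh*kw)
    return numpy.stack([W_mat.dot(col[i]) for i in range(n)])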
def forward_gpu(self, x):
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]
    y = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        y_desc = cudnn.get_tensor_desc(y, out_h, out_w)

        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, y_desc.value, cudnn.get_ptr(y))

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i],
                              handle=handle, out=y_mats[i])

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, out_c, out_h * out_w)

    return y,
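# Hedged reference (added for illustration; names are not from the original
# source): the non-cuDNN branch is the classic im2col formulation; per sample,
# y[i] = W_mat . col[i], plus a bias broadcast over the spatial axes.
import numpy


def conv2d_forward_reference(W, col, b=None):
    # W: (out_c, c, kh, kw); col: (n, c*kh*kw, out_h*out_w)
    n = col.shape[0]
    W_mat = W.reshape(W.shape[0], -1)
    y = numpy.stack([W_mat.dot(col[i]) for i in range(n)])
    if b is not None:
        y += b[None, :, None]  # shared bias per output channel
    return y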
import numpy as np

from chainer import cuda

try:
    handle = cuda.get_cublas_handle()
except Exception:
    # No CUDA device / cuBLAS available; leave the handle unset.
    handle = None


def dot(A, B, out, transa='n', transb='n', alpha=np.float32(1.0)):
    """Thin wrapper around the BLAS routine Sgemm: out = alpha * A.dot(B).

    The default alpha is 1 and beta is fixed to 0.
    """
    beta = np.float32(0.0)
    if transa == 't':
        l, n = A.shape
    else:
        n, l = A.shape
    if transb == 't':
        m, k = B.shape
    else:
        k, m = B.shape
    if l != k:
        raise ValueError('objects are not aligned')
    if out.shape != (n, m) or out.dtype != A.dtype:
        raise ValueError('invalid value for out')
    # cuBLAS expects column-major matrices, so compute out^T = B^T A^T by
    # passing the operands in reverse order.
    cuda.cublas.cublasSgemm(handle, transb, transa, m, n, k, alpha,
                            B.gpudata, np.int32(B.shape[1]),
                            A.gpudata, np.int32(A.shape[1]),
                            beta, out.gpudata, np.int32(out.shape[1]))
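# Hedged check (added for illustration): the operand swap in dot() relies on
# the identity (A B)^T = B^T A^T, which lets a column-major GEMM act on
# row-major arrays.  A quick NumPy confirmation:
if __name__ == '__main__':
    A = np.random.randn(4, 3).astype(np.float32)
    B = np.random.randn(3, 5).astype(np.float32)
    assert np.allclose(A.dot(B).T, B.T.dot(A.T))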