def forward_gpu(self, inputs):
    x, targets = inputs
    N = x.shape[0]

    # Linear function
    z = cuda.empty((N, self.no_labels), dtype=np.float32)
    cuk.dot(x, self.W, out=z, transb='t')
    if not self.nobias:
        cuk.addVec2Mat(z, self.b)

    self.probs = z
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(z, 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(z),
            0, desc.value, cudnn.get_ptr(self.probs))
    else:
        cuk.softmax(z, self.probs)

    if self.return_probs:
        return self.probs,

    if self.compute_loss:
        correct_probs = cuda.empty((N,), dtype=np.float32)
        cuk.getByIndex_LogAndClip(
            self.probs, targets, out=correct_probs)
        loss = -cuda.cumisc.sum(correct_probs, keepdims=True) / N
    else:
        loss = np.atleast_2d(np.array(np.nan, dtype=np.float32))
    return loss,
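# --- Reference sketch (not part of the original module) ---------------------
# A minimal NumPy version of the computation above: linear function, softmax,
# and mean negative log-likelihood. It assumes x is (N, in_dim), self.W is
# (no_labels, in_dim) (hence transb='t' above), targets holds integer labels,
# and getByIndex_LogAndClip clips probabilities before taking the log. The
# function name and the clip threshold are illustrative assumptions.
import numpy


def softmax_cross_entropy_reference(x, W, b, targets, eps=1e-8):
    z = x.dot(W.T)                                  # linear function
    if b is not None:
        z += b
    z -= z.max(axis=1, keepdims=True)               # stabilize the exponent
    probs = numpy.exp(z)
    probs /= probs.sum(axis=1, keepdims=True)
    n = x.shape[0]
    correct = numpy.clip(probs[numpy.arange(n), targets], eps, 1.0)
    return -numpy.log(correct).sum() / n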
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value,
            cudnn.get_ptr(self.y), desc.value, cudnn.get_ptr(gy[0]),
            0, desc.value, cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]
        c = gx.shape[1]
        sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
               const float* row = ydy + i * c;
               float sum = 0;
               for (int j = 0; j < c; ++j) {
                 sum += row[j];
               }
               sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
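# --- Reference sketch (not part of the original module) ---------------------
# The non-cuDNN branch above computes, per row of a softmax output y,
#   gx = y * gy - y * sum_j(y_j * gy_j),
# which is what the two kernels ('softmax_bwd_sum_ydy' and 'softmax_bwd_diff')
# do in two passes. A NumPy equivalent (the function name is illustrative;
# summing over axis 1 also covers the channel-wise variant further below):
import numpy


def softmax_backward_reference(y, gy):
    gx = y * gy
    return gx - y * gx.sum(axis=1, keepdims=True)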
def forward_gpu(self, x):
    self.y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnActivationForward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(self.y))
    else:
        cuda.elementwise(
            'float* y, const float* x', 'y[i] = tanhf(x[i])',
            'tanh_fwd')(self.y, x[0])
    return self.y,
def forward_gpu(self, x):
    self.y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnActivationForward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(self.y))
    else:
        cuda.elementwise(
            'float* y, const float* x', 'y[i] = 1 / (1 + __expf(-x[i]))',
            'sigmoid_fwd')(self.y, x[0])
    return self.y,
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnActivationForward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        cuda.elementwise(
            'float* y, const float* x', 'y[i] = max(0.f, x[i])',
            'relu_fwd')(y, x[0])
    return y,
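# --- Reference sketch (not part of the original module) ---------------------
# NumPy equivalents of the element-wise kernels used by the activation
# forwards above ('tanh_fwd', 'sigmoid_fwd', 'relu_fwd'), handy for checking
# GPU outputs on small arrays. Function names are illustrative.
import numpy


def tanh_forward_reference(x):
    return numpy.tanh(x)


def sigmoid_forward_reference(x):
    return 1 / (1 + numpy.exp(-x))


def relu_forward_reference(x):
    return numpy.maximum(x, 0)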
def backward_gpu(self, x, gy):
    # Implementation using cudnn
    handle = cudnn.get_default_handle()
    pool_desc = self.create_pool_desc()
    x_desc = cudnn.get_tensor_desc(x[0], x[0].shape[2], x[0].shape[3])
    y_desc = cudnn.get_tensor_desc(gy[0], gy[0].shape[2], gy[0].shape[3])

    gx = cuda.empty_like(x[0])
    libcudnn.cudnnPoolingBackward(
        handle, pool_desc.value, 1, y_desc.value, cudnn.get_ptr(self.y),
        y_desc.value, cudnn.get_ptr(gy[0]), x_desc.value,
        cudnn.get_ptr(x[0]), 0, x_desc.value, cudnn.get_ptr(gx))
    return gx,
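# --- Reference sketch (not part of the original module) ---------------------
# cudnnPoolingBackward is a black box here, so this is a naive NumPy sketch of
# what it computes for MAX pooling: route each gradient to the location(s)
# that produced the window maximum. It assumes the usual output-size formula
# (cover_all=False); ties send the gradient to every tied position, whereas
# cuDNN picks a single one. Names and signature are illustrative.
import numpy


def max_pool2d_backward_reference(x, gy, kh, kw, sy, sx, ph, pw):
    n, c, h, w = x.shape
    _, _, y_h, y_w = gy.shape
    xp = numpy.full((n, c, h + 2 * ph, w + 2 * pw), -numpy.inf, dtype=x.dtype)
    xp[:, :, ph:ph + h, pw:pw + w] = x
    gxp = numpy.zeros_like(xp)
    for i in range(y_h):
        for j in range(y_w):
            window = xp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw]
            mask = window == window.max(axis=(2, 3), keepdims=True)
            gxp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw] += (
                mask * gy[:, :, i, j][:, :, None, None])
    return gxp[:, :, ph:ph + h, pw:pw + w]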
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(self.y, 1, 1)
        libcudnn.cudnnActivationBackward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(self.y),
            desc.value, cudnn.get_ptr(gy[0]), desc.value,
            cudnn.get_ptr(x[0]), 0, desc.value, cudnn.get_ptr(gx))
    else:
        cuda.elementwise(
            'float* gx, const float* y, const float* gy',
            'gx[i] = gy[i] * y[i] * (1 - y[i])',
            'sigmoid_bwd')(gx, self.y, gy[0])
    return gx,
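# --- Reference sketch (not part of the original module) ---------------------
# The fallback kernel above uses the saved output y = sigmoid(x), for which
# dy/dx = y * (1 - y). NumPy equivalent (illustrative name):
def sigmoid_backward_reference(y, gy):
    return gy * y * (1 - y)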
def forward_gpu(self, x):
    n, out_c, out_h, out_w = x[0].shape
    c = self.W.shape[1]
    h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
    w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
        y = cuda.empty((n, c, h, w), dtype=numpy.float32)
        y_desc = cudnn.get_tensor_desc(y, h, w)

        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            0, y_desc.value, cudnn.get_ptr(y))
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        handle = cuda.get_cublas_handle()
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w),
                          dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, x_mats[i], transa='T', handle=handle,
                              out=gcol_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, c, h * w)
    return y,
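# --- Reference sketch (not part of the original module) ---------------------
# The fallback branch above computes a transposed convolution as
# col2im(W^T . x). A naive NumPy sketch that scatters each input position
# through the kernel; it assumes get_deconv_outsize(s, k, st, p) equals
# st * (s - 1) + k - 2 * p and that W has shape (out_c, c, kh, kw), matching
# the reshapes above. Names and signature are illustrative.
import numpy


def deconv2d_forward_reference(x, W, b, sy, sx, ph, pw):
    n, out_c, out_h, out_w = x.shape
    _, c, kh, kw = W.shape
    h = sy * (out_h - 1) + kh - 2 * ph
    w = sx * (out_w - 1) + kw - 2 * pw
    y = numpy.zeros((n, c, h + 2 * ph, w + 2 * pw), dtype=x.dtype)
    for i in range(out_h):
        for j in range(out_w):
            # each (i, j) input pixel adds a kh x kw patch to the output
            patch = numpy.tensordot(x[:, :, i, j], W, axes=(1, 0))
            y[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw] += patch
    y = y[:, :, ph:ph + h, pw:pw + w]
    if b is not None:
        y += b.reshape(1, -1, 1, 1)
    return y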
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
        c = x[0].shape[1]
        cuda.elementwise(
            'float* maxes, const float* x, int c',
            '''
               const float* row = x + i * c;
               float maxval = row[0];
               for (int j = 1; j < c; ++j) {
                 if (maxval < row[j]) {
                   maxval = row[j];
                 }
               }
               maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c)
        cuda.elementwise(
            'float* y, const float* x, const float* maxes, int c',
            'y[i] = __expf(x[i] - maxes[i / c])',
            'softmax_exp')(y, x[0], maxes, c)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int c',
            '''
               const float* row = y + i * c;
               float sum = 0;
               for (int j = 0; j < c; ++j) {
                 sum += row[j];
               }
               coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c)
        cuda.elementwise(
            'float* y, const float* coeff, int c', 'y[i] *= coeff[i / c]',
            'softmax_rowmul')(y, coeff, c)
        self.y = y
    return y,
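# --- Reference sketch (not part of the original module) ---------------------
# The four kernels above implement the usual numerically stable softmax:
# subtract the row maximum, exponentiate, and normalize by the row sum. NumPy
# equivalent (illustrative name; axis=1 also matches the channel-wise variant
# further below):
import numpy


def softmax_forward_reference(x):
    y = numpy.exp(x - x.max(axis=1, keepdims=True))
    return y / y.sum(axis=1, keepdims=True)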
def forward_gpu(self, x):
    # Implementation using cudnn
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph,
                                self.cover_all)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw,
                                self.cover_all)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)

    handle = cudnn.get_default_handle()
    pool_desc = self.create_pool_desc()
    x_desc = cudnn.get_tensor_desc(x[0], x[0].shape[2], x[0].shape[3])
    y_desc = cudnn.get_tensor_desc(y, y_h, y_w)

    libcudnn.cudnnPoolingForward(
        handle, pool_desc.value, 1, x_desc.value, cudnn.get_ptr(x[0]),
        0, y_desc.value, cudnn.get_ptr(y))
    self.y = y
    return y,
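# --- Reference sketch (not part of the original module) ---------------------
# cudnnPoolingForward with a MAX pooling descriptor takes the maximum over
# each kh x kw window. Naive NumPy sketch assuming cover_all=False, i.e.
# get_conv_outsize(s, k, st, p) == (s + 2 * p - k) // st + 1; padded values
# are treated as -inf so they never win. Names and signature are illustrative.
import numpy


def max_pool2d_forward_reference(x, kh, kw, sy, sx, ph, pw):
    n, c, h, w = x.shape
    y_h = (h + 2 * ph - kh) // sy + 1
    y_w = (w + 2 * pw - kw) // sx + 1
    xp = numpy.full((n, c, h + 2 * ph, w + 2 * pw), -numpy.inf, dtype=x.dtype)
    xp[:, :, ph:ph + h, pw:pw + w] = x
    y = numpy.empty((n, c, y_h, y_w), dtype=x.dtype)
    for i in range(y_h):
        for j in range(y_w):
            window = xp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw]
            y[:, :, i, j] = window.max(axis=(2, 3))
    return y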
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            0, x_desc.value, cudnn.get_ptr(gx))
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                self.gb += tmp

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T', handle=handle,
                              out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    return gx,
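# --- Reference sketch (not part of the original module) ---------------------
# Naive NumPy version of the three gradients computed above (bias, filter,
# input) for a 2D convolution with filters W of shape (out_c, c, kh, kw).
# It mirrors the im2col-based fallback but with explicit loops; names and
# signature are illustrative.
import numpy


def conv2d_backward_reference(x, W, gy, sy, sx, ph, pw):
    n, c, h, w = x.shape
    kh, kw = W.shape[2], W.shape[3]
    out_h, out_w = gy.shape[2], gy.shape[3]
    xp = numpy.zeros((n, c, h + 2 * ph, w + 2 * pw), dtype=x.dtype)
    xp[:, :, ph:ph + h, pw:pw + w] = x
    gxp = numpy.zeros_like(xp)
    gW = numpy.zeros_like(W)
    gb = gy.sum(axis=(0, 2, 3))                       # bias gradient
    for i in range(out_h):
        for j in range(out_w):
            patch = xp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw]
            g = gy[:, :, i, j]                        # (n, out_c)
            gW += numpy.tensordot(g, patch, axes=(0, 0))
            gxp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw] += (
                numpy.tensordot(g, W, axes=(1, 0)))
    return gxp[:, :, ph:ph + h, pw:pw + w], gW, gb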
def backward_gpu(self, x, gy):
    n_unit = int(numpy.prod(x[0].shape[2:]))
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], n_unit, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value,
            cudnn.get_ptr(self.y), desc.value, cudnn.get_ptr(gy[0]),
            0, desc.value, cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]
        c = gx.shape[1]
        sum_ydy_shape = (gx.shape[0],) + gx.shape[2:]
        sum_ydy = cuda.empty(sum_ydy_shape, dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int n_channel, int n_unit',
            '''
               const int n = i / n_unit;
               const int m = i % n_unit;
               const float* row = ydy + n * n_channel * n_unit + m;
               float sum = 0;
               for (int c = 0; c < n_channel; ++c) {
                 sum += row[c * n_unit];
               }
               sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c, n_unit)
        cuda.elementwise(
            '''
               float* gx, const float* y, const float* sum_ydy,
               int n_channel, int n_unit
            ''',
            '''
               const int n = i / (n_channel * n_unit);
               const int m = i % n_unit;
               gx[i] -= y[i] * sum_ydy[n * n_unit + m];
            ''', 'softmax_bwd_diff')(gx, self.y, sum_ydy, c, n_unit)
    return gx,
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(self.y, 1, 1)
        libcudnn.cudnnActivationBackward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(self.y),
            desc.value, cudnn.get_ptr(gy[0]), desc.value,
            cudnn.get_ptr(x[0]), 0, desc.value, cudnn.get_ptr(gx))
    else:
        cuda.elementwise(
            'float* gx, const float* x, const float* gy',
            'gx[i] = x[i] > 0 ? gy[i] : 0',
            'relu_bwd')(gx, x[0], gy[0])
    return gx,
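# --- Reference sketch (not part of the original module) ---------------------
# The fallback kernel above passes the gradient through where the input was
# positive and zeroes it elsewhere. NumPy equivalent (illustrative name):
import numpy


def relu_backward_reference(x, gy):
    return numpy.where(x > 0, gy, 0)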
def backward_gpu(self, x, gy):
    n, out_c, out_h, out_w = x[0].shape
    c, h, w = gy[0].shape[1:]
    gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
        gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, gx_desc.value, cudnn.get_ptr(gx))

        # bias backward
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        # filter backward
        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gx_mats = gx.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                              out=gx_mats[i])

        # bias backward
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * c, h * w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                self.gb += tmp

        # filter backward
        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
    return gx,
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    n_unit = int(numpy.prod(x[0].shape[2:]))
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], n_unit, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes_shape = (x[0].shape[0],) + x[0].shape[2:]
        maxes = cuda.empty(maxes_shape, dtype=numpy.float32)
        c = x[0].shape[1]
        cuda.elementwise(
            'float* maxes, const float* x, int n_channel, int n_unit',
            '''
               const int n = i / n_unit;
               const int m = i % n_unit;
               const float* row = x + n * n_channel * n_unit + m;
               float maxval = row[0];
               for (int c = 1; c < n_channel; ++c) {
                 const int v = c * n_unit;
                 if (maxval < row[v]) {
                   maxval = row[v];
                 }
               }
               maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c, n_unit)
        cuda.elementwise(
            '''
               float* y, const float* x, const float* maxes,
               int n_channel, int n_unit
            ''',
            '''
               const int n = i / (n_channel * n_unit);
               const int m = i % n_unit;
               y[i] = __expf(x[i] - maxes[n * n_unit + m]);
            ''', 'softmax_exp')(y, x[0], maxes, c, n_unit)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int n_channel, int n_unit',
            '''
               const int n = i / n_unit;
               const int m = i % n_unit;
               const float* row = y + n * n_channel * n_unit + m;
               float sum = 0;
               for (int c = 0; c < n_channel; ++c) {
                 sum += row[c * n_unit];
               }
               coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c, n_unit)
        cuda.elementwise(
            'float* y, const float* coeff, int n_channel, int n_unit',
            '''
               const int n = i / (n_channel * n_unit);
               const int m = i % n_unit;
               y[i] *= coeff[n * n_unit + m];
            ''', 'softmax_rowmul')(y, coeff, c, n_unit)
        self.y = y
    return y,
def forward_gpu(self, x):
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]
    y = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        y_desc = cudnn.get_tensor_desc(y, out_h, out_w)

        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, y_desc.value, cudnn.get_ptr(y))

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        # Implementation using im2col
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                              out=y_mats[i])

        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, out_c, out_h * out_w)
    return y,
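# --- Reference sketch (not part of the original module) ---------------------
# Naive NumPy version of the convolution computed above (the same shape of
# computation appears in the deconvolution backward_gpu further up, which
# runs a forward convolution over gy). Assumes get_conv_outsize(s, k, st, p)
# equals (s + 2 * p - k) // st + 1 and W of shape (out_c, c, kh, kw). Names
# and signature are illustrative.
import numpy


def conv2d_forward_reference(x, W, b, sy, sx, ph, pw):
    n, c, h, w = x.shape
    out_c, _, kh, kw = W.shape
    out_h = (h + 2 * ph - kh) // sy + 1
    out_w = (w + 2 * pw - kw) // sx + 1
    xp = numpy.zeros((n, c, h + 2 * ph, w + 2 * pw), dtype=x.dtype)
    xp[:, :, ph:ph + h, pw:pw + w] = x
    y = numpy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
    for i in range(out_h):
        for j in range(out_w):
            patch = xp[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw]
            y[:, :, i, j] = numpy.tensordot(
                patch, W, axes=([1, 2, 3], [1, 2, 3]))
    if b is not None:
        y += b.reshape(1, -1, 1, 1)
    return y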