def forward_gpu(self, inputs):
    """GRU forward pass on GPU using the custom `cuk` kernel helpers.

    inputs is (x, h_tm1): current input batch and previous hidden state.
    Caches u, r, HV, h_tilde and h on self for the backward pass.
    Returns a 1-tuple containing the new hidden state h.
    """
    x, h_tm1 = inputs
    N = x.shape[0]  # batch size
    # update gate: u = sigmoid(x.Wu^T + h_tm1.Vu^T [+ bu])
    u = cuda.empty((N, self.out_size), dtype=np.float32)
    cuk.dot(x, self.Wu, out=u, transb='t')
    cuk.dotAdd(h_tm1, self.Vu, C=u, transb='t')
    # reset gate: r = sigmoid(x.Wr^T + h_tm1.Vr^T [+ br])
    r = cuda.empty((N, self.out_size), dtype=np.float32)
    cuk.dot(x, self.Wr, out=r, transb='t')
    cuk.dotAdd(h_tm1, self.Vr, C=r, transb='t')
    if not self.nobias:
        cuk.addVec2Mat(u, self.bu)
        cuk.addVec2Mat(r, self.br)
    # sigmoid applied in place; results also cached for backward
    self.u = cuk.sigmoid(x=u, out=u)
    self.r = cuk.sigmoid(x=r, out=r)
    # new memory: h_tilde = tanh(r * (h_tm1.Vh^T) + x.Wh^T [+ bh])
    HV = cuda.empty((N, self.out_size), dtype=np.float32)
    self.HV = cuk.dot(h_tm1, self.Vh, out=HV, transb='t')
    h_tilde = cuda.empty((N, self.out_size), dtype=np.float32)
    h_tilde = cuk.hadamard(r, self.HV, out=h_tilde)
    cuk.dotAdd(x, self.Wh, C=h_tilde, transb='t')
    if not self.nobias:
        cuk.addVec2Mat(h_tilde, self.bh)
    self.h_tilde = cuk.tanh(x=h_tilde, out=h_tilde)
    # hidden state: h = u * h_tm1 + (1 - u) * h_tilde
    # NOTE(review): exact blend convention is inside cuk.gru_forward — confirm there.
    h = cuda.empty((N, self.out_size), dtype=np.float32)
    self.h = cuk.gru_forward(u=u, h_tilde=h_tilde, h_tm1=h_tm1, out=h)
    return self.h,
def forward_gpu(self, inputs):
    """Linear layer + softmax (+ optional NLL loss) forward pass on GPU.

    inputs is (x, targets). Returns (probs,) when self.return_probs is
    set, otherwise a 1-tuple with the scalar loss (NaN placeholder when
    self.compute_loss is false).
    """
    x, targets = inputs
    N = x.shape[0]  # batch size
    # Linear function: z = x.W^T [+ b]
    z = cuda.empty((N, self.no_labels), dtype=np.float32)
    cuk.dot(x, self.W, out=z, transb='t')
    if not self.nobias:
        cuk.addVec2Mat(z, self.b)
    # NOTE(review): self.probs aliases z, so the cudnn path below runs
    # softmax with identical src/dst pointers (in-place) — confirm cudnn
    # supports this for the chosen algorithm.
    self.probs = z
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(z, 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(z),
            0, desc.value, cudnn.get_ptr(self.probs))
    else:
        cuk.softmax(z, self.probs)
    if self.return_probs:
        return self.probs,
    if self.compute_loss:
        # Mean negative log-likelihood of the target class probabilities.
        correct_probs = cuda.empty((N,), dtype=np.float32)
        cuk.getByIndex_LogAndClip(
            self.probs, targets, out=correct_probs)
        loss = -cuda.cumisc.sum(correct_probs, keepdims=True) / N
    else:
        # Loss not requested: return a NaN sentinel of matching shape.
        loss = np.atleast_2d(np.array(np.nan, dtype=np.float32))
    return loss,
def forward_gpu(self, x):
    """2D convolution forward pass on GPU (cuDNN when available).

    Falls back to an im2col + per-sample matrix multiply implementation.
    Returns the output feature map as a 1-tuple.
    """
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]  # number of output channels
    y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        y_desc = cudnn.create_tensor_descriptor(y)
        # Descriptors are kept on self so backward can reuse them.
        self.filter_desc = cudnn.create_filter_descriptor(self.W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                self.b[None, :, None, None])
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo)
        # Workspace is allocated in dtype units; // 4 assumes 4-byte floats.
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=self.dtype)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        libcudnn.convolutionForward(
            handle, one, x_desc.value, x[0].data.ptr,
            self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero, y_desc.value,
            y.data.ptr)
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.addTensor(
                handle, libcudnn.CUDNN_ADD_SAME_C, one,
                self.bias_desc.value, self.b.data.ptr, one, y_desc.value,
                y.data.ptr)
    else:
        # Implementation using im2col; self.col is cached for backward.
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            y += self.b.reshape((1, out_c, 1, 1))
    return y,
def forward_gpu(self, x):
    """2D convolution forward pass on GPU (cuDNN when available).

    Identical in structure to the cuDNN-or-im2col convolution above.
    Returns the output feature map as a 1-tuple.
    """
    n, c, h, w = x[0].shape
    out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    out_c = self.W.shape[0]  # number of output channels
    y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        y_desc = cudnn.create_tensor_descriptor(y)
        # Kept on self so the backward pass can reuse the descriptors.
        self.filter_desc = cudnn.create_filter_descriptor(self.W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                self.b[None, :, None, None])
        algo = libcudnn.getConvolutionForwardAlgorithm(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
            handle, x_desc.value, self.filter_desc.value,
            self.conv_desc.value, y_desc.value, algo)
        # workspace_size is in bytes; // 4 assumes a 4-byte dtype.
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=self.dtype)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        libcudnn.convolutionForward(
            handle, one, x_desc.value, x[0].data.ptr,
            self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero, y_desc.value,
            y.data.ptr)
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            libcudnn.addTensor(
                handle, libcudnn.CUDNN_ADD_SAME_C, one,
                self.bias_desc.value, self.b.data.ptr, one, y_desc.value,
                y.data.ptr)
    else:
        # Implementation using im2col; self.col is cached for backward.
        self.col = conv.im2col_gpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        y_mats = y.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            y_mats[i] = W_mat.dot(col_mats[i])
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            y += self.b.reshape((1, out_c, 1, 1))
    return y,
def backward_gpu(self, inputs, grad_outputs):
    """Backward pass: gradients of the covariance-based objective.

    Uses the centered matrices and covariance cached by the forward pass.
    NOTE(review): grad_outputs is accepted but never used — upstream
    gradient is effectively assumed to be 1 (the sibling variant of this
    method multiplies by gcost); confirm this function is only used as a
    terminal loss.
    NOTE(review): cuda.empty is called without an explicit dtype here —
    verify the default matches self.covariance's dtype.
    """
    y, z = inputs
    N = y.shape[0]  # batch size; gradients are averaged over it
    gy = cuda.empty(y.shape)
    gz = cuda.empty(z.shape)
    # gy = (1/N) * z_centered . covariance^T  (beta=0 overwrites gy)
    cuda.culinalg.add_dot(self.z_centered, self.covariance, gy, transb="T",
                          alpha=1.0 / N, beta=0.0)
    # gz = (1/N) * y_centered . covariance
    cuda.culinalg.add_dot(self.y_centered, self.covariance, gz,
                          alpha=1.0 / N, beta=0.0)
    return gy, gz
def backward_gpu(self, x, gy):
    """Backpropagate through the batched matrix product on GPU.

    Computes the gradients w.r.t. both operands with the transpose flags
    adjusted to undo the forward configuration, then restores the
    operands' original shapes.
    """
    a, b = x
    g = gy[0]
    n = a.shape[0]
    # Gradient w.r.t. the first operand.
    ga = cuda.empty((n,) + _as_mat(a[0]).shape)
    _batch_matmul_gpu(
        g, b, transb=not self.transb, transout=self.transa, out=ga)
    # Gradient w.r.t. the second operand.
    gb = cuda.empty((n,) + _as_mat(b[0]).shape)
    _batch_matmul_gpu(
        a, g, transa=not self.transa, transout=self.transb, out=gb)
    return ga.reshape(a.shape), gb.reshape(b.shape)
def backward_gpu(self, inputs, grad_outputs):
    """Backward pass: covariance-objective gradients scaled by gcost.

    Unlike the sibling variant, this one multiplies both gradients by the
    incoming scalar gradient gcost, so it composes correctly in a graph.
    NOTE(review): cuda.empty is called without an explicit dtype — verify
    the default matches self.covariance's dtype.
    """
    y, z = inputs
    gcost, = grad_outputs  # upstream gradient of the scalar cost
    N = y.shape[0]  # batch size; gradients averaged over it
    gy = cuda.empty(y.shape)
    gz = cuda.empty(z.shape)
    # gy = (1/N) * z_centered . covariance^T  (beta=0 overwrites gy)
    cuda.culinalg.add_dot(self.z_centered, self.covariance, gy,
                          transb='T', alpha=1. / N, beta=0.)
    # gz = (1/N) * y_centered . covariance
    cuda.culinalg.add_dot(self.y_centered, self.covariance, gz,
                          alpha=1. / N, beta=0.)
    # Chain rule: scale by the upstream gradient.
    gy = cuda.cumisc.multiply(gy, gcost)
    gz = cuda.cumisc.multiply(gz, gcost)
    return gy, gz
def forward_gpu(self, x):
    """Max pooling forward on GPU with a custom elementwise kernel.

    Delegates to the cuDNN base implementation when enabled; otherwise
    computes the per-window max and records the argmax offsets in
    self.indexes for use by the backward pass.
    """
    if cuda.cudnn_enabled and self.use_cudnn:
        return super(MaxPooling2D, self).forward_gpu(x)
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(
        h, self.kh, self.sy, self.ph, self.cover_all)
    y_w = conv.get_conv_outsize(
        w, self.kw, self.sx, self.pw, self.cover_all)
    y = cuda.empty((n, c, y_h, y_w), dtype=x[0].dtype)
    # indexes stores, per output pixel, the argmax position encoded
    # inside its pooling window (kx + kw * ky).
    self.indexes = cuda.empty((n, c, y_h, y_w), dtype=numpy.int32)
    cuda.elementwise(
        'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T out, S indexes',
        '''
        int c0 = i / (out_h * out_w);
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y_0 = max(0, out_y * sy - ph);
        int in_y_1 = min(h, out_y * sy + kh - ph);
        int in_x_0 = max(0, out_x * sx - pw);
        int in_x_1 = min(w, out_x * sx + kw - pw);

        float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
        int argmax_y = in_y_0;
        int argmax_x = in_x_0;
        for (int y = in_y_0; y < in_y_1; ++y) {
          int offset_y = w * (y + h * c0);
          for (int x = in_x_0; x < in_x_1; ++x) {
            float v = in[x + offset_y];
            if (maxval < v) {
              maxval = v;
              argmax_y = y;
              argmax_x = x;
            }
          }
        }
        out = maxval;

        int argmax_ky = argmax_y + ph - out_y * sy;
        int argmax_kx = argmax_x + pw - out_x * sx;
        indexes = argmax_kx + kw * argmax_ky;
        ''', 'max_pool_fwd')(x[0].reduced_view(),
                             h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw,
                             y, self.indexes)
    return y,
def forward_gpu(self, inputs):
    """Deconvolution (transposed convolution) forward pass on GPU.

    inputs is (x, W) or (x, W, b). Uses cuDNN's convolutionBackwardData
    when available; otherwise multiplies W^T into each sample and folds
    the resulting columns back into an image with col2im.
    """
    x, W = inputs[:2]
    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    # Output size is computed lazily so callers may pre-set outh/outw.
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
    if len(inputs) == 3:
        b = inputs[2]
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.empty((n, c, self.outh, self.outw),
                       dtype=numpy.float32)
        y_desc = cudnn.create_tensor_descriptor(y)
        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if len(inputs) == 3:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])
        # Scalars passed by pointer, in the input's dtype.
        one = numpy.array(1, dtype=x.dtype).ctypes
        zero = numpy.array(0, dtype=x.dtype).ctypes
        # Deconvolution == data-gradient of a convolution.
        libcudnn.convolutionBackwardData(
            handle, one.data, self.filter_desc.value, W.data.ptr,
            x_desc.value, x.data.ptr, self.conv_desc.value,
            zero.data, y_desc.value, y.data.ptr)
        if len(inputs) == 3:
            libcudnn.addTensor(
                handle, libcudnn.CUDNN_ADD_SAME_C, one.data,
                self.bias_desc.value, b.data.ptr, one.data,
                y_desc.value, y.data.ptr)
    else:
        W_mat = W.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        gcol = cuda.empty((n, c, kh, kw, in_h, in_w),
                          dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w)
        for i in moves.range(n):
            # Per-sample W^T . x, written directly into the column buffer.
            cuda.cupy.dot(W_mat.T, x_mats[i], gcol_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw,
            self.outh, self.outw)
        if len(inputs) == 3:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def forward_gpu(self, x):
    """Max pooling forward on GPU (pycuda-style elementwise kernel).

    Delegates to the cuDNN base implementation when enabled; otherwise
    computes the per-window max and records argmax offsets in
    self.indexes for the backward pass.
    """
    if cudnn.enabled and self.use_cudnn:
        return super(MaxPooling2D, self).forward_gpu(x)
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph,
                                self.cover_all)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw,
                                self.cover_all)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    # Per-output argmax position encoded within the window (kx + kw*ky).
    self.indexes = cuda.empty((n, c, y_h, y_w), dtype=numpy.int32)
    cuda.elementwise(
        '''
           float* out, int* indexes, const float* in,
           int h, int w, int out_h, int out_w,
           int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0    = i / (out_h * out_w);
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;
           int in_y_0 = max(0, out_y * sy - ph);
           int in_y_1 = min(h, out_y * sy + kh - ph);
           int in_x_0 = max(0, out_x * sx - pw);
           int in_x_1 = min(w, out_x * sx + kw - pw);

           float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
           int argmax_y = in_y_0;
           int argmax_x = in_x_0;
           for (int y = in_y_0; y < in_y_1; ++y) {
             int offset_y = w * (y + h * c0);
             for (int x = in_x_0; x < in_x_1; ++x) {
               float v = in[x + offset_y];
               if (maxval < v) {
                 maxval   = v;
                 argmax_y = y;
                 argmax_x = x;
               }
             }
           }
           out[i] = maxval;

           int argmax_ky = argmax_y + ph - out_y * sy;
           int argmax_kx = argmax_x + pw - out_x * sx;
           indexes[i] = argmax_kx + kw * argmax_ky;
        ''', 'max_pool_fwd')(y, self.indexes, x[0], h, w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw)
    return y,
def forward_gpu(self, x):
    """Bilinear function forward on GPU: y = e1 W e2 (+ V1 e1 + V2 e2 + b).

    Computes the row-wise outer product of the two inputs with a custom
    kernel, then contracts it with the flattened weight tensor.
    """
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = self.W.shape[2]
    # When indices are enclosed with [], they are 'flatten'
    # (i.e. linealized as 1-D array)
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
    # '[ij],[ik]->[ijk]'
    cuda.elementwise(
        'float* y, float* e1, float* e2, int e1c, int e2c',
        '''
        int I = i / e1c / e2c;
        int J = (i - I * e1c * e2c) / e2c;
        int K = i % e2c;
        y[i] = e1[I * e1c + J] * e2[I * e2c + K];
        ''',
        'row_wise_outer_product')(
            e1e2, e1, e2, j_len, k_len)
    # [ijk]->i[jk]
    e1e2 = e1e2.reshape(i_len, j_len * k_len)
    # jkl->[jk]l
    W_mat = self.W.reshape(
        self.W.shape[0] * self.W.shape[1], self.W.shape[2])
    y = cuda.empty((i_len, l_len), dtype=numpy.float32)
    with cuda.using_cumisc():
        # 'i[jk],[jk]l->il'
        cuda.culinalg.dot(e1e2, W_mat, out=y)
    if not self.nobias:
        # Linear terms and bias are added in place onto y.
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,jl->il
            cuda.culinalg.add_dot(e1, self.V1, y)
            # ik,kl->il
            cuda.culinalg.add_dot(e2, self.V2, y)
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(y, self.b, self.b.size)
    return y,
def forward_gpu(self, inputs):
    """Binary hierarchical softmax forward on GPU (pycuda-style kernels).

    inputs is (x, t). Pads every sample's tree path to the batch maximum
    path length, evaluates the per-node logistic losses in one kernel
    launch, and returns their sum as a 1-tuple. Caches wxy and
    max_length for the backward pass.
    """
    x, t = inputs
    # Longest tree path among the targets in this batch.
    max_length = cuda.reduce(
        'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
        'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
        numpy.int32
    )(t, self.begins)
    max_length = cuda.to_cpu(max_length)[()]
    length = max_length * x.shape[0]
    ls = cuda.empty((length,), dtype=numpy.float32)
    n_in = x.shape[1]
    wxy = cuda.empty((length,), dtype=numpy.float32)
    # One thread per (sample, path-position); positions beyond the
    # sample's actual path length contribute zero loss.
    cuda.elementwise(
        '''float* ls, float* wxy, const float* x, const float* w,
        const int* ts, const int* paths, const float* codes,
        const int* begins, int c, int max_length''',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];

          x = &x[ind * c];

          float wx = 0;
          for (int j = 0; j < c; ++j) {
            wx += w[node * c + j] * x[j];
          }
          wxy[i] = wx * codes[p];
          ls[i] = log(1 + exp(-wxy[i]));
        } else {
          ls[i] = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward'
    )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
      n_in, max_length)
    self.max_length = max_length
    self.wxy = wxy  # cached for backward
    return cuda.gpuarray.sum(ls),
def forward_gpu(self, inputs):
    """Binary hierarchical softmax forward on GPU.

    Duplicate of the pycuda-style hierarchical softmax forward: computes
    the summed per-node logistic loss over each target's tree path and
    caches wxy / max_length for the backward pass.
    """
    x, t = inputs
    # Longest tree path among the targets in this batch.
    max_length = cuda.reduce(
        'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
        'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
        numpy.int32
    )(t, self.begins)
    max_length = cuda.to_cpu(max_length)[()]
    length = max_length * x.shape[0]
    ls = cuda.empty((length,), dtype=numpy.float32)
    n_in = x.shape[1]
    wxy = cuda.empty((length,), dtype=numpy.float32)
    # One thread per (sample, path-position); unused slots write 0 loss.
    cuda.elementwise(
        '''float* ls, float* wxy, const float* x, const float* w,
        const int* ts, const int* paths, const float* codes,
        const int* begins, int c, int max_length''',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];

          x = &x[ind * c];

          float wx = 0;
          for (int j = 0; j < c; ++j) {
            wx += w[node * c + j] * x[j];
          }
          wxy[i] = wx * codes[p];
          ls[i] = log(1 + exp(-wxy[i]));
        } else {
          ls[i] = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward'
    )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
      n_in, max_length)
    self.max_length = max_length
    self.wxy = wxy  # cached for backward
    return cuda.gpuarray.sum(ls),
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    """Fold an im2col column tensor back into an image on GPU.

    col has shape (n, c, kh, kw, out_h, out_w); overlapping patch
    contributions are summed into the (n, c, h, w) output image.
    """
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    # One thread per output pixel; it accumulates every column entry
    # whose sliding window covers that pixel.
    cuda.elementwise(
        '''
           float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               val += col[out_x + out_w * (out_y + out_h *
                          (kx + kw * (ky + kh * c0)))];
             }
           }
           img[i] = val;
        ''', 'col2im')(img, col, h, w, out_h, out_w,
                       kh, kw, sy, sx, ph, pw)
    return img
def forward_gpu(self, x):
    """Average pooling forward on GPU (cupy-style elementwise kernel).

    Delegates to the cuDNN base implementation when enabled. The average
    divisor is always kh*kw, so padded-out positions count as zeros.
    """
    if cuda.cudnn_enabled and self.use_cudnn:
        return super(AveragePooling2D, self).forward_gpu(x)
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    coeff = 1. / (self.kh * self.kw)  # fixed divisor, even at borders
    cuda.elementwise(
        'raw T in, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
        'T out',
        '''
        int c0 = i / (out_h * out_w);
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y_0 = max(0, out_y * sy - ph);
        int in_y_1 = min(h, out_y * sy + kh - ph);
        int in_x_0 = max(0, out_x * sx - pw);
        int in_x_1 = min(w, out_x * sx + kw - pw);

        float val = 0;
        for (int y = in_y_0; y < in_y_1; ++y) {
          int offset_y = w * (y + h * c0);
          for (int x = in_x_0; x < in_x_1; ++x) {
            val += in[x + offset_y];
          }
        }
        out = val * coeff;
        ''', 'avg_pool_fwd')(x[0].reduced_view(),
                             h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw,
                             coeff, y)
    return y,
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    """Expand image patches into columns on GPU (cupy-style kernel).

    Returns a (n, c, kh, kw, out_h, out_w) tensor; out-of-bounds
    (padded) positions are filled with zeros.
    """
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    # One thread per column element; decodes its (c, ky, kx, oy, ox)
    # coordinates from the flat index i.
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T col',
        '''
        int c0 = i / (kh * kw * out_h * out_w);
        int ky = i / (kw * out_h * out_w) % kh;
        int kx = i / (out_h * out_w) % kw;
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;

        int in_y = ky + out_y * sy - ph;
        int in_x = kx + out_x * sx - pw;
        if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
          col = img[in_x + w * (in_y + h * c0)];
        } else {
          col = 0;
        }
        ''', 'im2col')(img.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, col)
    return col
def hotdot(a, indices, out=None, dont_add=False):
    """Multiply a K-hot encoded matrix x (given by `indices`) with a^T.

    Args:
        a: pycuda gpuarray of shape (H, D).
        indices: (N, K) hot indices describing a K-hot encoded matrix x.
        out: optional (N, H) output buffer; allocated when None.
        dont_add: when true, overwrite `out` instead of accumulating.

    Returns:
        out = x.dot(a.T), where x is the K-hot encoded matrix.

    NOTE(review): B appears to be an "overwrite" flag passed to the
    kernels; it is forced to 1 whenever `out` is freshly allocated
    (uninitialized memory must not be accumulated into) — confirm
    against the HotDot kernel sources.
    """
    H, D = a.shape
    N, K = indices.shape
    # Pick a launch configuration based on problem shape.
    if N == 1:
        bdim, gdim = Get_bdim_and_gdimRowVec(H)
    elif H >= (N*4):
        bdim, gdim = Get_bdim_and_gdimSmallNBigM(N,H)
    else:
        bdim, gdim = Get_bdim_and_gdim2D(N,H)
    if dont_add:
        B = np.int32(1)
    else:
        B = np.int32(0)
    if out is None:
        out = cuda.empty((N,H), dtype=np.float32)
        B = np.int32(1)  # fresh buffer: must overwrite, not accumulate
    # Separate kernels for the general K-hot and the 1-hot fast path.
    if K > 1:
        HotDot1_kernel.prepared_call(
            gdim, bdim, a.gpudata, out.gpudata, indices.gpudata,
            np.int32(K), np.int32(N), np.int32(H), np.int32(D),
            np.int32(B))
    else:
        HotDot2_kernel.prepared_call(
            gdim, bdim, a.gpudata, out.gpudata, indices.gpudata,
            np.int32(N), np.int32(H), np.int32(D), np.int32(B))
    return out
def forward_gpu(self, x):
    """Average pooling forward on GPU (pycuda-style elementwise kernel).

    Delegates to the cuDNN base implementation when enabled. The average
    divisor is always kh*kw, so padded-out positions count as zeros.
    """
    if cudnn.enabled and self.use_cudnn:
        return super(AveragePooling2D, self).forward_gpu(x)
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    coeff = 1. / (self.kh * self.kw)  # fixed divisor, even at borders
    cuda.elementwise(
        '''
           float* out, const float* in,
           int h, int w, int out_h, int out_w,
           int kh, int kw, int sy, int sx, int ph, int pw,
           float coeff
        ''', '''
           int c0    = i / (out_h * out_w);
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;
           int in_y_0 = max(0, out_y * sy - ph);
           int in_y_1 = min(h, out_y * sy + kh - ph);
           int in_x_0 = max(0, out_x * sx - pw);
           int in_x_1 = min(w, out_x * sx + kw - pw);

           float val = 0;
           for (int y = in_y_0; y < in_y_1; ++y) {
             int offset_y = w * (y + h * c0);
             for (int x = in_x_0; x < in_x_1; ++x) {
               val += in[x + offset_y];
             }
           }
           out[i] = val * coeff;
        ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, coeff)
    return y,
def backward_gpu(self, x, gy):
    """Softmax backward on GPU: gx = y * gy - y * sum(y * gy, axis=1).

    Uses cuDNN's softmax backward when enabled; otherwise computes the
    row sums of y*gy with a custom kernel and subtracts.
    """
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(handle, _algorithm, _mode, 1,
                                      desc.value, cudnn.get_ptr(self.y),
                                      desc.value, cudnn.get_ptr(gy[0]), 0,
                                      desc.value, cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]  # elementwise y * dy
        c = gx.shape[1]
        # Row-wise sum of y*dy; one thread per row.
        sum_ydy = cuda.empty((gx.shape[0], ), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
            const float* row = ydy + i * c;
            float sum = 0;
            for (int j = 0; j < c; ++j) {
              sum += row[j];
            }
            sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        # gx[i] -= y[i] * rowsum, completing the softmax Jacobian product.
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    """Expand image patches into columns on GPU (pycuda-style kernel).

    Returns a (n, c, kh, kw, out_h, out_w) tensor; out-of-bounds
    (padded) positions are filled with zeros.
    """
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    # One thread per column element; decodes (c, ky, kx, oy, ox) from i.
    cuda.elementwise(
        '''
           float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col[i] = img[in_x + w * (in_y + h * c0)];
           } else {
             col[i] = 0;
           }
        ''', 'im2col')(col, img, h, w, out_h, out_w,
                       kh, kw, sy, sx, ph, pw)
    return col
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    """Fold an im2col column tensor back into an image on GPU (cupy).

    col has shape (n, c, kh, kw, out_h, out_w); overlapping patch
    contributions are summed into the (n, c, h, w) output image.
    """
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    # One thread per output pixel; it accumulates every column entry
    # whose sliding window covers that pixel.
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T img',
        '''
        int c0 = i / (h * w);
        int y  = i / w % h + ph;
        int x  = i % w + pw;

        int out_y_0 = max(0,     (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y      + sy) / sy);
        int out_x_0 = max(0,     (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x      + sx) / sx);

        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          int ky = y - out_y * sy;
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            int kx = x - out_x * sx;
            int k = out_y + out_h * (kx + kw * (ky + kh * c0));
            val += col[out_x + out_w * k];
          }
        }
        img = val;
        ''', 'col2im')(col.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, img)
    return img
def forward_gpu(self, x):
    """Split the input array along self.axis on GPU.

    self.indices_or_sections is either an iterable of split indices or
    an integer number of equal sections. Each output chunk is copied out
    with the shared `split_fwd` elementwise kernel.

    Returns:
        tuple of arrays, one per section.

    Raises:
        ValueError: if a section count does not evenly divide the axis.
    """
    xshape = x[0].shape
    self.cdimx = xshape[self.axis]
    # dtype=int: numpy.prod of an empty shape tail (axis == last axis)
    # otherwise returns float64 1.0, which would be passed as the int
    # rdim kernel argument. Matches the fixed variant of this method.
    self.rdim = numpy.prod(xshape[self.axis + 1:], dtype=int)
    indices = self.indices_or_sections
    if isinstance(indices, collections.Iterable):
        indices = list(indices)
        indices.append(xshape[self.axis])
    else:
        if xshape[self.axis] % indices:
            raise ValueError(
                'array split does not result in an equal division')
        # Equal sections: split points sec, 2*sec, ..., cdimx.
        indices = six.moves.range(
            indices, xshape[self.axis] + indices, indices)
    ys = []
    kernel = cuda.elementwise(
        _args, 'COPY(y[i] = x[idx])', 'split_fwd', preamble=_preamble)
    bi = 0  # start of the current chunk along the split axis
    for i in indices:
        i = min(i, xshape[self.axis])  # clamp out-of-range split points
        cdimy = max(0, i - bi)
        s = list(xshape)
        s[self.axis] = cdimy
        y = cuda.empty(s, dtype=x[0].dtype)
        if cdimy != 0:  # empty chunks need no copy
            kernel(y, x[0], cdimy, self.cdimx, self.rdim, bi)
        bi = i
        ys.append(y)
    return tuple(ys)
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    """Fold an im2col column tensor back into an image on GPU.

    Duplicate of the pycuda-style col2im: sums overlapping patch
    contributions from the (n, c, kh, kw, out_h, out_w) column tensor
    into an (n, c, h, w) image.
    """
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    # One thread per output pixel accumulates all covering windows.
    cuda.elementwise(
        '''
           float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               val += col[out_x + out_w * (out_y + out_h *
                          (kx + kw * (ky + kh * c0)))];
             }
           }
           img[i] = val;
        ''', 'col2im')(img, col, h, w, out_h, out_w,
                       kh, kw, sy, sx, ph, pw)
    return img
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    """Expand image patches into columns on GPU.

    Duplicate of the pycuda-style im2col: returns a
    (n, c, kh, kw, out_h, out_w) tensor with zero padding outside the
    image bounds.
    """
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    # One thread per column element; decodes (c, ky, kx, oy, ox) from i.
    cuda.elementwise(
        '''
           float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col[i] = img[in_x + w * (in_y + h * c0)];
           } else {
             col[i] = 0;
           }
        ''', 'im2col')(col, img, h, w, out_h, out_w,
                       kh, kw, sy, sx, ph, pw)
    return col
def forward_gpu(self, x):
    """Average pooling forward on GPU.

    Duplicate of the pycuda-style average pooling forward: cuDNN path
    when enabled, otherwise a custom kernel with a fixed kh*kw divisor
    (padded positions count as zeros).
    """
    if cudnn.enabled and self.use_cudnn:
        return super(AveragePooling2D, self).forward_gpu(x)
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
    y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    coeff = 1. / (self.kh * self.kw)  # fixed divisor, even at borders
    cuda.elementwise(
        '''
           float* out, const float* in,
           int h, int w, int out_h, int out_w,
           int kh, int kw, int sy, int sx, int ph, int pw,
           float coeff
        ''', '''
           int c0    = i / (out_h * out_w);
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;
           int in_y_0 = max(0, out_y * sy - ph);
           int in_y_1 = min(h, out_y * sy + kh - ph);
           int in_x_0 = max(0, out_x * sx - pw);
           int in_x_1 = min(w, out_x * sx + kw - pw);

           float val = 0;
           for (int y = in_y_0; y < in_y_1; ++y) {
             int offset_y = w * (y + h * c0);
             for (int x = in_x_0; x < in_x_1; ++x) {
               val += in[x + offset_y];
             }
           }
           out[i] = val * coeff;
        ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, coeff)
    return y,
def forward_gpu(self, x):
    """Split the input array along self.axis on GPU.

    self.indices_or_sections is either an iterable of split indices or
    an integer number of equal sections. Each chunk is copied out with
    the shared `split_fwd` elementwise kernel; empty chunks are rejected.

    Returns:
        tuple of arrays, one per section.

    Raises:
        ValueError: on uneven section division or a zero-sized chunk.
    """
    xshape = x[0].shape
    self.cdimx = xshape[self.axis]
    # dtype=int: numpy.prod of an empty shape tail (axis == last axis)
    # otherwise returns float64 1.0, which would be passed as the int
    # rdim kernel argument. Matches the fixed variant of this method.
    self.rdim = numpy.prod(xshape[self.axis + 1:], dtype=int)
    if isinstance(self.indices_or_sections, collections.Iterable):
        ind = list(self.indices_or_sections)
        ind.append(self.cdimx)  # final split point closes the last chunk
    else:
        sec = self.indices_or_sections
        if self.cdimx % sec:
            raise ValueError(
                'array split does not result in an equal division')
        # Equal sections: split points sec, 2*sec, ..., cdimx.
        ind = numpy.arange(1, sec + 1) * (self.cdimx // sec)
    ys = []
    kernel = cuda.elementwise(
        _args, 'COPY(y[i] = x[idx])', 'split_fwd', preamble=_preamble)
    prev_i = 0  # start of the current chunk along the split axis
    for i in ind:
        cdimy = max(0, min(i, self.cdimx) - prev_i)
        s = list(xshape)
        s[self.axis] = cdimy
        y = cuda.empty(s, dtype=x[0].dtype)
        if cdimy == 0:
            raise ValueError('Not support if shape contains 0')
        kernel(y, x[0], cdimy, self.cdimx, self.rdim, prev_i)
        prev_i = i
        ys.append(y)
    return tuple(ys)
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    """Expand image patches into columns on GPU.

    Duplicate of the cupy-style im2col: returns a
    (n, c, kh, kw, out_h, out_w) tensor with zeros outside the image.
    """
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)
    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    # One thread per column element; decodes (c, ky, kx, oy, ox) from i.
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T col',
        '''
        int c0 = i / (kh * kw * out_h * out_w);
        int ky = i / (kw * out_h * out_w) % kh;
        int kx = i / (out_h * out_w) % kw;
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;

        int in_y = ky + out_y * sy - ph;
        int in_x = kx + out_x * sx - pw;
        if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
          col = img[in_x + w * (in_y + h * c0)];
        } else {
          col = 0;
        }
        ''', 'im2col')(img.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, col)
    return col
def backward_gpu(self, x, gy):
    """Softmax backward on GPU: gx = y * gy - y * sum(y * gy, axis=1).

    Duplicate of the softmax backward above; cuDNN path when enabled,
    otherwise custom row-sum kernel plus subtraction.
    """
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(
                self.y), desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
            cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]  # elementwise y * dy
        c = gx.shape[1]
        # Row-wise sum of y*dy; one thread per row.
        sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
            const float* row = ydy + i * c;
            float sum = 0;
            for (int j = 0; j < c; ++j) {
              sum += row[j];
            }
            sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        # gx[i] -= y[i] * rowsum, completing the softmax Jacobian product.
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
def forward_gpu(self, x):
    """Split the input array along self.axis on GPU.

    Latest variant of the split forward: note the explicit dtype=int on
    numpy.prod, which keeps rdim an integer even when the shape tail is
    empty. Raises ValueError on uneven division or a zero-sized chunk.
    """
    xshape = x[0].shape
    self.cdimx = xshape[self.axis]
    # dtype=int guards against the float64 result of an empty product.
    self.rdim = numpy.prod(xshape[self.axis + 1:], dtype=int)
    if isinstance(self.indices_or_sections, collections.Iterable):
        ind = list(self.indices_or_sections)
        ind.append(self.cdimx)  # final split point closes the last chunk
    else:
        sec = self.indices_or_sections
        if self.cdimx % sec:
            raise ValueError(
                'array split does not result in an equal division')
        # Equal sections: split points sec, 2*sec, ..., cdimx.
        ind = numpy.arange(1, sec + 1) * (self.cdimx // sec)
    ys = []
    kernel = cuda.elementwise(_args, 'COPY(y[i] = x[idx])', 'split_fwd',
                              preamble=_preamble)
    prev_i = 0  # start of the current chunk along the split axis
    for i in ind:
        cdimy = max(0, min(i, self.cdimx) - prev_i)
        s = list(xshape)
        s[self.axis] = cdimy
        y = cuda.empty(tuple(s), dtype=x[0].dtype)
        if cdimy == 0:
            raise ValueError('Not support if shape contains 0')
        kernel(y, x[0], cdimy, self.cdimx, self.rdim, prev_i)
        prev_i = i
        ys.append(y)
    return tuple(ys)
def forward_gpu(self, x):
    """Compute the batched matrix product of the two inputs on GPU."""
    lhs, rhs = x
    # Allocate the result with the shape implied by the transpose flags.
    out = cuda.empty(self._output_shape(lhs, rhs))
    _batch_matmul_gpu(
        lhs, rhs, transa=self.transa, transb=self.transb, out=out)
    return out,
def forward_gpu(self, x):
    """Look up embedding rows of W for each integer id in x[0] on GPU."""
    ids = x[0]
    n_out = self.W.shape[1]
    out = cuda.empty((ids.size, n_out), dtype=numpy.float32)
    # Each output element copies one entry of its id's embedding row.
    kernel = cuda.elementwise(
        'float* y, const float* W, const int* x, int n_out',
        'y[i] = W[x[i / n_out] * n_out + i % n_out]',
        'embed_id_fwd')
    kernel(out, self.W, ids, n_out)
    return out,
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    """Fold an im2col column tensor back into an image on GPU.

    Duplicate of the cupy-style col2im: sums overlapping patch
    contributions into an (n, c, h, w) image.
    """
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    # One thread per output pixel accumulates all covering windows.
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T img',
        '''
        int c0 = i / (h * w);
        int y  = i / w % h + ph;
        int x  = i % w + pw;

        int out_y_0 = max(0,     (y - kh + sy) / sy);
        int out_y_1 = min(out_h, (y      + sy) / sy);
        int out_x_0 = max(0,     (x - kw + sx) / sx);
        int out_x_1 = min(out_w, (x      + sx) / sx);

        T val = 0;
        for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
          int ky = y - out_y * sy;
          for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
            int kx = x - out_x * sx;
            int k = out_y + out_h * (kx + kw * (ky + kh * c0));
            val += col[out_x + out_w * k];
          }
        }
        img = val;
        ''', 'col2im')(col.reduced_view(),
                       h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, img)
    return img
def forward_gpu(self, inputs):
    """Binary hierarchical softmax forward on GPU (cupy-style kernels).

    inputs is (x, t). Pads every sample's tree path to the batch maximum
    path length, evaluates per-node logistic losses in a single kernel,
    and returns their sum as a 1-tuple. Caches wxy and max_length for
    the backward pass.
    """
    x, t = inputs
    # Longest tree path among the targets in this batch.
    max_length = cuda.reduce(
        'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
        'max(a, b)', 'out = a', '0',
        'binary_hierarchical_softmax_max_length')(t, self.begins)
    max_length = cuda.to_cpu(max_length)[()]
    length = max_length * x.shape[0]
    ls = cuda.empty((length,), dtype=numpy.float32)
    n_in = x.shape[1]
    wxy = cuda.empty((length,), dtype=numpy.float32)
    # One thread per (sample, path-position); unused slots write 0 loss.
    cuda.elementwise(
        '''raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, int32 c, int32 max_length''',
        'T ls, T wxy',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];

          T wx = 0;
          for (int j = 0; j < c; ++j) {
            int w_ind[] = {node, j};
            int x_ind[] = {ind, j};
            wx += w[w_ind] * x[x_ind];
          }
          wxy = wx * codes[p];
          ls = log(1 + exp(-wxy));
        } else {
          ls = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward'
    )(x, self.W, t, self.paths, self.codes, self.begins, n_in,
      max_length, ls, wxy)
    self.max_length = max_length
    self.wxy = wxy  # cached for backward
    return ls.sum(),
def forward_gpu(self, x):
    """Deconvolution forward pass on GPU (pycuda/cuDNN-dict era API).

    Uses cuDNN's ConvolutionBackwardData when enabled; otherwise
    multiplies W^T into each sample and folds the columns back into an
    image with col2im, adding the bias with a small kernel.
    """
    n, out_c, out_h, out_w = x[0].shape
    c = self.W.shape[1]  # number of output channels
    h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
    w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
        y = cuda.empty((n, c, h, w), dtype=numpy.float32)
        y_desc = cudnn.get_tensor_desc(y, h, w)
        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)
        # Deconvolution == data-gradient of a convolution.
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            0, y_desc.value, cudnn.get_ptr(y))
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        handle = cuda.get_cublas_handle()
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w),
                          dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw,
                                 out_h * out_w)
        for i in moves.range(n):
            # Per-sample W^T . x written into the column buffer.
            cuda.culinalg.dot(W_mat, x_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, c, h * w)
    return y,
def forward_gpu(self, inputs):
    """Compute the binary hierarchical softmax loss on GPU.

    ``inputs`` is ``(x, t)``: a (batch, n_in) float32 GPU array and an
    int32 GPU array of target indices. Returns a one-element tuple with
    the summed loss over all tree paths.
    """
    x, t = inputs
    # Longest path length among the sampled targets (begins is a CSR-like
    # offset array: path of word t spans begins[t] .. begins[t + 1]).
    max_length = cuda.reduce(
        'T t, raw T begins', 'T out',
        'begins[t + 1] - begins[t]',
        'max(a, b)', 'out = a', '0',
        'binary_hierarchical_softmax_max_length')(t, self.begins)
    # Copy the scalar to the host to size the work buffers below.
    max_length = cuda.to_cpu(max_length)[()]
    length = max_length * x.shape[0]
    ls = cuda.empty((length, ), dtype=numpy.float32)
    n_in = x.shape[1]
    wxy = cuda.empty((length, ), dtype=numpy.float32)
    # One thread per (sample, path position); out-of-range positions
    # contribute zero loss.
    cuda.elementwise(
        '''raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, int32 c, int32 max_length''',
        'T ls, T wxy',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];

          T wx = 0;
          for (int j = 0; j < c; ++j) {
            int w_ind[] = {node, j};
            int x_ind[] = {ind, j};
            wx += w[w_ind] * x[x_ind];
          }
          wxy = wx * codes[p];
          ls = log(1 + exp(-wxy));
        } else {
          ls = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward')(
        x, self.W, t, self.paths, self.codes, self.begins, n_in,
        max_length, ls, wxy)
    # Cached for the backward pass.
    self.max_length = max_length
    self.wxy = wxy
    return ls.sum(),
def forward_gpu(self, x):
    """Bilinear forward on GPU: y_il = sum_jk e1_ij e2_ik W_jkl (+ linear terms).

    ``x`` is ``(e1, e2)``. When ``self.nobias`` is false, also adds
    ``e1 V1 + e2 V2 + b``.
    """
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = self.W.shape[2]
    # When indices are enclosed with [], they are 'flatten'
    # (i.e. linealized as 1-D array)
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
    # '[ij],[ik]->[ijk]'  (row-wise outer product of e1 and e2)
    cuda.elementwise(
        'float* y, float* e1, float* e2, int e1c, int e2c',
        '''
        int I = i / e1c / e2c;
        int J = (i - I * e1c * e2c) / e2c;
        int K = i % e2c;
        y[i] = e1[I * e1c + J] * e2[I * e2c + K];
        ''',
        'row_wise_outer_product')(e1e2, e1, e2, j_len, k_len)
    # [ijk]->i[jk]
    e1e2 = e1e2.reshape(i_len, j_len * k_len)
    # jkl->[jk]l
    W_mat = self.W.reshape(
        self.W.shape[0] * self.W.shape[1], self.W.shape[2])
    y = cuda.empty((i_len, l_len), dtype=numpy.float32)
    with cuda.using_cumisc():
        # 'i[jk],[jk]l->il'  (contract the outer product against W)
        cuda.culinalg.dot(e1e2, W_mat, out=y)
    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,jl->il
            cuda.culinalg.add_dot(e1, self.V1, y)
            # ik,kl->il
            cuda.culinalg.add_dot(e2, self.V2, y)
        # Broadcast-add the bias across rows.
        cuda.elementwise('float* y, float* b, int n_channel',
                         'y[i] += b[i % n_channel]',
                         'linear_bias')(y, self.b, self.b.size)
    return y,
def backward_gpu(self, inputs, grads):
    """Backward of softplus on GPU.

    The derivative of softplus is sigmoid(beta * x), computed here as
    1 - 1/(1 + exp(beta * x)) and multiplied by the incoming gradient.
    """
    inp = inputs[0]
    gout = grads[0]
    grad_in = cuda.empty(inp.shape, numpy.float32)
    kernel = cuda.elementwise(
        'float* gx, const float* x, const float* g, float beta',
        'gx[i] = (1.f - 1.f / (1.f + __expf(beta * x[i]))) * g[i];',
        'softplus_backward')
    kernel(grad_in, inp, gout, self.beta)
    return grad_in,
def sample_gpu(self, shape):
    """Draw ``shape`` samples on GPU using Walker's alias method.

    Returns an int32 GPU array of sampled values drawn from the
    distribution encoded by ``self.threshold``/``self.values``.
    """
    # Uniform randoms in [0, 1]; one per output sample.
    ps = cuda.empty(shape, numpy.float32)
    cuda.get_generator().fill_uniform(ps)
    vs = cuda.empty(shape, numpy.int32)
    cuda.elementwise(
        '''int* vs, const float* ps, const float* threshold,
        const int* values, int b''',
        '''
        float pb = ps[i] * b;
        int index = __float2int_rd(pb);
        // fill_uniform sometimes returns 1.0, so we need to check index
        if (index >= b) {
          index = 0;
        }
        int lr = threshold[index] < pb - index;
        vs[i] = values[index * 2 + lr];
        ''',
        'walker_alias_sample')(vs, ps, self.threshold, self.values,
                               len(self.threshold))
    return vs
def forward_gpu(self, inputs):
    """Compute softplus y = (1/beta) * log(1 + exp(beta * x)) on GPU."""
    x, = inputs
    y = cuda.empty(x.shape)
    # Numerically stable form: max(bx, 0) + log1p(exp(-|bx|)) avoids
    # overflow of exp for large bx.
    cuda.elementwise(
        'float* y, const float* x, float beta, float beta_inv',
        '''
        float bx = beta * x[i];
        y[i] = (max(bx, 0.f) + log1pf(__expf(-fabsf(bx)))) * beta_inv;
        ''',
        'softplus')(y, x, self.beta, self.beta_inv)
    return y,
def forward_gpu(self, x):
    """Linear forward on GPU: out = x W^T, plus bias when present."""
    mat = _as_mat(x[0])
    out = cuda.empty((mat.shape[0], self.W.shape[0]), dtype=mat.dtype)
    with cuda.using_cumisc():
        cuda.culinalg.dot(mat, self.W, transb='T', out=out)
    if self.b is not None:
        # Broadcast-add the bias over every row of the output.
        bias_kernel = cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')
        bias_kernel(out, self.b, self.b.size)
    return out,
def backward_gpu(self, inputs, grads):
    """Backward of softplus on GPU: gx = sigmoid(beta * x) * g."""
    x, = inputs
    g, = grads
    gx = cuda.empty(x.shape, numpy.float32)
    # 1 - 1/(1 + exp(beta*x)) == sigmoid(beta*x), the softplus derivative.
    cuda.elementwise(
        'float* gx, const float* x, const float* g, float beta',
        'gx[i] = (1.f - 1.f / (1.f + __expf(beta * x[i]))) * g[i];',
        'softplus_backward'
    )(gx, x, g, self.beta)
    return gx,
def backward_gpu(self, x, gy):
    """Backward of 2D unpooling on GPU.

    Max-pools the upstream gradient ``gy[0]`` back down to the input
    resolution. ``x`` is a dummy variable, required only for interface
    compatibility with pooling_2d.Pooling2D; only its dtype is used.

    Returns a one-element tuple with the pooled gradient of shape
    (n, c, y_h, y_w).
    """
    n, c, h, w = gy[0].shape
    y_h = conv.get_conv_outsize(
        h, self.kh, self.sy, self.ph, self.cover_all)
    y_w = conv.get_conv_outsize(
        w, self.kw, self.sx, self.pw, self.cover_all)
    # NOTE: the original code also allocated an output-sized buffer `y`
    # here that was never used; the dead allocation has been removed.
    gx = cuda.empty((n, c, y_h, y_w), dtype=x[0].dtype)
    # Standard max-pooling forward kernel, applied to the gradient: each
    # output cell takes the maximum over its pooling window in `in`.
    cuda.elementwise(
        'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T out',
        '''
        int c0 = i / (out_h * out_w);
        int out_y = i / out_w % out_h;
        int out_x = i % out_w;
        int in_y_0 = max(0, out_y * sy - ph);
        int in_y_1 = min(h, out_y * sy + kh - ph);
        int in_x_0 = max(0, out_x * sx - pw);
        int in_x_1 = min(w, out_x * sx + kw - pw);

        float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
        int argmax_y = in_y_0;
        int argmax_x = in_x_0;
        for (int y = in_y_0; y < in_y_1; ++y) {
          int offset_y = w * (y + h * c0);
          for (int x = in_x_0; x < in_x_1; ++x) {
            float v = in[x + offset_y];
            if (maxval < v) {
              maxval = v;
              argmax_y = y;
              argmax_x = x;
            }
          }
        }
        out = maxval;

        int argmax_ky = argmax_y + ph - out_y * sy;
        int argmax_kx = argmax_x + pw - out_x * sx;
        ''',
        'max_pool_fwd')(gy[0].reduced_view(), h, w, y_h, y_w,
                        self.kh, self.kw, self.sy, self.sx,
                        self.ph, self.pw, gx)
    return gx,
def forward_gpu(self, x):
    """Linear forward on GPU: y = x W^T, plus bias when present."""
    x = _as_mat(x[0])
    y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
    with cuda.using_cumisc():
        cuda.culinalg.dot(x, self.W, transb='T', out=y)
    if self.b is not None:
        # Broadcast-add the bias across rows of the output.
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(y, self.b, self.b.size)
    return y,
def backward_gpu(self, inputs, grad_outputs):
    """Backward of the cross-covariance cost on GPU.

    Uses the centered activations and covariance matrix cached by the
    forward pass (``self.y_centered``, ``self.z_centered``,
    ``self.covariance``).
    """
    y, z = inputs
    gcost, = grad_outputs
    N = y.shape[0]
    gy = cuda.empty(y.shape)
    gz = cuda.empty(z.shape)
    # dC/dy = z_centered . covariance^T / N  (beta=0 overwrites gy)
    cuda.culinalg.add_dot(self.z_centered, self.covariance, gy,
                          transb='T', alpha=1. / N, beta=0.)
    # dC/dz = y_centered . covariance / N
    cuda.culinalg.add_dot(self.y_centered, self.covariance, gz,
                          alpha=1. / N, beta=0.)
    # Scale both gradients by the upstream gradient of the cost.
    gy = cuda.cumisc.multiply(gy, gcost)
    gz = cuda.cumisc.multiply(gz, gcost)
    return gy, gz
def forward_gpu(self, inputs): y, z = inputs # Center inputs y_mean = cuda.empty((1, y.shape[1])) z_mean = cuda.empty((1, z.shape[1])) cuda.cumisc.mean(y, axis=0, out=y_mean, keepdims=True) cuda.cumisc.mean(z, axis=0, out=z_mean, keepdims=True) self.y_centered = cuda.cumisc.subtract(y, y_mean) self.z_centered = cuda.cumisc.subtract(z, z_mean) # Calculate cross-covariance self.covariance = cuda.empty((y.shape[1], z.shape[1])) cuda.culinalg.add_dot( self.y_centered, self.z_centered, self.covariance, transa="T", alpha=1.0 / y.shape[0], beta=0.0 ) # Calculate cost cost = cuda.cumisc.sum(0.5 * self.covariance ** 2) return (cost,)
def sample_gpu(self, shape):
    """Sample ``shape`` values on GPU via Walker's alias method.

    ``self.threshold``/``self.values`` encode the alias table; the
    result is an int32 GPU array of sampled values.
    """
    ps = cuda.empty(shape, numpy.float32)
    cuda.get_generator().fill_uniform(ps)
    vs = cuda.empty(shape, numpy.int32)
    cuda.elementwise(
        '''int* vs, const float* ps, const float* threshold,
        const int* values, int b''',
        '''
        float pb = ps[i] * b;
        int index = __float2int_rd(pb);
        // fill_uniform sometimes returns 1.0, so we need to check index
        if (index >= b) {
          index = 0;
        }
        int lr = threshold[index] < pb - index;
        vs[i] = values[index * 2 + lr];
        ''',
        'walker_alias_sample'
    )(vs, ps, self.threshold, self.values, len(self.threshold))
    return vs
def forward_gpu(self, inputs):
    """Softplus forward on GPU: y = (1/beta) * log(1 + exp(beta * x))."""
    x, = inputs
    y = cuda.empty(x.shape)
    # Stable evaluation: max(bx, 0) + log1p(exp(-|bx|)) never overflows.
    cuda.elementwise(
        'float* y, const float* x, float beta, float beta_inv',
        '''
        float bx = beta * x[i];
        y[i] = (max(bx, 0.f) + log1pf(__expf(-fabsf(bx)))) * beta_inv;
        ''',
        'softplus'
    )(y, x, self.beta, self.beta_inv)
    return y,
def _partial_reduce(x):
    """Sum each row of the 2-D GPU array ``x`` via a cached ones-vector GEMV."""
    global _one
    out_axis, sum_axis = x.shape
    one = _one
    # Lazily allocate (and grow) the ones vector; cached in module-global
    # _one so repeated calls reuse the same buffer.
    if one is None or one.size < sum_axis:
        one = cuda.ones(sum_axis)
        _one = one
    one = one[:sum_axis]
    handle = cuda.get_cublas_handle()
    ret = cuda.empty(out_axis)
    # ret = x . one, done as a transposed GEMV because cuBLAS is
    # column-major while x is row-major.
    cuda.cublas.cublasSgemv(handle, 't', sum_axis, out_axis,
                            numpy.float32(1.0), x.gpudata, sum_axis,
                            one.gpudata, 1, numpy.float32(0.0),
                            ret.gpudata, 1)
    return ret
def forward_gpu(self, inputs):
    """Reparameterized Gaussian sample on GPU: mean + exp(ln_var / 2) * eps."""
    mean, ln_var = inputs
    # Draw the standard-normal noise once and reuse it on later calls.
    if self.eps is None:
        eps = cuda.empty(ln_var.shape, numpy.float32)
        cuda.get_generator().fill_normal(eps)
        self.eps = eps
    scaled = cuda.empty_like(ln_var)
    kernel = cuda.elementwise(
        'float* noise, const float* v, const float* e',
        'noise[i] = __expf(v[i] * 0.5f) * e[i];',
        'gaussian_forward')
    kernel(scaled, ln_var, self.eps)
    self.noise = scaled
    return mean + self.noise,
def _partial_reduce(x):
    """Row-wise sum of a 2-D GPU array using cuBLAS GEMV with a ones vector."""
    global _one
    out_axis, sum_axis = x.shape
    one = _one
    # Grow-and-cache the ones vector in module-global _one.
    if one is None or one.size < sum_axis:
        one = cuda.ones(sum_axis)
        _one = one
    one = one[:sum_axis]
    handle = cuda.get_cublas_handle()
    ret = cuda.empty(out_axis)
    # 't' transposes because cuBLAS assumes column-major storage.
    cuda.cublas.cublasSgemv(handle, 't', sum_axis, out_axis,
                            numpy.float32(1.0), x.gpudata, sum_axis,
                            one.gpudata, 1, numpy.float32(0.0),
                            ret.gpudata, 1)
    return ret
def forward_gpu(self, inputs):
    """Gaussian reparameterization on GPU: mean + exp(ln_var / 2) * eps."""
    mean, ln_var = inputs
    # Sample the standard-normal noise lazily; reused across calls.
    if self.eps is None:
        self.eps = cuda.empty(ln_var.shape, numpy.float32)
        cuda.get_generator().fill_normal(self.eps)
    noise = cuda.empty_like(ln_var)
    cuda.elementwise(
        'float* noise, const float* v, const float* e',
        'noise[i] = __expf(v[i] * 0.5f) * e[i];',
        'gaussian_forward'
    )(noise, ln_var, self.eps)
    self.noise = noise
    return mean + self.noise,
def forward_gpu(self, inputs): y, z = inputs # Center inputs y_mean = cuda.empty((1, y.shape[1])) z_mean = cuda.empty((1, z.shape[1])) cuda.cumisc.mean(y, axis=0, out=y_mean, keepdims=True) cuda.cumisc.mean(z, axis=0, out=z_mean, keepdims=True) self.y_centered = cuda.cumisc.subtract(y, y_mean) self.z_centered = cuda.cumisc.subtract(z, z_mean) # Calculate cross-covariance self.covariance = cuda.empty((y.shape[1], z.shape[1])) cuda.culinalg.add_dot(self.y_centered, self.z_centered, self.covariance, transa='T', alpha=1. / y.shape[0], beta=0.) # Calculate cost cost = cuda.cumisc.sum(0.5 * self.covariance**2) return cost,
def getByIndex_LogAndClip(probs, t, out=None):
    """
    This kernel takes an element in each row of probs at indices t,
    and clips the output from 1e-8 to 1 and takes the log.

    probs is an (N, M) float32 GPU array; t holds one column index per
    row. If out is None, an (N, 1) float32 buffer is allocated.
    """
    N, M = probs.shape
    # One thread per row.
    bdim, gdim = Get_bdim_and_gdim1D(N)
    if out is None:
        out = cuda.empty((N,1),dtype=np.float32)
    IndexAndClipAndLog_kernel.prepared_call(gdim, bdim,
                                            probs.gpudata, t.gpudata,
                                            out.gpudata,
                                            np.int32(N), np.int32(M))
    return out
def forward_gpu(self, x):
    """Row-wise softmax on GPU.

    Uses cuDNN when available; otherwise a four-kernel fallback:
    row max -> shifted exp -> inverse row sum -> normalize.
    """
    y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value,
            cudnn.get_ptr(x[0]), 0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
        c = x[0].shape[1]
        # Per-row maximum, used to shift exponents for stability.
        cuda.elementwise(
            'float* maxes, const float* x, int c',
            '''
            const float* row = x + i * c;
            float maxval = row[0];
            for (int j = 1; j < c; ++j) {
              if (maxval < row[j]) {
                maxval = row[j];
              }
            }
            maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c)
        cuda.elementwise(
            'float* y, const float* x, const float* maxes, int c',
            'y[i] = __expf(x[i] - maxes[i / c])',
            'softmax_exp')(y, x[0], maxes, c)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int c',
            '''
            const float* row = y + i * c;
            float sum = 0;
            for (int j = 0; j < c; ++j) {
              sum += row[j];
            }
            coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c)
        cuda.elementwise(
            'float* y, const float* coeff, int c',
            'y[i] *= coeff[i / c]',
            'softmax_rowmul')(y, coeff, c)
        self.y = y
    return y,
def forward_gpu(self, x):
    """2D pooling forward on GPU via cuDNN (old wrapper API)."""
    # Implementation using cudnn
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(
        h, self.kh, self.sy, self.ph, self.cover_all)
    y_w = conv.get_conv_outsize(
        w, self.kw, self.sx, self.pw, self.cover_all)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    handle = cudnn.get_default_handle()
    pool_desc = self.create_pool_desc()
    x_desc = cudnn.get_tensor_desc(x[0], x[0].shape[2], x[0].shape[3])
    y_desc = cudnn.get_tensor_desc(y, y_h, y_w)
    # alpha=1, beta=0: overwrite y with the pooled result.
    libcudnn.cudnnPoolingForward(
        handle, pool_desc.value, 1, x_desc.value, cudnn.get_ptr(x[0]),
        0, y_desc.value, cudnn.get_ptr(y))
    # Cached for the backward pass.
    self.y = y
    return y,
def forward_gpu(self, inputs):
    """Classification accuracy on GPU.

    ``inputs`` is ``(x, t)``: an (N, c) score matrix and int labels.
    Returns the fraction of rows whose argmax equals the label.
    """
    x, t = inputs
    # One byte per sample: 1 if argmax matches the label, else 0.
    fragments = cuda.empty((x.shape[0], ), dtype=numpy.int8)
    cuda.elementwise(
        'char* fragments, const float* x, const int* t, int c',
        '''
        x += i * c;
        float maxval = x[0];
        int   argmax = 0;
        for (int j = 1; j < c; ++j) {
          if (maxval < x[j]) {
            maxval = x[j];
            argmax = j;
          }
        }
        fragments[i] = argmax == t[i];
        ''', 'accuracy_fwd_map')(fragments, x, t, x.shape[1])
    # Mean of the 0/1 flags = accuracy.
    y = cuda.gpuarray.sum(fragments, dtype=numpy.float32)
    y /= x.shape[0]
    return y,
def forward_gpu(self, xs):
    """Concatenate GPU arrays ``xs`` along ``self.axis``."""
    # TODO(beam2d): Unify the process into a single kernel.
    # Output shape: same as xs[0] except the concat axis, which is the
    # sum of the inputs' sizes along that axis.
    shape = list(xs[0].shape)
    for x in xs[1:]:
        shape[self.axis] += x.shape[self.axis]
    self.shape = shape
    y = cuda.empty(shape, dtype=xs[0].dtype)
    self.cdimy = y.shape[self.axis]
    # Product of the trailing dimensions after the concat axis.
    self.rdim = numpy.prod(shape[self.axis + 1:])
    coffset = 0
    kernel = cuda.elementwise(
        _args, 'COPY(y[idx] = x[i])', 'concat_fwd', preamble=_preamble)
    # Copy each input into its slice of y at the running offset.
    for x in xs:
        cdimx = x.shape[self.axis]
        kernel(x, y, cdimx, self.cdimy, self.rdim, coffset)
        coffset += cdimx
    return y,
def forward_gpu(self, x):
    """2D pooling forward on GPU via cuDNN (newer wrapper API)."""
    # Implementation using cudnn
    n, c, h, w = x[0].shape
    y_h = conv.get_conv_outsize(
        h, self.kh, self.sy, self.ph, self.cover_all)
    y_w = conv.get_conv_outsize(
        w, self.kw, self.sx, self.pw, self.cover_all)
    y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
    handle = cudnn.get_handle()
    pool_desc = self.create_pool_desc()
    x_desc = cudnn.create_tensor_descriptor(x[0])
    y_desc = cudnn.create_tensor_descriptor(y)
    # alpha=1, beta=0: overwrite y with the pooled result.
    libcudnn.poolingForward(
        handle, pool_desc.value, ctypes.c_float(1), x_desc.value,
        x[0].data.ptr, ctypes.c_float(0), y_desc.value, y.data.ptr)
    # Cached for the backward pass.
    self.y = y
    return y,