def __init__(self, in_size, out_size, wscale=1, bias=0, nobias=False,
             initialW=None, initial_bias=None):
    self.W = None
    self.gW = None
    self.b = None
    self.gb = None

    if initialW is not None:
        assert initialW.shape == (out_size, in_size)
        self.W = initialW
    else:
        self.W = numpy.random.normal(
            0, wscale * math.sqrt(1. / in_size),
            (out_size, in_size)).astype(numpy.float32)
    if isinstance(self.W, cuda.ndarray):
        self.gW = cuda.empty_like(self.W)
    else:
        self.gW = numpy.empty_like(self.W)

    if initial_bias is not None:
        assert initial_bias.shape == (out_size,)
        self.b = initial_bias
    elif not nobias:
        self.b = numpy.repeat(numpy.float32(bias), out_size)

    if self.b is not None:
        if isinstance(self.b, cuda.ndarray):
            self.gb = cuda.empty_like(self.b)
        else:
            self.gb = numpy.empty_like(self.b)
def __init__(self, in_size, out_size, wscale=1, bias=0, nobias=False,
             initialW=None, initial_bias=None):
    self.W = None
    self.gW = None
    self.b = None
    self.gb = None

    if initialW is not None:
        assert initialW.shape == (out_size, in_size)
        self.W = initialW
    else:
        self.W = numpy.random.normal(
            0, wscale * math.sqrt(1. / in_size),
            (out_size, in_size)).astype(numpy.float32)
    if isinstance(self.W, cuda.GPUArray):
        self.gW = cuda.empty_like(self.W)
    else:
        self.gW = numpy.empty_like(self.W)

    if initial_bias is not None:
        assert initial_bias.shape == (out_size,)
        self.b = initial_bias
    elif not nobias:
        self.b = numpy.repeat(numpy.float32(bias), out_size)

    if self.b is not None:
        if isinstance(self.b, cuda.GPUArray):
            self.gb = cuda.empty_like(self.b)
        else:
            self.gb = numpy.empty_like(self.b)
def backward_gpu(self, inputs, grad_outputs):
    gh = grad_outputs[0]
    x, h_tm1 = inputs
    N = x.shape[0]

    gz = cuda.empty_like(gh)
    if self.act_func_str in ('tanh', 'sigmoid'):
        # backpropagate non-linearities
        gz = self.cu_dact_func(gy=gh, y=self.h, out=gz)
        # compute gradient with respect to the hidden input state
        gh_tm1 = cuk.dot(gz, self.V, out=self.h)
    elif self.act_func_str in ('leakyrelu', 'relu'):
        # backpropagate non-linearities
        gz = self.cu_dact_func(x=self.z, gy=gh, out=gz)
        # compute gradient with respect to the hidden input state
        gh_tm1 = cuk.dot(gz, self.V, out=self.z)
    else:
        raise NotImplementedError('the activation function is not available')

    # backpropagate linear function
    if self.hot:
        gx = None
        cuk.dothot(gz, x, in_size=self.in_size, out=self.gW)
    else:
        gx = cuda.empty_like(x)
        cuk.dot(gz, self.W, out=gx)
        cuk.dotAdd(gz, x, C=self.gW, transa='t')
    cuk.dotAdd(gz, h_tm1, C=self.gV, transa='t')
    if not self.nobias:
        gb_ones = cuda.ones((1, N), dtype=np.float32)
        cuk.dotAdd(gb_ones, gz, C=self.gb)

    return gx, gh_tm1
def backward_gpu(self, x, gy):
    gx0 = cuda.empty_like(x[0])
    gx1 = cuda.empty_like(x[1])
    cuda.elementwise(
        'float* gx0, float* gx1, const float* x0, const float* x1, const float* gy',
        '''gx0[i] = gy[i] * x1[i];
           gx1[i] = gy[i] * x0[i];''',
        'mul_bwd')(gx0, gx1, x[0], x[1], gy[0])
    return gx0, gx1
def backward_gpu(self, x, gy):
    gx0 = cuda.empty_like(x[0])
    gx1 = cuda.empty_like(x[1])
    cuda.elementwise(
        'float* gx0, float* gx1, const float* x0, const float* x1, const float* gy',
        '''gx0[i] = x1[i] * __powf(x0[i], x1[i] - 1) * gy[i];
           gx1[i] = __logf(x0[i]) * __powf(x0[i], x1[i]) * gy[i];''',
        'pow_var_var_bwd')(gx0, gx1, x[0], x[1], gy[0])
    return gx0, gx1
def backward_gpu(self, x, gy): gx0 = cuda.empty_like(x[0]) gx1 = cuda.empty_like(x[1]) cuda.elementwise( "float* gx0, float* gx1, const float* x0, const float* x1, const float* gy", """gx0[i] = gy[i] * x1[i]; gx1[i] = gy[i] * x0[i];""", "mul_bwd", )(gx0, gx1, x[0], x[1], gy[0]) return gx0, gx1
def backward_gpu(self, inputs, gy):
    x0, x1 = inputs
    gx0 = cuda.empty_like(x0)
    gx1 = cuda.empty_like(x1)
    coeff = gy[0] * (2. / x0.size)
    cuda.elementwise(
        '''float* gx0, float* gx1, const float* x0, const float* x1,
           const float* coeff''',
        '''gx0[i] = *coeff * (x0[i] - x1[i]);
           gx1[i] = -gx0[i];''',
        'mse_bwd')(gx0, gx1, x0, x1, coeff)
    return gx0, gx1
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        gy_arr = gy[0]
        if not gy_arr.flags.c_contiguous:
            gy_arr = cuda.cupy.ascontiguousarray(gy_arr)
        gy_desc = cudnn.create_tensor_descriptor(gy_arr)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        if self.b is not None:
            libcudnn.convolutionBackwardBias(
                handle, one, gy_desc.value, gy_arr.data.ptr,
                one, self.bias_desc.value, self.gb.data.ptr)

        libcudnn.convolutionBackwardFilter(
            handle, one, x_desc.value, x[0].data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            one, self.filter_desc.value, self.gW.data.ptr)

        gx = cuda.empty_like(x[0])
        libcudnn.convolutionBackwardData(
            handle, one, self.filter_desc.value, self.W.data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            zero, x_desc.value, gx.data.ptr)
    else:
        if self.gb is not None:
            self.gb += gy[0].sum(axis=(0, 2, 3))

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.cupy.dot(W_mat.T, gy_mats[i], gcol_mats[i])

        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            0, x_desc.value, cudnn.get_ptr(gx))
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                self.gb += tmp

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        gy_arr = gy[0]
        if not gy_arr.flags.c_contiguous:
            gy_arr = cuda.cupy.ascontiguousarray(gy_arr)
        gy_desc = cudnn.create_tensor_descriptor(gy_arr)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        if self.b is not None:
            libcudnn.convolutionBackwardBias(
                handle, one, gy_desc.value, gy_arr.data.ptr,
                one, self.bias_desc.value, self.gb.data.ptr)

        libcudnn.convolutionBackwardFilter(
            handle, one, x_desc.value, x[0].data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            one, self.filter_desc.value, self.gW.data.ptr)

        gx = cuda.empty_like(x[0])
        libcudnn.convolutionBackwardData(
            handle, one, self.filter_desc.value, self.W.data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            zero, x_desc.value, gx.data.ptr)
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            self.gb += gy[0].sum(axis=(0, 2, 3))

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0,
             wscale=1, bias=0, nobias=False, use_cudnn=True,
             initialW=None, initial_bias=None, dtype=numpy.float32):
    self.dtype = numpy.dtype(dtype)

    ksize = _pair(ksize)
    stride = _pair(stride)
    pad = _pair(pad)

    self.kh, self.kw = ksize
    self.sy, self.sx = stride
    self.ph, self.pw = pad

    self.in_channels = in_channels
    self.out_channels = out_channels

    self.W = None
    self.gW = None
    self.b = None
    self.gb = None

    if initialW is not None:
        assert initialW.shape == \
            (out_channels, in_channels, self.kh, self.kw)
        self.W = initialW
    else:
        self.W = numpy.random.normal(
            0, wscale * math.sqrt(1. / (self.kh * self.kw * in_channels)),
            (out_channels, in_channels, self.kh, self.kw)
        ).astype(self.dtype)
    if isinstance(self.W, cuda.ndarray):
        self.gW = cuda.empty_like(self.W)
    else:
        self.gW = numpy.empty_like(self.W)

    if initial_bias is not None:
        assert initial_bias.shape == (out_channels,)
        self.b = initial_bias
    elif not nobias:
        self.b = numpy.repeat(self.dtype.type(bias), out_channels)

    if self.b is not None:
        if isinstance(self.b, cuda.ndarray):
            self.gb = cuda.empty_like(self.b)
        else:
            self.gb = numpy.empty_like(self.b)

    self.use_cudnn = use_cudnn
    if cuda.cudnn_enabled and use_cudnn:
        # chance to choose implicit-precomp-gemm algorithm
        self.max_workspace_size = in_channels * self.kh * self.kw * 4
def backward_gpu(self, inputs, gy):
    x0, x1 = inputs
    gx0 = cuda.empty_like(x0)
    gx1 = cuda.empty_like(x1)
    coeff = gy[0] * (2.0 / x0.size)
    cuda.elementwise(
        """float* gx0, float* gx1, const float* x0, const float* x1,
           const float* coeff""",
        """gx0[i] = *coeff * (x0[i] - x1[i]);
           gx1[i] = -gx0[i];""",
        "mse_bwd",
    )(gx0, gx1, x0, x1, coeff)
    return gx0, gx1
def forward_gpu(self, x):
    self.rand = cuda.empty_like(x[0])
    y = cuda.empty_like(x[0])
    cuda.get_generator().fill_uniform(self.rand)
    # Inverted dropout: scale the kept units so that the expectation of the
    # output matches the input.
    self.scale = 1. / (1 - self.dropout_ratio)

    self.kernel = cuda.elementwise(
        '''float* y, const float* x, const float* rand, float dropout_ratio,
           float scale''',
        'y[i] = rand[i] < dropout_ratio ? 0 : scale * x[i]',
        'dropout')
    self.kernel(y, x[0], self.rand, self.dropout_ratio, self.scale)
    return y,
def backward_gpu(self, x_orig, gy):
    # TODO(beam2d): Support backprop on inference mode
    assert self.use_batch_mean and not self.is_finetune

    ldim, cdim, rdim = self._internal_shape(x_orig[0])
    x = x_orig[0].reshape(ldim, cdim, rdim)
    gy = gy[0].reshape(ldim, cdim, rdim)
    m = ldim * rdim

    mean, sqmean = _cusum_axis02(x, mean=True)
    stdinv = sqmean  # reuse buffer
    cuda.elementwise(
        'float* stdinv, const float* mean, float eps',
        'stdinv[i] = rsqrtf(stdinv[i] - mean[i] * mean[i] + eps)',
        'bn_stdinv')(stdinv, mean, self.eps)

    x_hat = cuda.empty_like(x)
    gx = cuda.empty_like(x)
    _kernel_with_I(
        '''
           float* x_hat, const float* x, const float* mean,
           const float* stdinv
        ''',
        'x_hat[i] = (x[i] - mean[I]) * stdinv[I]',
        'bn_x_hat')(x_hat, x, mean, stdinv, cdim, rdim)
    mean = None

    gbeta, ggamma = _cusum_axis02(gy, x_hat, expr2='x[I] * y[I]')
    cuda.elementwise(
        '''
           float* self_ggamma, const float* ggamma,
           float* self_gbeta, const float* gbeta
        ''', '''
           self_ggamma[i] += ggamma[i];
           self_gbeta[i] += gbeta[i];
        ''', 'bn_add')(self.ggamma, ggamma, self.gbeta, gbeta)

    _kernel_with_I(
        '''
           float* gx, const float* x_hat, const float* gy,
           const float* stdinv, const float* ggamma, const float* gbeta,
           const float* gamma, float inv_m
        ''', '''
           gx[i] = gamma[I] * stdinv[I] * (
               gy[i] - (x_hat[i] * ggamma[I] + gbeta[I]) * inv_m)
        ''', 'bn_bwd')(gx, x_hat, gy, stdinv, ggamma, gbeta, self.gamma,
                       1. / m, cdim, rdim)

    return gx.reshape(x_orig[0].shape),
def backward_gpu(self, x_orig, gy):
    # TODO(beam2d): Support backprop on inference mode
    assert self.use_batch_mean and not self.is_finetune

    ldim, cdim, rdim = self._internal_shape(x_orig[0])
    x = x_orig[0].reshape(ldim, cdim, rdim)
    gy = gy[0].reshape(ldim, cdim, rdim)
    m = ldim * rdim

    mean, sqmean = _cusum_axis02(x, mean=True)
    stdinv = sqmean  # reuse buffer
    cuda.elementwise(
        'float* stdinv, const float* mean, float eps',
        'stdinv[i] = rsqrtf(stdinv[i] - mean[i] * mean[i] + eps)',
        'bn_stdinv')(stdinv, mean, self.eps)

    x_hat = cuda.empty_like(x)
    gx = cuda.empty_like(x)
    _kernel_with_I(
        '''
           float* x_hat, const float* x, const float* mean,
           const float* stdinv
        ''',
        'x_hat[i] = (x[i] - mean[I]) * stdinv[I]',
        'bn_x_hat')(x_hat, x, mean, stdinv, cdim, rdim)
    mean = None

    gbeta, ggamma = _cusum_axis02(gy, x_hat, expr2='x[I] * y[I]')
    cuda.elementwise(
        '''
           float* self_ggamma, const float* ggamma,
           float* self_gbeta, const float* gbeta
        ''', '''
           self_ggamma[i] += ggamma[i];
           self_gbeta[i] += gbeta[i];
        ''', 'bn_add')(self.ggamma, ggamma, self.gbeta, gbeta)

    _kernel_with_I(
        '''
           float* gx, const float* x_hat, const float* gy,
           const float* stdinv, const float* ggamma, const float* gbeta,
           const float* gamma, float inv_m
        ''', '''
           gx[i] = gamma[I] * stdinv[I] * (
               gy[i] - (x_hat[i] * ggamma[I] + gbeta[I]) * inv_m)
        ''', 'bn_bwd')(gx, x_hat, gy, stdinv, ggamma, gbeta, self.gamma,
                       1. / m, cdim, rdim)

    return gx.reshape(x_orig[0].shape),
def forward_gpu(self, x_orig):
    ldim, cdim, rdim = self._internal_shape(x_orig[0])
    x = x_orig[0].reshape(ldim, cdim, rdim)

    if self.use_batch_mean:
        mean = _cumean_axis02(x)
        sqmean = _cumean_axis02(x * x)
        var = sqmean  # reuse buffer
        cuda.elementwise(
            'float* var, const float* mean, const float* sqmean, float eps',
            'var[i] = sqmean[i] - mean[i] * mean[i] + eps',
            'bn_var')(var, mean, sqmean, self.eps)
    else:
        mean = self.avg_mean
        var = self.avg_var

    coeff = cuda.empty_like(var)
    bias = cuda.empty_like(var)
    y = cuda.empty_like(x_orig[0])
    cuda.elementwise(
        '''float* coeff, float* bias, const float* mean, const float* var,
           const float* gamma, const float* beta''',
        '''coeff[i] = rsqrtf(var[i]) * gamma[i];
           bias[i]  = beta[i] - coeff[i] * mean[i];''',
        'bn_fwd_prep')(coeff, bias, mean, var, self.gamma, self.beta)
    _kernel_with_I(
        'float* y, const float* x, const float* coeff, const float* bias',
        'y[i] = coeff[I] * x[i] + bias[I]',
        'bn_fwd')(y, x, coeff, bias, cdim, rdim)

    # Compute exponential moving average
    if self.use_batch_mean:
        if self.is_finetune:
            self.N[0] += 1
            decay = 1. / self.N[0]
        else:
            decay = self.decay

        m = ldim * rdim
        adjust = m / max(m - 1., 1.)  # unbiased estimation
        kern = cuda.elementwise(
            'float* mean, const float* x, float decay, float adjust',
            'mean[i] = decay * mean[i] + (1 - decay) * adjust * x[i]',
            'bn_moving_avg')
        kern(self.avg_mean, mean, decay, adjust)
        kern(self.avg_var, var, decay, adjust)

    return y,
def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0,
             wscale=1, bias=0, nobias=False, use_cudnn=True,
             initialW=None, initial_bias=None):
    ksize = _pair(ksize)
    stride = _pair(stride)
    pad = _pair(pad)

    self.kh, self.kw = ksize
    self.sy, self.sx = stride
    self.ph, self.pw = pad

    self.in_channels = in_channels
    self.out_channels = out_channels

    self.W = None
    self.gW = None
    self.b = None
    self.gb = None

    if initialW is not None:
        assert initialW.shape == \
            (out_channels, in_channels, self.kh, self.kw)
        self.W = initialW
    else:
        self.W = numpy.random.normal(
            0, wscale * math.sqrt(1. / (self.kh * self.kw * in_channels)),
            (out_channels, in_channels, self.kh, self.kw)
        ).astype(numpy.float32)
    if isinstance(self.W, cuda.GPUArray):
        self.gW = cuda.empty_like(self.W)
    else:
        self.gW = numpy.empty_like(self.W)

    if initial_bias is not None:
        assert initial_bias.shape == (out_channels,)
        self.b = initial_bias
    elif not nobias:
        self.b = numpy.repeat(numpy.float32(bias), out_channels)

    if self.b is not None:
        if isinstance(self.b, cuda.GPUArray):
            self.gb = cuda.empty_like(self.b)
        else:
            self.gb = numpy.empty_like(self.b)

    self.use_cudnn = use_cudnn
    if cudnn.enabled and use_cudnn:
        # chance to choose implicit-precomp-gemm algorithm
        self.max_workspace_size = in_channels * self.kh * self.kw * 4
def backward_gpu(self, x, gy):
    summand = cuda.empty_like(x[0])
    cuda.elementwise(
        '''float* summand, const float* scale, const float* y,
           const float* gy''',
        'summand[i] = y[i] * gy[i] / scale[i]',
        'lrn_bwd_summand')(summand, self.scale, self.y, gy[0])

    gx = cuda.empty_like(x[0])
    _cu_conv_sum(gx, summand, self.n)

    cuda.elementwise(
        '''float* gx, const float* x, const float* gy, const float* scale,
           float beta, float coeff''',
        'gx[i] = __powf(scale[i], -beta) * gy[i] - coeff * x[i] * gx[i]',
        'lrn_bwd')(gx, x[0], gy[0], self.scale, self.beta,
                   2 * self.alpha * self.beta)
    return gx,
def forward_gpu(self, x):
    n, c, y_h, y_w = x[0].shape
    gx = cuda.empty_like(
        numpy.ones((n, c, self.h, self.w)).astype(numpy.float32))
    cuda.elementwise(
        'raw T gy, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw',
        'T gx',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;
           int out_y_0 = max(0, (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y + sy) / sy);
           int out_x_0 = max(0, (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int offset = out_x + out_w * (out_y + out_h * c0);
               val += gy[offset];
             }
           }
           gx = val;
        ''', 'max_pool_bwd')(x[0].reduced_view(), self.h, self.w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw, gx)
    return gx,
def forward_gpu(self, x):
    self._check_shape(x[0])
    cdim = self.W.size
    rdim = x[0].size // (x[0].shape[0] * cdim)
    y = cuda.empty_like(x[0])
    _fwd_kern()(y, x[0], x[0], self.W, cdim, rdim)
    return y,
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, const float z',
        'gx[i] = ((x[i] > 0) and (x[i] < z)) ? gy[i] : 0',
        'clipped_relu_bwd')(gx, x[0], gy[0], self.cap)
    return gx,
def backward_gpu(self, x, gy):
    gx0 = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx0, const float* x0, const float* gy',
        'gx0[i] = ((x0[i] > 0) - (x0[i] < 0)) * gy[i]',
        'abs_bwd')(gx0, x[0], gy[0])
    return gx0,
def forward_gpu(self, inputs):
    self.y = cuda.empty_like(inputs[0])
    cuda.elementwise(
        '''
           float* r, const int ngauss, const float* w,
           const float* m1, const float* m2,
           const float* s1, const float* s2, const float* c,
           const float* x1, const float* x2
        ''', '''
           const int j = i / ngauss;
           float z1 = (x1[j] - m1[i]) / s1[i];
           float z2 = (x2[j] - m2[i]) / s2[i];
           float z3;
           z1 = pow(z1 - c[i] * z2, 2.0f);
           z2 = 1.0f - pow(c[i], 2.0f);
           z3 = 2.0f * 3.141592654f * s1[i] * s2[i] * sqrt(z2);
           r[i] = w[i] * exp(- z1 / (2.0f * z2)) / z3;
        ''', 'gaussian_mixture_2d_fwd'
    )(self.y, self.y.shape[1], *inputs)
    return self.y,
def backward_gpu(self, inputs, grad_outputs):
    gx = cuda.empty_like(inputs[0])
    cuda.elementwise(
        'float* y, const float* b, const int n_channel',
        'y[i] = b[i / n_channel]',
        'sum_axis_bwd')(gx, grad_outputs[0], gx.shape[1])
    return gx,
def backward_gpu(self, x, gy):
    if cuda.cudnn_enabled and self.use_cudnn:
        return super(AveragePooling2D, self).backward_gpu(x, gy)

    n, c, h, w = x[0].shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.empty_like(x[0])
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        'raw T gy, int32 h, int32 w,'
        'int32 out_h, int32 out_w, int32 kh, int32 kw,'
        'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
        'T gx',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;
           int out_y_0 = max(0, (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y + sy) / sy);
           int out_x_0 = max(0, (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x + sx) / sx);
           int hc0 = out_h * c0;

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               val += gy[out_x + out_w * (out_y + hc0)];
             }
           }
           gx = val * coeff;
        ''', 'avg_pool_bwd')(gy[0].reduced_view(), h, w, y_h, y_w,
                             self.kh, self.kw, self.sy, self.sx,
                             self.ph, self.pw, coeff, gx)
    return gx,
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        return super(AveragePooling2D, self).backward_gpu(x, gy)

    n, c, h, w = x[0].shape
    y_h, y_w = gy[0].shape[2:]
    gx = cuda.empty_like(x[0])
    coeff = 1. / (self.kh * self.kw)
    cuda.elementwise(
        '''
           float* gx, const float* gy, int h, int w,
           int out_h, int out_w, int kh, int kw,
           int sy, int sx, int ph, int pw, float coeff
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;
           int out_y_0 = max(0, (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y + sy) / sy);
           int out_x_0 = max(0, (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x + sx) / sx);
           int hc0 = out_h * c0;

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               val += gy[out_x + out_w * (out_y + hc0)];
             }
           }
           gx[i] = val * coeff;
        ''', 'avg_pool_bwd')(gx, gy[0], h, w, y_h, y_w, self.kh, self.kw,
                             self.sy, self.sx, self.ph, self.pw, coeff)
    return gx,
def backward_gpu(self, x, gy):
    summand = cuda.empty_like(x[0])
    cuda.elementwise(
        '''float* summand, const float* scale, const float* y,
           const float* gy''',
        'summand[i] = y[i] * gy[i] / scale[i]',
        'lrn_bwd_summand')(summand, self.scale, self.y, gy[0])

    gx = cuda.empty_like(x[0])
    _cu_conv_sum(gx, summand, self.n)

    cuda.elementwise(
        '''float* gx, const float* x, const float* gy, const float* scale,
           float beta, float coeff''',
        'gx[i] = powf(scale[i], -beta) * gy[i] - coeff * x[i] * gx[i]',
        'lrn_bwd')(gx, x[0], gy[0], self.scale, self.beta,
                   2 * self.alpha * self.beta)
    return gx,
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, float value',
        'gx[i] = -value * gy[i] / (x[i] * x[i])',
        'div_from_const_bwd')(gx, x[0], gy[0], self.value)
    return gx,
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, float value',
        'gx[i] = value * __powf(x[i], value - 1) * gy[i]',
        'pow_var_const_bwd')(gx, x[0], gy[0], self.value)
    return gx,
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(self.y),
            desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
            cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]
        c = gx.shape[1]
        sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
               const float* row = ydy + i * c;
               float sum = 0;
               for (int j = 0; j < c; ++j) {
                 sum += row[j];
               }
               sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
def forward_gpu(self, inputs):
    c_prev, x = inputs
    lsize = c_prev.shape[0] * c_prev.shape[1]
    rsize = c_prev.size // lsize

    self.c = cuda.empty_like(c_prev)
    h = cuda.empty_like(c_prev)
    cuda.elementwise(
        '''float* c, float* h, const float* c_prev, const float* x,
           int lsize, int rsize''',
        '''COMMON_ROUTINE;
           c[i] = aa * ai + af * c_prev[i];
           h[i] = ao * tanhf(c[i]);''',
        'lstm_fwd', preamble=_preamble)(self.c, h, c_prev, x, lsize, rsize)
    return self.c, h
def backward_gpu(self, x, gy):
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gx = cuda.empty_like(x[0])
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxBackward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(self.y),
            desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
            cudnn.get_ptr(gx))
    else:
        gx = self.y * gy[0]
        c = gx.shape[1]
        sum_ydy = cuda.empty((gx.shape[0], ), dtype=numpy.float32)
        cuda.elementwise(
            'float* sum_ydy, const float* ydy, int c',
            '''
               const float* row = ydy + i * c;
               float sum = 0;
               for (int j = 0; j < c; ++j) {
                 sum += row[j];
               }
               sum_ydy[i] = sum;
            ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
        cuda.elementwise(
            'float* gx, const float* y, const float* sum_ydy, int c',
            'gx[i] -= y[i] * sum_ydy[i / c]',
            'softmax_bwd_diff')(gx, self.y, sum_ydy, c)
    return gx,
def backward_gpu(self, x, gy):
    logv = math.log(self.value)
    gx = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* gx, const float* x, const float* gy, float value, float logv',
        'gx[i] = logv * __powf(value, x[i]) * gy[i]',
        'pow_const_var_bwd')(gx, x[0], gy[0], self.value, logv)
    return gx,
def backward_gpu(self, inputs, grad_outputs): c_prev, x = inputs gc, gh = grad_outputs lsize = c_prev.shape[0] * c_prev.shape[1] rsize = c_prev.size // lsize # Odd rule to determine whether the gradient is given or not. if gc is None: gc = self.c if gh is None: gh = self.c gc_prev = cuda.empty_like(c_prev) gx = cuda.empty_like(x) cuda.elementwise( """ float* gc_prev, float* gx, const float* c_prev, const float* x, const float* c, const float* gc, const float* gh, int lsize, int rsize """, """ COMMON_ROUTINE; float* gx_i = gx + I * 4 * rsize; float& ga = gx_i[ J]; float& gi = gx_i[ rsize + J]; float& gf = gx_i[2*rsize + J]; float& go = gx_i[3*rsize + J]; float co = tanhf(c[i]); // Odd rule: if gh == c [gc == c] then gh [gc] is not given, // since we cannot pass null pointer to the kernel through // PyCUDA. float gc1 = (gh == c ? 0 : gh[i] * ao * grad_tanh(co)) + (gc == c ? 0 : gc[i]); go = gh == c ? 0 : gh[i] * co * grad_sigmoid(ao); gc_prev[i] = gc1 * af; ga = gc1 * ai * grad_tanh(aa); gi = gc1 * aa * grad_sigmoid(ai); gf = gc1 * c_prev[i] * grad_sigmoid(af); """, "lstm_bwd", preamble=_preamble, )(gc_prev, gx, c_prev, x, self.c, gc, gh, lsize, rsize) return gc_prev, gx
def backward_gpu(self, x, gy): gx = cuda.empty_like(x[0]) cuda.elementwise( "float* gx, const float* x, const float* gy, float value", "gx[i] = value * __powf(x[i], value - 1) * gy[i]", "pow_var_const_bwd", )(gx, x[0], gy[0], self.value) return (gx,)
def backward_gpu(self, x, gy): gx = cuda.empty_like(x[0]) cuda.elementwise( "float* gx, const float* x, const float* gy, float value", "gx[i] = -value * gy[i] / (x[i] * x[i])", "div_from_const_bwd", )(gx, x[0], gy[0], self.value) return (gx,)
def backward_gpu(self, inputs, grad_outputs):
    c_prev, x = inputs
    gc, gh = grad_outputs

    lsize = c_prev.shape[0] * c_prev.shape[1]
    rsize = c_prev.size // lsize

    # Odd rule to determine whether the gradient is given or not.
    if gc is None:
        gc = self.c
    if gh is None:
        gh = self.c

    gc_prev = cuda.empty_like(c_prev)
    gx = cuda.empty_like(x)
    cuda.elementwise(
        '''
           float* gc_prev, float* gx, const float* c_prev, const float* x,
           const float* c, const float* gc, const float* gh,
           int lsize, int rsize
        ''', '''
           COMMON_ROUTINE;
           float* gx_i = gx + I * 4 * rsize;
           float& ga = gx_i[          J];
           float& gi = gx_i[  rsize + J];
           float& gf = gx_i[2*rsize + J];
           float& go = gx_i[3*rsize + J];

           float co = tanhf(c[i]);

           // Odd rule: if gh == c [gc == c] then gh [gc] is not given,
           // since we cannot pass null pointer to the kernel through
           // PyCUDA.
           float gc1 = (gh == c ? 0 : gh[i] * ao * grad_tanh(co))
                       + (gc == c ? 0 : gc[i]);
           go = gh == c ? 0 : gh[i] * co * grad_sigmoid(ao);

           gc_prev[i] = gc1 * af;
           ga = gc1 * ai * grad_tanh(aa);
           gi = gc1 * aa * grad_sigmoid(ai);
           gf = gc1 * c_prev[i] * grad_sigmoid(af);
        ''', 'lstm_bwd', preamble=_preamble)(
            gc_prev, gx, c_prev, x, self.c, gc, gh, lsize, rsize)
    return gc_prev, gx
def backward_gpu(self, x, gy):
    _x = _as_mat(x[0])
    gx = cuda.empty_like(_x)

    with cuda.using_cumisc():
        cuda.culinalg.add_dot(gy[0], _x, self.gW, transa='T')
        if self.gb is not None:
            self.gb += cuda.cumisc.sum(gy[0], 0)
        cuda.culinalg.dot(gy[0], self.W, out=gx)

    return gx.reshape(x[0].shape),
def backward_gpu(self, inputs, grad_outputs):
    t, gloss = inputs[1], grad_outputs[0]
    gx = cuda.empty_like(self.y)
    coeff = gloss / t.size
    # Gradient of softmax cross entropy: y - onehot(t), scaled by the
    # incoming loss gradient averaged over the batch.
    cuda.elementwise(
        '''float* gx, const float* y, const int* t, const float* coeff,
           int n_channel''',
        'gx[i] = *coeff * (y[i] - ((i % n_channel) == t[i / n_channel]))',
        'softmax_crossent_bwd')(gx, self.y, t, coeff, self.y.shape[1])
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    t, gloss = inputs[1], grad_outputs[0]
    gx = cuda.empty_like(self.y)
    coeff = gloss / t.shape[0]
    cuda.elementwise(
        'float* gx, const float* y, const int* t, const float* coeff',
        'gx[i] = *coeff * (y[i] - t[i])',
        'sigmoid_crossent_bwd')(gx, self.y, t, coeff)
    return gx, None
def forward_gpu(self, x):
    self.y = cuda.cupy.square(x[0])  # temporary
    self.scale = cuda.empty_like(self.y)
    _cu_conv_sum(self.scale, self.y, self.n)
    cuda.elementwise(
        'T x, T k, T alpha, T beta',
        'T y, T scale',
        '''scale = k + alpha * scale;
           y = x * pow(scale, -beta);''',
        'lrn_fwd')(x[0], self.k, self.alpha, self.beta,
                   self.y, self.scale)
    return self.y,
def forward_gpu(self, x):
    self.y = x[0] * x[0]  # temporary
    self.scale = cuda.empty_like(self.y)
    _cu_conv_sum(self.scale, self.y, self.n)
    cuda.elementwise(
        '''float* y, float* scale, const float* x,
           float k, float alpha, float beta''',
        '''scale[i] = k + alpha * scale[i];
           y[i] = x[i] * powf(scale[i], -beta);''',
        'lrn_fwd')(self.y, self.scale, x[0], self.k, self.alpha, self.beta)
    return self.y,
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if isinstance(self.value, Number):
        cuda.elementwise(
            'float* y, const float* x, const float value',
            'y[i] = powf(value, x[i])',
            'pow_const_var_fwd')(y, x[0], self.value)
    else:
        cuda.elementwise(
            'float* y, const float* x, const float* value',
            'y[i] = powf(value[i], x[i])',
            'pow_const_var_fwd')(y, x[0], self.value)
    return y,
def forward_gpu(self, x):
    self.y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnActivationForward(
            handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(self.y))
    else:
        cuda.elementwise(
            'float* y, const float* x', 'y[i] = tanhf(x[i])',
            'tanh_fwd')(self.y, x[0])
    return self.y,
def backward_gpu(self, x, gy):
    summand = cuda.elementwise(
        'T scale, T y, T gy', 'T summand',
        'summand = y * gy / scale',
        'lrn_bwd_summand')(self.scale, self.y, gy[0])
    gx = cuda.empty_like(x[0])
    _cu_conv_sum(gx, summand, self.n)
    cuda.elementwise(
        'T x, T gy, T scale, T beta, T coeff', 'T gx',
        'gx = pow(scale, -beta) * gy - coeff * x * gx',
        'lrn_bwd')(x[0], gy[0], self.scale, self.beta,
                   2 * self.alpha * self.beta, gx)
    return gx,
def backward_gpu(self, xs, gy):
    gxs = tuple(cuda.empty_like(x) for x in xs)

    # Split gy back along the concatenation axis: copy each input's slice
    # of gy into the corresponding gradient buffer.
    coffset = 0
    kernel = cuda.elementwise(
        _args, 'COPY(x[i] = y[idx])', 'concat_bwd', preamble=_preamble)
    for gx in gxs:
        cdimx = gx.shape[self.axis]
        kernel(gx, gy[0], cdimx, self.cdimy, self.rdim, coffset)
        coffset += cdimx

    return gxs
def backward_gpu(self, x, gy):
    gx = cuda.empty_like(x[0])
    if isinstance(self.value, Number):
        cuda.elementwise(
            'float* gx, const float* x, const float* gy, const float value',
            'gx[i] = value * __powf(x[i], value - 1) * gy[i]',
            'pow_var_const_bwd')(gx, x[0], gy[0], self.value)
    else:
        cuda.elementwise(
            'float* gx, const float* x, const float* gy, const float* value',
            'gx[i] = value[i] * __powf(x[i], value[i] - 1) * gy[i]',
            'pow_var_const_bwd')(gx, x[0], gy[0], self.value)
    return gx,
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        desc = cudnn.create_tensor_descriptor(_as4darray(x[0]))
        libcudnn.activationForward(
            handle, _mode, ctypes.c_float(1), desc.value, x[0].data.ptr,
            ctypes.c_float(0), desc.value, y.data.ptr)
        self.y = y
    else:
        y = cuda.cupy.maximum(x[0].dtype.type(0), x[0])
    return y,