def backward(self, top, propagate_down, bottom):
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        import scikits.cuda.linalg as linalg
        top_diff = top[0].diff_as_pycuda_gpuarray()
        ts = [self.t1_, self.t2_]
        for i in xrange(len(bottom)):
            if not propagate_down[i]:
                continue
            diff = bottom[i].diff_as_pycuda_gpuarray()
            data = bottom[(i + 1) % 2].data_as_pycuda_gpuarray()
            # The three conditions below are subtle: they pick the operand
            # order and transpose flags so that the gradient of
            # C = op1(A) . op2(B) is written in the right layout.
            swap = ts[i] ^ bool(i)
            t1 = ts[i]
            t2 = (not t1) ^ ts[(i + 1) % 2]
            for b in xrange(bottom[0].shape[0]):
                x = top_diff[b]
                y = data[b]
                t1_, t2_ = t1, t2
                if swap:
                    x, y = y, x
                    t1_, t2_ = t2_, t1_
                linalg.dot(x, y,
                           transa=blas_trans(t1_), transb=blas_trans(t2_),
                           handle=h, out=diff[b])
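# Hedged reference, not part of the layer: the swap/transpose bookkeeping above
# implements the usual matrix-product gradients. With C[b] = op1(A[b]) . op2(B[b]),
# the gradient w.r.t. op1(A[b]) is dC[b] . op2(B[b])^T and the gradient w.r.t.
# op2(B[b]) is op1(A[b])^T . dC[b], transposed back when op1/op2 transpose.
# A NumPy sketch of the same rule (the helper name is hypothetical):
def _batched_matmul_grads_reference(a, b, dc, t1, t2):
    import numpy as np
    da = np.empty_like(a)
    db = np.empty_like(b)
    for i in range(a.shape[0]):
        a_op = a[i].T if t1 else a[i]
        b_op = b[i].T if t2 else b[i]
        da_op = np.dot(dc[i], b_op.T)   # gradient w.r.t. op1(A[i])
        db_op = np.dot(a_op.T, dc[i])   # gradient w.r.t. op2(B[i])
        da[i] = da_op.T if t1 else da_op
        db[i] = db_op.T if t2 else db_op
    return da, db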
def forward(self, bottom, top):
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        import scikits.cuda.linalg as linalg
        mat1 = bottom[0].data_as_pycuda_gpuarray()
        mat2 = bottom[1].data_as_pycuda_gpuarray()
        mato = top[0].data_as_pycuda_gpuarray()
        for b in xrange(bottom[0].shape[0]):
            linalg.dot(mat1[b], mat2[b],
                       transa=blas_trans(self.t1_), transb=blas_trans(self.t2_),
                       handle=h, out=mato[b])
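# Hedged reference, not part of the layer: the loop above is a batched GEMM,
# mato[b] = op1(mat1[b]) . op2(mat2[b]), where op1/op2 optionally transpose
# according to self.t1_ / self.t2_. A NumPy sketch of the same contract
# (the helper name is hypothetical):
def _batched_matmul_reference(mat1, mat2, t1, t2):
    import numpy as np
    out = []
    for a, b in zip(mat1, mat2):
        a = a.T if t1 else a
        b = b.T if t2 else b
        out.append(np.dot(a, b))
    return np.array(out)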
def reshape(self, bottom, top):
    with pu.caffe_cuda_context():
        batch_size = bottom[0].shape[0]
        if self.batch_size_ != batch_size:
            self.batch_size_ = batch_size
            self.diff_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.diff2_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.mask_sum_ = gpuarray.zeros((batch_size, 1), dtype)
        dim = int(np.prod(bottom[0].shape[1:]))
        if self.dim_ != dim:
            self.dim_ = dim
            self.multipier_sum_ = gpuarray.zeros((dim, 1), dtype)
            self.multipier_sum_.fill(dtype(1.0))
        top[0].reshape()
def backward(self, top, propagate_down, bottom):
    r"""
    Compute @f$\frac{\partial {\cal L}}{\partial y_{bi}} =
    \frac{\partial {\cal L}}{\partial d_i}
    \frac{\partial d_i}{\partial y_{bi}}@f$, where
    @f$\frac{\partial {\cal L}}{\partial d_i} =
    \frac{2}{n}\left(d_i - \frac{\lambda}{n}\sum_j d_j\right)@f$.
    """
    with pu.caffe_cuda_context():
        pred = bottom[0].data_as_pycuda_gpuarray()
        label = bottom[1].data_as_pycuda_gpuarray()
        for i in xrange(len(bottom)):
            if propagate_down[i]:
                diff = bottom[i].diff_as_pycuda_gpuarray()
                # The gradients w.r.t. prediction and label differ only in sign.
                sgn = 1 if i == 0 else -1
                self.k_backward_(pred, label, self.diff_sum_,
                                 self.mask_sum_, sgn, top[0].diff, diff)
                if self.clip_gradient_ is not None:
                    self.k_clip_gradient(diff)
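# Hedged reference, not part of the layer: the CUDA kernel invoked through
# self.k_backward_ evaluates, per element, the gradient in the docstring above
# scaled by the loss weight and the batch size. A NumPy sketch of the gradient
# w.r.t. the prediction (negate for the label; the helper name is hypothetical):
def _scale_invariant_grad_reference(pred, label, lam, loss_weight=1.0):
    import numpy as np
    batch_size = pred.shape[0]
    d = (pred - label).reshape(batch_size, -1)
    n = d.shape[1]  # here the mask is all ones, so n equals the sample dimension
    grad = 2.0 * loss_weight / (n * batch_size) \
        * (d - lam / n * d.sum(axis=1, keepdims=True))
    return grad.reshape(pred.shape)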
def reshape(self, bottom, top):
    with pu.caffe_cuda_context():
        batch_size = bottom[0].shape[0]
        if self.batch_size_ != batch_size:
            self.batch_size_ = batch_size
            self.diff_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.diff2_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.mask_sum_ = gpuarray.zeros((batch_size, 1), dtype)
        dim = int(np.prod(bottom[0].shape[1:]))
        if self.dim_ != dim:
            self.dim_ = dim
            self.multipier_sum_ = gpuarray.zeros((dim, 1), dtype)
            self.multipier_sum_.fill(dtype(1.0))
        # if self.img_shape_ != bottom[0].shape:
        #     # Define the mask as all ones since no thresholding is needed.
        #     self.mask_ = gpuarray.zeros(bottom[0].shape, dtype)
        #     self.mask_.fill(dtype(1.0))
        #     self.img_shape_ = bottom[0].shape
        top[0].reshape()
def forward(self, bottom, top):
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        batch_size = bottom[0].shape[0]
        dim = bottom[0].count / bottom[0].shape[0]
        pred = bottom[0].data_as_pycuda_gpuarray()
        label = bottom[1].data_as_pycuda_gpuarray()
        # Use the diff blobs of bottom[0] and bottom[1] as temporary buffers.
        diff = bottom[0].diff_as_pycuda_gpuarray()
        diff2 = bottom[1].diff_as_pycuda_gpuarray()
        # Note: mask aliases diff; it is filled with ones only after diff has
        # been consumed by the reductions below.
        mask = bottom[0].diff_as_pycuda_gpuarray()
        # Compute the elementwise difference and its square.
        self.k_masked_diff_(diff, pred, label)
        self.k_squared_(diff, diff2)
        import scikits.cuda.linalg as linalg
        # This needs scikits.cuda 0.5.0a3 or later:
        #   (sudo) pip install "scikits.cuda>=0.5.0a3"
        linalg.dot(diff.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff_sum_)
        linalg.dot(diff2.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff2_sum_)
        mask.fill(dtype(1.0))
        linalg.dot(mask.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.mask_sum_)
        self.k_ensure_mask_sum_(self.mask_sum_)
        term1 = self.k_div_sum_(self.diff2_sum_, self.mask_sum_)
        term2 = self.k_div_squared_sum_(self.diff_sum_, self.mask_sum_)
        top[0].data[...] = (term1.get() - self.lambda_ * term2.get()) \
            / batch_size
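# Hedged reference, not part of the layer: with d = pred - label and n valid
# elements per sample (the mask is all ones here, so n == dim), the reductions
# above evaluate the scale-invariant error
#   E_b = (1/n) * sum_i d_i^2 - lambda * (1/n^2) * (sum_i d_i)^2
# and the layer outputs the mean of E_b over the batch. A NumPy sketch
# (the helper name is hypothetical):
def _scale_invariant_loss_reference(pred, label, lam):
    import numpy as np
    batch_size = pred.shape[0]
    d = (pred - label).reshape(batch_size, -1)
    n = d.shape[1]
    per_sample = (d ** 2).sum(axis=1) / n - lam * d.sum(axis=1) ** 2 / n ** 2
    return per_sample.mean()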
def setup(self, bottom, top):
    assert len(bottom) == 2
    assert len(top) == 1
    # Parameters
    param = eval(self.param_str)
    self.lambda_ = param['lambda']
    self.clip_gradient_ = param.get('clip_gradient', None)
    # Create CUDA kernels.
    with pu.caffe_cuda_context():
        self.k_masked_diff_ = ElementwiseKernel(
            "float *diff, float *pred, float *label",
            "diff[i] = (pred[i] - label[i])", 'masked_diff')
        self.k_squared_ = ElementwiseKernel(
            "float *diff, float *diff2",
            "diff2[i] = diff[i] * diff[i]", 'squared')
        self.k_ensure_mask_sum_ = ElementwiseKernel(
            "float *mask_sum",
            "mask_sum[i] = max(mask_sum[i], 1.0f)", 'ensure_mask_sum')
        if self.clip_gradient_ is not None:
            self.k_clip_gradient = ElementwiseKernel(
                "float *diff",
                "diff[i] = fmaxf(-{0}, fminf(diff[i], {0}))".format(
                    self.clip_gradient_),
                'clip_gradient')
        # These reductions could be computed faster with cublasSdot.
        self.k_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b",
            map_expr="d[i]", arguments="float *d")
        self.k_squred_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b",
            map_expr="d[i] * d[i]", arguments="float *d")
        self.k_div_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b",
            map_expr="d[i] / m[i]", arguments="float *d, float *m")
        self.k_div_squared_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b",
            map_expr="d[i] * d[i] / (m[i] * m[i])",
            arguments="float *d, float *m")
        func_backward = SourceModule(
            """
            #include <caffe/util/device_alternate.hpp>
            __global__ void backward(float *pred, float *label, float *diff_sum,
                                     float *mask_sum, int count, int stride,
                                     int sgn, int batch_size, float lambda,
                                     float loss_weight, float *diff) {
              CUDA_KERNEL_LOOP(i, count) {
                diff[i] = loss_weight * 2.0f * sgn
                    / mask_sum[i / stride] / batch_size
                    * ((pred[i] - label[i])
                       - lambda / mask_sum[i / stride] * diff_sum[i / stride]);
              }
            }
            """,
            include_dirs=pu.caffe_include_dirs).get_function("backward")
        func_backward.prepare("PPPPiiiiffP")

        def _func_backward(pred, label, ds, ms, sgn, loss_weight, diff):
            bg = pu.block_and_grid(pred.size)
            batch_size = pred.shape[0]
            count = pred.size
            stride = pred.size / pred.shape[0]
            func_backward.prepared_call(
                bg['grid'], bg['block'],
                pred.gpudata, label.gpudata, ds.gpudata, ms.gpudata,
                count, stride, sgn, batch_size,
                self.lambda_, loss_weight, diff.gpudata)

        self.k_backward_ = _func_backward
    self.batch_size_ = 0
    self.dim_ = 0
    # self.img_shape_ = None
    self.reshape(bottom, top)
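# Hedged usage note: param_str is eval()'d and must define 'lambda' (the weight
# of the scale-invariance term) and may define 'clip_gradient'. A prototxt
# stanza along these lines would work (module and layer names here are
# illustrative, not the repository's actual names):
#
#   python_param {
#     module: "python_layers"
#     layer: "ScaleInvariantLossLayer"
#     param_str: "{'lambda': 0.5, 'clip_gradient': 1.0}"
#   }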
def forward(self, bottom, top):
    with pu.caffe_cuda_context():
        self.k_log_(bottom[0].data_as_pycuda_gpuarray(),
                    top[0].data_as_pycuda_gpuarray(),
                    np.float32(self.offset_))
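# Hedged note: self.k_log_ and self.offset_ come from this layer's setup, which
# is not part of this excerpt. Assuming the kernel is an elementwise shifted
# logarithm, the forward pass corresponds to this NumPy sketch (the helper name
# is hypothetical):
def _log_forward_reference(bottom_data, offset):
    import numpy as np
    return np.log(bottom_data + offset)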