def forward_gpu(self, inputs):
    x, targets = inputs
    N = x.shape[0]

    # Linear function: z = x W^T (+ b)
    z = cuda.empty((N, self.no_labels), dtype=np.float32)
    cuk.dot(x, self.W, out=z, transb='t')
    if not self.nobias:
        cuk.addVec2Mat(z, self.b)

    # Softmax is computed in place: self.probs aliases z.
    self.probs = z
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(z, 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(z),
            0, desc.value, cudnn.get_ptr(self.probs))
    else:
        cuk.softmax(z, self.probs)

    if self.return_probs:
        return self.probs,

    if self.compute_loss:
        # Gather the clipped log-probability of each row's correct label.
        correct_probs = cuda.empty((N,), dtype=np.float32)
        cuk.getByIndex_LogAndClip(
            self.probs, targets, out=correct_probs)
        loss = -cuda.cumisc.sum(correct_probs, keepdims=True) / N
    else:
        loss = np.atleast_2d(np.array(np.nan, dtype=np.float32))
    return loss,
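# The helpers above (cuk.dot, cuk.softmax, cuk.getByIndex_LogAndClip) are
# custom GPU kernels. As a minimal sketch of what this forward pass computes,
# here is a hypothetical NumPy reference (forward_cpu_reference, eps, and the
# exact clipping rule are assumptions, not part of the original code): an
# affine map, a row-wise softmax, and the mean negative log-likelihood of
# the target labels.
import numpy as np

def forward_cpu_reference(x, targets, W, b, eps=1e-8):
    z = x.dot(W.T) + b                        # linear function (transb='t')
    z = z - z.max(axis=1, keepdims=True)      # stabilize before exponentiation
    probs = np.exp(z)
    probs /= probs.sum(axis=1, keepdims=True)
    # Pick each row's probability of its correct label; clip before the log.
    correct = np.clip(probs[np.arange(len(targets)), targets], eps, 1.0)
    return -np.log(correct).mean()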
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], 1, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
        c = x[0].shape[1]
        cuda.elementwise(
            'float* maxes, const float* x, int c',
            '''
               const float* row = x + i * c;
               float maxval = row[0];
               for (int j = 1; j < c; ++j) {
                   if (maxval < row[j]) {
                       maxval = row[j];
                   }
               }
               maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c)
        cuda.elementwise(
            'float* y, const float* x, const float* maxes, int c',
            'y[i] = __expf(x[i] - maxes[i / c])',
            'softmax_exp')(y, x[0], maxes, c)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int c',
            '''
               const float* row = y + i * c;
               float sum = 0;
               for (int j = 0; j < c; ++j) {
                   sum += row[j];
               }
               coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c)
        cuda.elementwise(
            'float* y, const float* coeff, int c',
            'y[i] *= coeff[i / c]',
            'softmax_rowmul')(y, coeff, c)
        self.y = y
    return y,
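# The four elementwise kernels above implement the standard numerically
# stable softmax: subtract each row's maximum, exponentiate, and normalize
# by the row sum. A minimal NumPy sketch of the same pipeline
# (softmax_rows_reference is a hypothetical name, not part of the original
# module):
import numpy as np

def softmax_rows_reference(x):
    maxes = x.max(axis=1)                # softmax_rowmax
    y = np.exp(x - maxes[:, None])       # softmax_exp
    coeff = 1.0 / y.sum(axis=1)          # softmax_invrowsum (GPU reuses maxes)
    return y * coeff[:, None]            # softmax_rowmul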
def forward_gpu(self, x):
    y = cuda.empty_like(x[0])
    n_unit = int(numpy.prod(x[0].shape[2:]))
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        desc = cudnn.get_tensor_desc(x[0], n_unit, 1)
        libcudnn.cudnnSoftmaxForward(
            handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
            0, desc.value, cudnn.get_ptr(y))
        self.y = y
    else:
        maxes_shape = (x[0].shape[0],) + x[0].shape[2:]
        maxes = cuda.empty(maxes_shape, dtype=numpy.float32)
        c = x[0].shape[1]
        cuda.elementwise(
            'float* maxes, const float* x, int n_channel, int n_unit',
            '''
               const int n = i / n_unit;
               const int m = i % n_unit;
               const float* row = x + n * n_channel * n_unit + m;
               float maxval = row[0];
               for (int c = 1; c < n_channel; ++c) {
                   const int v = c * n_unit;
                   if (maxval < row[v]) {
                       maxval = row[v];
                   }
               }
               maxes[i] = maxval;
            ''', 'softmax_rowmax')(maxes, x[0], c, n_unit)
        cuda.elementwise(
            '''
               float* y, const float* x, const float* maxes, int n_channel,
               int n_unit
            ''',
            '''
               const int n = i / (n_channel * n_unit);
               const int m = i % n_unit;
               y[i] = __expf(x[i] - maxes[n * n_unit + m]);
            ''',
            'softmax_exp')(y, x[0], maxes, c, n_unit)
        coeff = maxes  # reuse memory
        cuda.elementwise(
            'float* coeff, const float* y, int n_channel, int n_unit',
            '''
               const int n = i / n_unit;
               const int m = i % n_unit;
               const float* row = y + n * n_channel * n_unit + m;
               float sum = 0;
               for (int c = 0; c < n_channel; ++c) {
                   sum += row[c * n_unit];
               }
               coeff[i] = 1 / sum;
            ''', 'softmax_invrowsum')(coeff, y, c, n_unit)
        cuda.elementwise(
            'float* y, const float* coeff, int n_channel, int n_unit',
            '''
               const int n = i / (n_channel * n_unit);
               const int m = i % n_unit;
               y[i] *= coeff[n * n_unit + m];
            ''', 'softmax_rowmul')(y, coeff, c, n_unit)
        self.y = y
    return y,
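# This variant generalizes the 2-d softmax above to N-d inputs of shape
# (batch, channels, d_1, ..., d_k): the reduction still runs over the channel
# axis, and each GPU thread i decodes (n, m) = (i / n_unit, i % n_unit) to
# walk one channel column with stride n_unit. A minimal NumPy sketch
# (softmax_channels_reference is a hypothetical name):
import numpy as np

def softmax_channels_reference(x):
    maxes = x.max(axis=1, keepdims=True)     # per-(n, m) channel maximum
    y = np.exp(x - maxes)                    # stabilized exponentials
    return y / y.sum(axis=1, keepdims=True)  # normalize over channels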