def backward(self, out_grad, in_data, out_data, in_grad):
    l = in_data[1]    # integer class labels, shape (batch,)
    y = out_data[0]   # softmax output saved from the forward pass
    dx = in_grad[0]   # gradient w.r.t. the input logits
    if self.bwd_kernel is None:
        # Compile the gradient kernel on first use: one block per row,
        # one thread per class; dx = y - one_hot(label).
        self.bwd_kernel = mx.rtc('softmax_grad', [('y', y), ('l', l)], [('dx', dx)], """
            int i = blockIdx.x;
            int j = threadIdx.x;
            int k = static_cast<int>(l[i]);
            if (j == k) {
                dx[i*dx_dims[1]+j] = y[i*dx_dims[1]+j] - 1.0f;
            } else {
                dx[i*dx_dims[1]+j] = y[i*dx_dims[1]+j];
            }
            """)
    # Launch with grid = (batch, 1, 1) and block = (num_classes, 1, 1).
    self.bwd_kernel.push([y, l], [dx], (y.shape[0], 1, 1), (y.shape[1], 1, 1))
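# A minimal NumPy reference for what the 'softmax_grad' kernel above computes:
# dx[i, j] = y[i, j] - 1 when j equals the label of row i, else y[i, j], i.e.
# the cross-entropy gradient w.r.t. the logits. The helper name is illustrative
# and not part of the original example; it is only meant for checking results
# on small inputs.
import numpy as np

def softmax_grad_reference(y, label):
    # y: (batch, num_classes) softmax output; label: (batch,) class indices
    dx = y.copy()
    dx[np.arange(y.shape[0]), label.astype(int)] -= 1.0
    return dx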
def forward(self, in_data, out_data):
    nn, source = in_data   # nn: nearest-neighbour patch indices; source: patches (N, C, h, w)
    target = out_data[0]   # accumulation buffer the patches are written into
    if self.fwd_kernel is None:
        # Compile on first use: each block handles one patch offset
        # (blockIdx.x, blockIdx.y), each thread one channel (threadIdx.x);
        # the selected source patch is accumulated into target with atomicAdd.
        self.fwd_kernel = mx.rtc('assignpatch', [('nn', nn), ('source', source)], [('target', target)], """
            int target_idx = threadIdx.x*target_dims[3]*target_dims[2]+blockIdx.x*target_dims[3]+blockIdx.y;
            int source_idx = nn[blockIdx.x*nn_dims[1]+blockIdx.y]*source_dims[1]*source_dims[2]*source_dims[3]
                             + threadIdx.x*source_dims[2]*source_dims[3];
            for (int i = 0; i < source_dims[2]; i++){
                for (int j = 0; j < source_dims[3]; j++){
                    atomicAdd(target+target_idx, source[source_idx]);
                    target_idx++;
                    source_idx++;
                }
                target_idx += target_dims[3]-source_dims[3];
            }
            """)
    # Grid covers every valid patch offset in the target; block covers the channels.
    self.fwd_kernel.push([nn, source], [target],
                         (target.shape[2] - source.shape[2] + 1,
                          target.shape[3] - source.shape[3] + 1, 1),
                         (source.shape[1], 1, 1))
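# A minimal NumPy reference for the 'assignpatch' kernel above, useful for
# checking small inputs. The helper name is mine, and it assumes target holds
# a single image (the kernel's target index has no batch term); neither detail
# is stated in the original example.
import numpy as np

def assignpatch_reference(nn, source, target):
    # nn:     (H_out, W_out) indices into source's batch axis
    # source: (N, C, h, w) patches
    # target: (1, C, H, W) accumulation buffer, modified in place
    _, _, h, w = source.shape
    H_out, W_out = nn.shape
    for r in range(H_out):        # patch top-left row in target
        for c in range(W_out):    # patch top-left column in target
            k = int(nn[r, c])     # nearest-neighbour source patch for this offset
            target[0, :, r:r + h, c:c + w] += source[k]
    return target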
def forward(self, in_data, out_data):
    x = in_data[0]   # input logits, shape (batch, num_classes)
    y = out_data[0]  # softmax output
    if self.fwd_kernel is None:
        # Compile on first use: one thread per row computes a numerically
        # stable softmax (subtract the row maximum before exponentiating).
        self.fwd_kernel = mx.rtc('softmax', [('x', x)], [('y', y)], """
            int i = threadIdx.x + blockIdx.x*blockDim.x;
            float max_x = x[i*x_dims[1]];
            for (int j = 1; j < x_dims[1]; ++j) {
                if (max_x < x[i*x_dims[1]+j]) {
                    max_x = x[i*x_dims[1]+j];
                }
            }
            float sum = 0.0f;
            for (int j = 0; j < x_dims[1]; ++j) {
                sum += expf(x[i*x_dims[1]+j]-max_x);
            }
            for (int j = 0; j < x_dims[1]; ++j) {
                y[i*x_dims[1]+j] = expf(x[i*x_dims[1]+j]-max_x)/sum;
            }
            """)
    # A single block with one thread per row of x.
    self.fwd_kernel.push([x], [y], (1, 1, 1), (x.shape[0], 1, 1))
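# The softmax forward/backward methods above reference self.fwd_kernel and
# self.bwd_kernel without showing where they are initialised. Below is a
# sketch of the surrounding class, assuming the legacy mx.operator.NDArrayOp
# custom-operator interface; the __init__ and infer_shape details here are a
# reconstruction, not quoted from the original example.
import mxnet as mx

class NDArraySoftmax(mx.operator.NDArrayOp):
    def __init__(self):
        # need_top_grad=False: the op derives its gradient from the labels
        super(NDArraySoftmax, self).__init__(False)
        self.fwd_kernel = None   # compiled lazily on first forward()
        self.bwd_kernel = None   # compiled lazily on first backward()

    def list_arguments(self):
        return ['data', 'label']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shape):
        data_shape = in_shape[0]
        label_shape = (in_shape[0][0],)   # one integer label per row
        output_shape = in_shape[0]
        return [data_shape, label_shape], [output_shape]

    # forward() and backward() as defined above.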
# pylint: skip-file
import mxnet as mx
import numpy as np
from numpy.testing import assert_allclose

if __name__ == '__main__':
    x = mx.nd.zeros((10,), ctx=mx.gpu(0))
    x[:] = 1
    y = mx.nd.zeros((10,), ctx=mx.gpu(0))
    y[:] = 2
    # Smoke test for mx.rtc: stage x in shared memory, then write exp(5*x) to y.
    rtc = mx.rtc('abc', [('x', x)], [('y', y)], """
        __shared__ float s_rec[10];
        s_rec[threadIdx.x] = x[threadIdx.x];
        y[threadIdx.x] = expf(s_rec[threadIdx.x]*5.0);""")
    rtc.push([x], [y], (1, 1, 1), (10, 1, 1))
    assert_allclose(y.asnumpy(), np.exp(x.asnumpy() * 5.0))
# pylint: skip-file
import mxnet as mx
import numpy as np
from numpy.testing import assert_allclose

if __name__ == '__main__':
    x = mx.nd.zeros((10,), ctx=mx.gpu(0))
    x[:] = 1
    y = mx.nd.zeros((10,), ctx=mx.gpu(0))
    y[:] = 2
    # Same smoke test written with HIP-style thread-index identifiers
    # (hipThreadIdx_x in place of CUDA's threadIdx.x).
    rtc = mx.rtc('abc', [('x', x)], [('y', y)], """
        __shared__ float s_rec[10];
        s_rec[hipThreadIdx_x] = x[hipThreadIdx_x];
        y[hipThreadIdx_x] = expf(s_rec[hipThreadIdx_x]*5.0);""")
    rtc.push([x], [y], (1, 1, 1), (10, 1, 1))
    assert_allclose(y.asnumpy(), np.exp(x.asnumpy() * 5.0))
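# push() takes the input/output NDArrays followed by the grid and block
# dimensions as (x, y, z) triples, and each named array also exposes a
# <name>_dims array inside the kernel (as used throughout the snippets above).
# A small sketch illustrating that convention with one block per row and one
# thread per column; the kernel name, shapes, and scaling here are made up for
# illustration, not taken from the test file.
import mxnet as mx
import numpy as np
from numpy.testing import assert_allclose

a = mx.nd.ones((4, 8), ctx=mx.gpu(0))
b = mx.nd.zeros((4, 8), ctx=mx.gpu(0))
scale2 = mx.rtc('scale2', [('a', a)], [('b', b)], """
    // one block per row (blockIdx.x), one thread per column (threadIdx.x)
    int idx = blockIdx.x * a_dims[1] + threadIdx.x;
    b[idx] = 2.0f * a[idx];
    """)
scale2.push([a], [b], (a.shape[0], 1, 1), (a.shape[1], 1, 1))
assert_allclose(b.asnumpy(), 2.0 * a.asnumpy())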