def backward_gpu(self, inputs, grads): f, z = inputs[:2] gh, = grads b, t, c = f.shape gz = cuda.cupy.zeros_like(gh) cuda.raw( ''' #define THREADS_PER_BLOCK 32 extern "C" __global__ void strnn_back( const CArray<float, 3> f, const CArray<float, 3> gh, CArray<float, 3> gz) { int index[3]; const int t_size = f.shape()[1]; index[0] = blockIdx.x; index[2] = blockIdx.y * THREADS_PER_BLOCK + threadIdx.x; index[1] = t_size - 1; float &gz_last = gz[index]; gz_last = gh[index]; float prev_gz = gz_last; for (int i = t_size - 1; i > 0; i--){ index[1] = i; const float ft = f[index]; index[1] = i - 1; const float ght = gh[index]; float &gzt = gz[index]; prev_gz = prev_gz * ft + ght; gzt = prev_gz; } }''', 'strnn_back')((b, c // THREADS_PER_BLOCK), (THREADS_PER_BLOCK, ), (f, gh, gz)) gf = self.h[:, :-1, :] * gz ghinit = f[:, 0, :] * gz[:, 0, :] return gf, gz, ghinit
def forward_gpu(self, inputs): f, z, hinit = inputs b, t, c = f.shape assert c % THREADS_PER_BLOCK == 0 self.h = cuda.cupy.zeros((b, t + 1, c), dtype=np.float32) self.h[:, 0, :] = hinit cuda.raw( ''' #define THREADS_PER_BLOCK 32 extern "C" __global__ void strnn_fwd( const CArray<float, 3> f, const CArray<float, 3> z, CArray<float, 3> h) { int index[3]; const int t_size = f.shape()[1]; index[0] = blockIdx.x; index[1] = 0; index[2] = blockIdx.y * THREADS_PER_BLOCK + threadIdx.x; float prev_h = h[index]; for (int i = 0; i < t_size; i++){ index[1] = i; const float ft = f[index]; const float zt = z[index]; index[1] = i + 1; float &ht = h[index]; prev_h = prev_h * ft + zt; ht = prev_h; } }''', 'strnn_fwd')((b, c // THREADS_PER_BLOCK), (THREADS_PER_BLOCK, ), (f, z, self.h)) return self.h[:, 1:, :],