def test_compute_alphas_kernel(self):
    """Compare the GPU alpha (forward) kernel with the NumPy reference.

    A single random sample of shape [B, T, U, V] = [1, 5, 11, 3] is run
    through ``rnnt_numpy.forward_pass`` (on log-softmax activations) and
    through ``gpu_rnnt_kernel.compute_alphas_kernel``.  Both the alpha
    lattice and the forward log-likelihood must agree within tight
    mean-absolute / mean-squared tolerances.
    """
    numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

    blank_idx = 0
    rng = np.random.RandomState(0)
    shape = [1, 5, 11, 3]
    B, T, U, V = shape
    num_cells = B * T * U

    # ---- Reference (NumPy) forward pass ----
    logits = rng.randn(*shape)
    labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]])  # [1, 10]
    num_labels = len(labels[0])
    label_len = num_labels + 1

    log_probs = log_softmax(logits, axis=-1)
    ref_alphas, ref_log_likelihood = rnnt_numpy.forward_pass(
        log_probs[0, :, :label_len, :],
        labels[0, : label_len - 1],
        blank_idx,
    )

    # ---- GPU kernel ----
    device = torch.device('cuda')
    # Prefer wrapping the current torch stream when this Numba build exposes
    # `external_stream`; otherwise fall back to Numba's default stream.
    stream = (
        cuda.external_stream(torch.cuda.current_stream(device).cuda_stream)
        if hasattr(cuda, 'external_stream')
        else cuda.default_stream()
    )

    acts = torch.tensor(logits, device=device, dtype=torch.float32)
    labels_gpu = torch.tensor(labels, device=device, dtype=torch.int32)

    # Workspace buffers, all flat and in the activation dtype.
    work_dtype = acts.dtype
    denom = torch.zeros(num_cells, device=device, dtype=work_dtype)
    alphas = torch.zeros(num_cells, device=device, dtype=work_dtype)
    ll_forward = torch.zeros(B, device=device, dtype=work_dtype)
    input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
    label_lengths = torch.tensor([num_labels], dtype=torch.int32, device=device)

    # Validate the inputs before handing them to the kernels.
    certify_inputs(acts, labels_gpu, input_lengths, label_lengths)

    # The kernels use pointer-style indexing, so flatten the activations.
    acts = acts.view([-1])

    # Log-softmax reduction: max pass, then exp pass, both writing to `denom`.
    reduce.reduce_max(acts, denom, rows=V, cols=num_cells, minus=False, stream=stream)
    reduce.reduce_exp(acts, denom, rows=V, cols=num_cells, minus=True, stream=stream)

    # Alpha kernel; Numba launch configuration is
    # [grid dim, block dim, stream, dynamic shared memory bytes].
    gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
        acts,
        denom,
        alphas,
        ll_forward,
        input_lengths,
        label_lengths,
        labels_gpu,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # Wait for the kernel before reading results back.
    stream.synchronize()

    # Restore the lattice shape and compare sample 0 with the reference.
    alphas = alphas.view([B, T, U])
    alpha_err = ref_alphas - alphas[0].cpu().numpy()

    assert np.abs(alpha_err).mean() <= 1e-5
    assert np.square(alpha_err).mean() <= 1e-10

    ll_err = ref_log_likelihood - ll_forward[0].cpu().numpy()

    assert np.abs(ll_err).mean() <= 1e-5
    assert np.square(ll_err).mean() <= 1e-10
def test_compute_grads_kernel_clamp(self):
    """Compare the GPU gradient kernel (with clamping) to the reference loss.

    Reference gradients come from back-propagating through
    ``rnnt_numpy.RNNTLoss`` (``clamp=0.1``, ``fastemit_lambda=0.0``) with
    PyTorch autograd.  The GPU path runs the log-softmax reduction, the
    alpha and beta kernels, and finally ``compute_grad_kernel``; the
    resulting gradients must match the reference within tight
    mean-absolute / mean-squared tolerances.
    """
    numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

    fastemit_lambda = 0.0
    clamp = 0.1

    random = np.random.RandomState(0)
    original_shape = [1, 5, 11, 3]
    B, T, U, V = original_shape

    # Numpy kernel
    x = random.randn(*original_shape)
    labels = torch.from_numpy(
        np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]], dtype=np.int32)
    )  # [1, 10]
    audio_len = torch.from_numpy(np.array([T], dtype=np.int32))
    label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32))
    blank_idx = 0

    x_np = torch.from_numpy(x)
    x_np.requires_grad_(True)

    # Here we directly utilize the numpy variant of the loss without
    # explicitly calling the numpy functions for alpha, beta and grads.
    #
    # This is because the grads returned by rnnt_numpy.transduce_batch() are
    # d/dx (alpha + beta alignment)(log_softmax(x)).  By the chain rule we
    # would still need to compute the gradient of log_softmax(x) and update
    # the alignments by hand.  Instead, we rely on pytorch to compute the
    # gradient of the log_softmax(x) step and propagate it backwards.
    loss_func = rnnt_numpy.RNNTLoss(blank_idx, fastemit_lambda=fastemit_lambda, clamp=clamp)
    loss_val = loss_func(x_np, labels, audio_len, label_len)
    loss_val.sum().backward()
    # Autograd result has the same shape as x_np: [B, T, U, V].
    true_grads = x_np.grad

    # Pytorch kernel
    device = torch.device('cuda')
    # Prefer wrapping the current torch stream when this Numba build exposes
    # `external_stream`; otherwise fall back to Numba's default stream.
    if hasattr(cuda, 'external_stream'):
        stream = cuda.external_stream(torch.cuda.current_stream(device).cuda_stream)
    else:
        stream = cuda.default_stream()

    x_c = torch.tensor(x, device=device, dtype=torch.float32)
    # `labels` is already a tensor: move/cast it with `.to()` instead of
    # `torch.tensor(labels, ...)`, which emits a copy-construct UserWarning.
    labels_c = labels.to(device=device, dtype=torch.int32)

    # Allocate workspace memory
    denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
    llBackward = torch.zeros(B, device=device, dtype=x_c.dtype)
    input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
    label_lengths = torch.tensor([len(labels[0])], dtype=torch.int32, device=device)

    # certify input data
    certify_inputs(x_c, labels_c, input_lengths, label_lengths)

    # flatten activation tensor (for pointer based indexing)
    x_c = x_c.view([-1])
    grads = torch.zeros_like(x_c, requires_grad=False)

    # call kernel
    # log softmax reduction
    reduce.reduce_max(x_c, denom, rows=V, cols=B * T * U, minus=False, stream=stream)
    reduce.reduce_exp(x_c, denom, rows=V, cols=B * T * U, minus=True, stream=stream)

    # alpha kernel; Numba launch configuration is
    # [grid dim, block dim, stream, dynamic shared memory bytes].
    gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
        x_c,
        denom,
        alphas,
        llForward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # beta kernel
    gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0](
        x_c,
        denom,
        betas,
        llBackward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # gamma kernel: one block per (b, t, u) lattice cell.
    grad_blocks_per_grid = B * T * U
    grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE
    gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, stream, 0](
        grads,
        x_c,
        denom,
        alphas,
        betas,
        llForward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
        fastemit_lambda,
        clamp,
    )

    # sync kernel
    stream.synchronize()

    # reshape grads
    grads = grads.view([B, T, U, V])
    # Compare like with like: both operands are NumPy arrays of shape
    # [T, U, V], rather than relying on implicit Tensor/ndarray arithmetic
    # and broadcasting of the batch dimension.
    diff = true_grads[0].detach().cpu().numpy() - grads[0].cpu().numpy()

    assert np.abs(diff).mean() <= 1e-5
    assert np.square(diff).mean() <= 1e-10