def test_reduce_exp(self):
    """Check which entries of the output buffer ``reduce.reduce_exp`` modifies.

    A random input of logical shape ``[1, 5, 4, 2]`` is flattened and reduced with
    ``rows`` set to the last dimension and ``cols`` set to the product of the first
    three. The test asserts that the first ``cols`` entries of the output differ
    from the zero-initialised host buffer and that every later entry is unchanged.
    """
    random = np.random.RandomState(0)
    original_shape = [1, 5, 4, 2]
    x = random.randn(*original_shape).reshape([-1])
    dx = np.zeros_like(x)

    stream = cuda.stream()
    x_c = cuda.to_device(x, stream=stream)
    dx_c = cuda.to_device(dx, stream=stream)

    # call kernel
    cols = np.prod(original_shape[:3])
    reduce.reduce_exp(x_c, dx_c, rows=original_shape[-1], cols=cols, minus=False, stream=stream)

    # The copy back is queued on the same stream as the kernel, so it runs after it.
    # Synchronize only once the copy has been queued: a transfer that is given a
    # stream may be asynchronous, and the host must not read dx_result early.
    dx_result = dx_c.copy_to_host(stream=stream)
    stream.synchronize()
    del x_c, dx_c

    # collect results in first [B * T * U] values; for all V
    # Use absolute differences so positive and negative deviations cannot cancel
    # (or a negative sum trivially satisfy the bound).
    assert np.abs(dx_result[cols:] - dx[cols:]).sum() <= 1e-7
    # make sure dx_result updates the [B * T * U] values
    assert np.abs(dx_result[:cols] - dx[:cols]).sum() > 0
def log_softmax(self, acts: torch.Tensor, denom: torch.Tensor):
    """
    Fill `denom` with the log softmax denominator of the activation tensor `acts`.

    Args:
        acts: Activation tensor of shape [B, T, U, V+1]. It must be passed as a flat tensor of
            shape [B * T * U * (V+1)] so that it can be indexed like a pointer.
        denom: A zero tensor of same shape as acts.

    Updates:
        `denom` is updated in place by the reduction kernels; nothing is returned.
    """
    # Both reductions run over the same rows/cols layout on the same stream.
    launch_args = dict(
        rows=self.alphabet_size_,
        cols=self.minibatch_ * self.maxT_ * self.maxU_,
        stream=self.stream_,
    )

    # // trans_acts + pred_acts -> log_softmax denominator
    reduce.reduce_max(acts, denom, minus=False, **launch_args)
    reduce.reduce_exp(acts, denom, minus=True, **launch_args)
def test_compute_alphas_kernel(self):
    """Compare the CUDA alpha kernel against ``rnnt_numpy.forward_pass`` on one random sample.

    Both the alphas and the forward log-likelihood must agree with the NumPy
    reference to a mean absolute error of 1e-5 and a mean squared error of 1e-10.
    """
    numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

    rng = np.random.RandomState(0)
    dims = [1, 5, 11, 3]
    B, T, U, V = dims
    num_cells = B * T * U

    # ---- NumPy reference ----
    x = rng.randn(*dims)
    labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]])  # [1, 10]
    num_labels = len(labels[0])
    label_len = num_labels + 1
    blank_idx = 0

    x_np = log_softmax(x, axis=-1)
    reference = rnnt_numpy.forward_pass(
        x_np[0, :, :label_len, :], labels[0, : label_len - 1], blank_idx
    )
    ground_alphas, ground_log_likelihood = reference

    # ---- CUDA kernels driven through PyTorch tensors ----
    device = torch.device('cuda')
    if not hasattr(cuda, 'external_stream'):
        stream = cuda.default_stream()
    else:
        # Wrap the current PyTorch stream so the numba kernels are queued on it.
        stream = cuda.external_stream(torch.cuda.current_stream(device).cuda_stream)

    x_c = torch.tensor(x, device=device, dtype=torch.float32)
    labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

    # Workspace buffers
    denom = torch.zeros(num_cells, device=device, dtype=x_c.dtype)
    alphas = torch.zeros(num_cells, device=device, dtype=x_c.dtype)
    llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
    input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
    label_lengths = torch.tensor([num_labels], dtype=torch.int32, device=device)

    # Validate the inputs before flattening the activations
    certify_inputs(x_c, labels_c, input_lengths, label_lengths)

    # The kernels index the activations as a flat buffer
    x_c = x_c.view([-1])

    # log softmax reduction: reduce_max followed by reduce_exp into denom
    reduce.reduce_max(x_c, denom, rows=V, cols=num_cells, minus=False, stream=stream)
    reduce.reduce_exp(x_c, denom, rows=V, cols=num_cells, minus=True, stream=stream)

    # alpha kernel
    alpha_kernel = gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0]
    alpha_kernel(
        x_c,
        denom,
        alphas,
        llForward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # Wait for the kernels to finish before reading results
    stream.synchronize()

    alphas = alphas.view([B, T, U])
    alpha_error = ground_alphas - alphas[0].cpu().numpy()
    assert np.abs(alpha_error).mean() <= 1e-5
    assert np.square(alpha_error).mean() <= 1e-10

    ll_error = ground_log_likelihood - llForward[0].cpu().numpy()
    assert np.abs(ll_error).mean() <= 1e-5
    assert np.square(ll_error).mean() <= 1e-10
def test_compute_grads_kernel_clamp(self):
    """Compare the CUDA gradient kernel (with clamping) against the NumPy RNNT loss.

    The reference gradient comes from back-propagating ``rnnt_numpy.RNNTLoss``
    (``clamp=0.1``, ``fastemit_lambda=0.0``) through PyTorch autograd. The CUDA
    path runs the log-softmax reductions and the alpha, beta and grad kernels.
    Gradients must agree to a mean absolute error of 1e-5 and a mean squared
    error of 1e-10.
    """
    numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

    fastemit_lambda = 0.0
    clamp = 0.1

    random = np.random.RandomState(0)
    original_shape = [1, 5, 11, 3]
    B, T, U, V = original_shape

    # Numpy kernel
    x = random.randn(*original_shape)
    labels = torch.from_numpy(np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]], dtype=np.int32))  # [1, 10]
    audio_len = torch.from_numpy(np.array([T], dtype=np.int32))
    label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32))
    blank_idx = 0

    x_np = torch.from_numpy(x)
    x_np.requires_grad_(True)

    # Here we will directly utilize the numpy variant of the loss without explicitly calling
    # the numpy functions for alpha, beta and grads.
    #
    # This is because the grads returned by the rnnt_numpy.transduce_batch() are :
    # d/dx (alpha + beta alignment)(log_softmax(x)).
    # But according to the chain rule, we'd still need to compute the gradient of log_softmax(x)
    # and update the alignments by hand. Instead, we will rely on pytorch to compute the gradient
    # of the log_softmax(x) step and propagate it backwards.
    loss_func = rnnt_numpy.RNNTLoss(blank_idx, fastemit_lambda=fastemit_lambda, clamp=clamp)
    loss_val = loss_func(x_np, labels, audio_len, label_len)
    loss_val.sum().backward()
    true_grads = x_np.grad

    # Pytorch kernel
    device = torch.device('cuda')
    if hasattr(cuda, 'external_stream'):
        stream = cuda.external_stream(torch.cuda.current_stream(device).cuda_stream)
    else:
        stream = cuda.default_stream()

    x_c = torch.tensor(x, device=device, dtype=torch.float32)
    # `labels` is already a tensor: move/convert it with .to() rather than
    # copy-constructing it through torch.tensor(), which PyTorch warns against.
    labels_c = labels.to(device=device, dtype=torch.int32)

    # Allocate workspace memory
    denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
    llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
    llBackward = torch.zeros(B, device=device, dtype=x_c.dtype)
    input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
    label_lengths = torch.tensor([len(labels[0])], dtype=torch.int32, device=device)

    # certify input data
    certify_inputs(x_c, labels_c, input_lengths, label_lengths)

    # flatten activation tensor (for pointer based indexing)
    x_c = x_c.view([-1])
    grads = torch.zeros_like(x_c, requires_grad=False)

    # call kernel
    # log softmax reduction
    reduce.reduce_max(x_c, denom, rows=V, cols=B * T * U, minus=False, stream=stream)
    reduce.reduce_exp(x_c, denom, rows=V, cols=B * T * U, minus=True, stream=stream)

    # alpha kernel
    gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
        x_c,
        denom,
        alphas,
        llForward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # beta kernel
    gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0](
        x_c,
        denom,
        betas,
        llBackward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
    )

    # gamma kernel
    grad_blocks_per_grid = B * T * U
    grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE
    gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, stream, 0](
        grads,
        x_c,
        denom,
        alphas,
        betas,
        llForward,
        input_lengths,
        label_lengths,
        labels_c,
        B,
        T,
        U,
        V,
        blank_idx,
        fastemit_lambda,
        clamp,
    )

    # sync kernel
    stream.synchronize()

    # reshape grads
    grads = grads.view([B, T, U, V])

    # Compare in NumPy on both sides instead of relying on implicit
    # Tensor/ndarray interop in the subtraction and in np.abs / np.square.
    diff = true_grads.detach().cpu().numpy() - grads[0].cpu().numpy()

    assert np.abs(diff).mean() <= 1e-5
    assert np.square(diff).mean() <= 1e-10