Example #1
    def test_reduce_max(self):
        random = np.random.RandomState(0)
        original_shape = [1, 5, 4, 3]
        x = random.randn(*original_shape).reshape([-1])
        dx = random.randn(*x.shape)

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)
        dx_c = cuda.to_device(dx, stream=stream)

        # call kernel
        cols = np.prod(original_shape[:3])
        reduce.reduce_max(x_c,
                          dx_c,
                          rows=original_shape[-1],
                          cols=cols,
                          minus=False,
                          stream=stream)

        # sync kernel
        stream.synchronize()

        dx_result = dx_c.copy_to_host(stream=stream)
        # copy_to_host with a stream is asynchronous; wait for it before reading on the host
        stream.synchronize()
        del x_c, dx_c

        # results are collected in the first [B * T * U] values (one per cell, reduced over V),
        # so everything beyond that range must be left untouched
        assert np.abs(dx_result[cols:] - dx[cols:]).sum() <= 1e-7
        # and the first [B * T * U] values must actually have been updated
        assert np.abs(dx_result[:cols] - dx[:cols]).sum() > 0
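For orientation, here is a rough NumPy stand-in for what this test expects of `reduce.reduce_max` (inferred from the call signature and the assertions, not taken from the kernel source): the flat input is treated as a `cols x rows` matrix and one reduced value per column is written into the first `cols` entries of the output buffer.

    import numpy as np

    def reduce_max_reference(x, out, rows, cols):
        # Illustrative only: one row of `rows` contiguous vocabulary scores per
        # (b, t, u) cell, reduced to a single value stored in out[:cols].
        # Assumes the reduced value is the per-cell maximum; the test itself only
        # checks that out[:cols] changes while out[cols:] is left untouched.
        out = out.copy()
        out[:cols] = x.reshape(cols, rows).max(axis=-1)
        return out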
Example #2
    def log_softmax(self, acts: torch.Tensor, denom: torch.Tensor):
        """
        Computes the log softmax denominator of the input activation tensor
        and stores the result in denom.

        Args:
            acts: Activation tensor of logical shape [B, T, U, V+1], passed as a flat tensor
                of shape [B * T * U * (V+1)] to allow pointer indexing.
            denom: A zero-initialized tensor of shape [B * T * U] that receives the
                log softmax denominator of each (b, t, u) cell.

        Updates:
            This method updates the `denom` tensor in place.
        """
        # trans_acts + pred_acts -> log_softmax denominator
        reduce.reduce_max(
            acts,
            denom,
            rows=self.alphabet_size_,
            cols=self.minibatch_ * self.maxT_ * self.maxU_,
            minus=False,
            stream=self.stream_,
        )

        reduce.reduce_exp(
            acts,
            denom,
            rows=self.alphabet_size_,
            cols=self.minibatch_ * self.maxT_ * self.maxU_,
            minus=True,
            stream=self.stream_,
        )
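Taken together, the `reduce_max` and `reduce_exp(minus=True)` calls amount to a numerically stable log-sum-exp over the vocabulary axis, which is the log softmax denominator of each (b, t, u) cell. A minimal NumPy sketch of that quantity follows; the sign convention the CUDA kernels use when storing it into `denom` is an implementation detail and is assumed, not asserted, here.

    import numpy as np

    def log_softmax_denominator(acts_flat, cells, vocab):
        # acts_flat: flat activations of length cells * vocab,
        # where cells = B * T * U and vocab = V + 1.
        acts = acts_flat.reshape(cells, vocab)
        m = acts.max(axis=-1)                                      # reduce_max step
        lse = m + np.log(np.exp(acts - m[:, None]).sum(axis=-1))   # reduce_exp step
        # log_softmax(acts) can then be recovered per cell as acts - lse[:, None]
        return lse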
Example #3
    def test_compute_alphas_kernel(self):
        numba_utils.skip_numba_cuda_test_if_unsupported(
            __NUMBA_MINIMUM_VERSION__)

        random = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape

        # Numpy kernel
        x = random.randn(*original_shape)
        labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]])  # [1, 10]
        label_len = len(labels[0]) + 1
        blank_idx = 0

        x_np = log_softmax(x, axis=-1)
        ground_alphas, ground_log_likelihood = rnnt_numpy.forward_pass(
            x_np[0, :, :label_len, :], labels[0, :label_len - 1], blank_idx)

        # Pytorch kernel
        device = torch.device('cuda')
        if hasattr(cuda, 'external_stream'):
            stream = cuda.external_stream(
                torch.cuda.current_stream(device).cuda_stream)
        else:
            stream = cuda.default_stream()

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

        # Allocate workspace memory
        denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])],
                                     dtype=torch.int32,
                                     device=device)

        # certify input data
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # flatten activation tensor (for pointer based indexing)
        x_c = x_c.view([-1])

        # call kernel
        # log softmax reduction
        reduce.reduce_max(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=False,
                          stream=stream)
        reduce.reduce_exp(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=True,
                          stream=stream)

        # alpha kernel
        gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
            x_c,
            denom,
            alphas,
            llForward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # sync kernel
        stream.synchronize()

        # reshape alphas
        alphas = alphas.view([B, T, U])
        diff = ground_alphas - alphas[0].cpu().numpy()

        assert np.abs(diff).mean() <= 1e-5
        assert np.square(diff).mean() <= 1e-10

        ll_diff = ground_log_likelihood - llForward[0].cpu().numpy()

        assert np.abs(ll_diff).mean() <= 1e-5
        assert np.square(ll_diff).mean() <= 1e-10
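For context, the recursion that `compute_alphas_kernel` parallelises (one thread block per batch element and one thread per label position, as the [B, U, stream, 0] launch configuration suggests) is the standard RNN-T forward-variable recursion. Below is a single-utterance NumPy sketch over already log-softmaxed probabilities, shown only as a reference for the recurrence; the CUDA kernel instead works on raw activations combined with the precomputed `denom`.

    import numpy as np

    def rnnt_alphas_reference(log_probs, labels, blank_idx=0):
        # log_probs: [T, U, V+1] log-softmax outputs for one utterance.
        # labels:    [U - 1] target ids.
        # Returns alphas of shape [T, U] and the forward log-likelihood.
        T, U, _ = log_probs.shape
        alphas = np.full((T, U), -np.inf)
        alphas[0, 0] = 0.0
        for t in range(1, T):  # blank transitions along the time axis
            alphas[t, 0] = alphas[t - 1, 0] + log_probs[t - 1, 0, blank_idx]
        for u in range(1, U):  # label emissions along the target axis
            alphas[0, u] = alphas[0, u - 1] + log_probs[0, u - 1, labels[u - 1]]
        for t in range(1, T):
            for u in range(1, U):
                no_emit = alphas[t - 1, u] + log_probs[t - 1, u, blank_idx]
                emit = alphas[t, u - 1] + log_probs[t, u - 1, labels[u - 1]]
                alphas[t, u] = np.logaddexp(no_emit, emit)
        log_like = alphas[T - 1, U - 1] + log_probs[T - 1, U - 1, blank_idx]
        return alphas, log_like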
Example #4
    def test_compute_grads_kernel_clamp(self):
        numba_utils.skip_numba_cuda_test_if_unsupported(
            __NUMBA_MINIMUM_VERSION__)

        fastemit_lambda = 0.0
        clamp = 0.1

        random = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape

        # Numpy kernel
        x = random.randn(*original_shape)
        labels = torch.from_numpy(
            np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]],
                     dtype=np.int32))  # [1, 10]
        audio_len = torch.from_numpy(np.array([T], dtype=np.int32))
        label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32))
        blank_idx = 0

        x_np = torch.from_numpy(x)
        x_np.requires_grad_(True)
        """
        Here we will directly utilize the numpy variant of the loss without explicitly calling
        the numpy functions for alpha, beta and grads. 

        This is because the grads returned by the rnnt_numpy.transduce_batch() are :
        d/dx (alpha + beta alignment)(log_softmax(x)). 
        But according to the chain rule, we'd still need to compute the gradient of log_softmax(x)
        and update the alignments by hand. Instead, we will rely on pytorch to compute the gradient 
        of the log_softmax(x) step and propagate it backwards. 
        """
        loss_func = rnnt_numpy.RNNTLoss(blank_idx,
                                        fastemit_lambda=fastemit_lambda,
                                        clamp=clamp)
        loss_val = loss_func(x_np, labels, audio_len, label_len)
        loss_val.sum().backward()
        true_grads = x_np.grad

        # Pytorch kernel
        device = torch.device('cuda')
        if hasattr(cuda, 'external_stream'):
            stream = cuda.external_stream(
                torch.cuda.current_stream(device).cuda_stream)
        else:
            stream = cuda.default_stream()

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

        # Allocate workspace memory
        denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        llBackward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])],
                                     dtype=torch.int32,
                                     device=device)

        # certify input data
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # flatten activation tensor (for pointer based indexing)
        x_c = x_c.view([-1])
        grads = torch.zeros_like(x_c, requires_grad=False)

        # call kernel
        # log softmax reduction
        reduce.reduce_max(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=False,
                          stream=stream)
        reduce.reduce_exp(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=True,
                          stream=stream)

        # alpha kernel
        gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
            x_c,
            denom,
            alphas,
            llForward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # beta kernel
        gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0](
            x_c,
            denom,
            betas,
            llBackward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # grad kernel (one thread block per [b, t, u] cell)
        grad_blocks_per_grid = B * T * U
        grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE
        gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid,
                                            grad_threads_per_block, stream, 0](
                                                grads,
                                                x_c,
                                                denom,
                                                alphas,
                                                betas,
                                                llForward,
                                                input_lengths,
                                                label_lengths,
                                                labels_c,
                                                B,
                                                T,
                                                U,
                                                V,
                                                blank_idx,
                                                fastemit_lambda,
                                                clamp,
                                            )

        # sync kernel
        stream.synchronize()

        # reshape grads
        grads = grads.view([B, T, U, V])
        diff = true_grads - grads[0].cpu().numpy()

        assert np.abs(diff).mean() <= 1e-5
        assert np.square(diff).mean() <= 1e-10
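As the comment block inside this test explains, gradients produced against log_softmax(x) still have to be pulled back through the log softmax itself to obtain d/dx, which the test delegates to pytorch autograd. If that chain-rule step were done by hand instead, the pullback has a simple closed form. The sketch below shows that standard Jacobian-vector product; it is illustrative and not code from this repository.

    import torch

    def backprop_through_log_softmax(x, grad_wrt_log_softmax):
        # Chain rule through y = log_softmax(x) over the last axis:
        # dL/dx = g - softmax(x) * sum(g), where g = dL/dy.
        g = grad_wrt_log_softmax
        return g - torch.softmax(x, dim=-1) * g.sum(dim=-1, keepdim=True)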