Python reduce_expの例

プログラミング言語: Python

名前空間/パッケージ名: nemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils.reduce

メソッド/関数: reduce_exp

hotexamples.comのコード掲載数: 4

Python reduce_exp - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのnemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils.reduce.reduce_expの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

    def test_reduce_exp(self):
        """Check which part of the output buffer ``reduce.reduce_exp`` writes.

        A random array of shape [1, 5, 4, 2] is flattened and reduced with
        ``rows`` set to the last dimension and ``cols`` set to the product of
        the first three. The output buffer starts as all zeros, so every
        non-zero entry in the result was written by the kernel. The test
        expects the first ``cols`` entries to change and the rest to stay
        untouched.
        """
        random = np.random.RandomState(0)
        original_shape = [1, 5, 4, 2]
        x = random.randn(*original_shape).reshape([-1])
        dx = np.zeros_like(x)

        stream = cuda.stream()
        x_c = cuda.to_device(x, stream=stream)
        dx_c = cuda.to_device(dx, stream=stream)

        # call kernel
        cols = np.prod(original_shape[:3])
        reduce.reduce_exp(x_c,
                          dx_c,
                          rows=original_shape[-1],
                          cols=cols,
                          minus=False,
                          stream=stream)

        # sync kernel
        stream.synchronize()

        # A device-to-host copy that is given a stream is asynchronous, so the
        # stream has to be synchronized again before the host array is read.
        dx_result = dx_c.copy_to_host(stream=stream)
        stream.synchronize()
        del x_c, dx_c

        # collect results in first [B * T * U] values; for all V
        # Use the absolute difference: a signed sum would let negative
        # deviations in the tail pass the threshold unnoticed.
        assert np.abs(dx_result[cols:] - dx[cols:]).sum() <= 1e-7

        # make sure dx_result updates the [B * T * U] values
        assert np.abs(dx_result[:cols] - dx[:cols]).sum() > 0

コード例 #2

ファイルを表示

    def log_softmax(self, acts: torch.Tensor, denom: torch.Tensor):
        """
        Fill `denom` with the log softmax denominator of the activations in `acts`.

        Two reductions are issued one after the other with the same rows / cols / stream
        settings: `reduce.reduce_max` with `minus=False`, then `reduce.reduce_exp` with
        `minus=True`. Both receive `acts` as input and `denom` as output.

        Args:
            acts: Activation tensor of shape [B, T, U, V+1]. It must be passed as a flat tensor
                of shape [B * T * U * (V+1)] so that pointer indexing can be used.
            denom: A zero tensor of same shape as acts.

        Updates:
            The `denom` tensor is updated in place; nothing is returned.
        """
        # trans_acts + pred_acts -> log_softmax denominator
        # Arguments common to both reduction calls.
        shared_kwargs = dict(
            rows=self.alphabet_size_,
            cols=self.minibatch_ * self.maxT_ * self.maxU_,
            stream=self.stream_,
        )

        reduce.reduce_max(acts, denom, minus=False, **shared_kwargs)
        reduce.reduce_exp(acts, denom, minus=True, **shared_kwargs)

コード例 #3

ファイルを表示

    def test_compute_alphas_kernel(self):
        """Compare the GPU alpha kernel against the numpy forward pass.

        One random sample of shape [1, 5, 11, 3] is pushed through
        ``rnnt_numpy.forward_pass`` on the host and through
        ``gpu_rnnt_kernel.compute_alphas_kernel`` on the device. Both the
        alphas and the forward log-likelihood must agree within tolerance.
        """
        numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__)

        rng = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape
        blank_idx = 0

        # Host reference (numpy)
        x = rng.randn(*original_shape)
        labels = np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]])  # [1, 10]
        label_len = len(labels[0]) + 1

        x_np = log_softmax(x, axis=-1)
        reference = rnnt_numpy.forward_pass(
            x_np[0, :, :label_len, :],
            labels[0, :label_len - 1],
            blank_idx,
        )
        ground_alphas, ground_log_likelihood = reference

        # Device side: reuse torch's current CUDA stream when numba can wrap it
        device = torch.device('cuda')
        stream = (
            cuda.external_stream(torch.cuda.current_stream(device).cuda_stream)
            if hasattr(cuda, 'external_stream')
            else cuda.default_stream()
        )

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        labels_c = torch.tensor(labels, device=device, dtype=torch.int32)

        # Workspace buffers
        num_cells = B * T * U
        denom = torch.zeros(num_cells, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(num_cells, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])], dtype=torch.int32, device=device)

        # Validate the inputs before flattening
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # The kernels index the activations through a flat view
        x_c = x_c.view([-1])

        # Log softmax denominator: max reduction followed by exp reduction
        reduce.reduce_max(x_c, denom, rows=V, cols=num_cells, minus=False, stream=stream)
        reduce.reduce_exp(x_c, denom, rows=V, cols=num_cells, minus=True, stream=stream)

        # Alpha kernel
        launch_alphas = gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0]
        launch_alphas(
            x_c, denom, alphas, llForward, input_lengths, label_lengths, labels_c, B, T, U, V, blank_idx,
        )

        # Wait for the kernels to finish
        stream.synchronize()

        # Alphas, viewed as [B, T, U], against the reference
        alphas_btu = alphas.view([B, T, U])
        alpha_err = ground_alphas - alphas_btu[0].cpu().numpy()

        assert np.abs(alpha_err).mean() <= 1e-5
        assert np.square(alpha_err).mean() <= 1e-10

        # Forward log-likelihood against the reference
        ll_err = ground_log_likelihood - llForward[0].cpu().numpy()

        assert np.abs(ll_err).mean() <= 1e-5
        assert np.square(ll_err).mean() <= 1e-10

コード例 #4

ファイルを表示

    def test_compute_grads_kernel_clamp(self):
        """Compare the GPU gradient kernel (with clamping) against the numpy loss.

        The reference gradient comes from ``rnnt_numpy.RNNTLoss`` with
        ``clamp=0.1`` and ``fastemit_lambda=0.0``, back-propagated by PyTorch.
        The device gradient comes from running the max/exp reductions and the
        alpha, beta and grad kernels in sequence on the same random input of
        shape [1, 5, 11, 3].
        """
        numba_utils.skip_numba_cuda_test_if_unsupported(
            __NUMBA_MINIMUM_VERSION__)

        fastemit_lambda = 0.0
        clamp = 0.1

        random = np.random.RandomState(0)
        original_shape = [1, 5, 11, 3]
        B, T, U, V = original_shape

        # Numpy kernel
        x = random.randn(*original_shape)
        labels = torch.from_numpy(
            np.array([[1, 1, 1, 2, 2, 2, 1, 2, 2, 1]],
                     dtype=np.int32))  # [1, 10]
        audio_len = torch.from_numpy(np.array([T], dtype=np.int32))
        label_len = torch.from_numpy(np.array([U - 1], dtype=np.int32))
        blank_idx = 0

        x_np = torch.from_numpy(x)
        x_np.requires_grad_(True)

        # Here we directly utilize the numpy variant of the loss without
        # explicitly calling the numpy functions for alpha, beta and grads.
        #
        # This is because the grads returned by rnnt_numpy.transduce_batch()
        # are: d/dx (alpha + beta alignment)(log_softmax(x)).
        # But according to the chain rule, we'd still need to compute the
        # gradient of log_softmax(x) and update the alignments by hand.
        # Instead, we rely on pytorch to compute the gradient of the
        # log_softmax(x) step and propagate it backwards.
        loss_func = rnnt_numpy.RNNTLoss(blank_idx,
                                        fastemit_lambda=fastemit_lambda,
                                        clamp=clamp)
        loss_val = loss_func(x_np, labels, audio_len, label_len)
        loss_val.sum().backward()
        true_grads = x_np.grad

        # Device side: reuse torch's current CUDA stream when numba can wrap it
        device = torch.device('cuda')
        if hasattr(cuda, 'external_stream'):
            stream = cuda.external_stream(
                torch.cuda.current_stream(device).cuda_stream)
        else:
            stream = cuda.default_stream()

        x_c = torch.tensor(x, device=device, dtype=torch.float32)
        # `labels` is already a tensor; torch.tensor(tensor) emits a
        # copy-construct UserWarning, so move/cast it with .to() instead.
        labels_c = labels.to(device=device, dtype=torch.int32)

        # Allocate workspace memory
        denom = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        alphas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        betas = torch.zeros(B * T * U, device=device, dtype=x_c.dtype)
        llForward = torch.zeros(B, device=device, dtype=x_c.dtype)
        llBackward = torch.zeros(B, device=device, dtype=x_c.dtype)
        input_lengths = torch.tensor([T], dtype=torch.int32, device=device)
        label_lengths = torch.tensor([len(labels[0])],
                                     dtype=torch.int32,
                                     device=device)

        # certify input data
        certify_inputs(x_c, labels_c, input_lengths, label_lengths)

        # flatten activation tensor (for pointer based indexing)
        x_c = x_c.view([-1])
        grads = torch.zeros_like(x_c, requires_grad=False)

        # call kernel
        # log softmax reduction
        reduce.reduce_max(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=False,
                          stream=stream)
        reduce.reduce_exp(x_c,
                          denom,
                          rows=V,
                          cols=B * T * U,
                          minus=True,
                          stream=stream)

        # alpha kernel
        gpu_rnnt_kernel.compute_alphas_kernel[B, U, stream, 0](
            x_c,
            denom,
            alphas,
            llForward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # beta kernel
        gpu_rnnt_kernel.compute_betas_kernel[B, U, stream, 0](
            x_c,
            denom,
            betas,
            llBackward,
            input_lengths,
            label_lengths,
            labels_c,
            B,
            T,
            U,
            V,
            blank_idx,
        )

        # gradient kernel, launched with B * T * U blocks of
        # GPU_RNNT_THREAD_SIZE threads each
        grad_blocks_per_grid = B * T * U
        grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE
        gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid,
                                            grad_threads_per_block, stream, 0](
                                                grads,
                                                x_c,
                                                denom,
                                                alphas,
                                                betas,
                                                llForward,
                                                input_lengths,
                                                label_lengths,
                                                labels_c,
                                                B,
                                                T,
                                                U,
                                                V,
                                                blank_idx,
                                                fastemit_lambda,
                                                clamp,
                                            )

        # sync kernel
        stream.synchronize()

        # reshape grads
        grads = grads.view([B, T, U, V])
        # Convert both sides to numpy explicitly instead of relying on mixed
        # Tensor/ndarray arithmetic; both arrays have shape [B, T, U, V].
        diff = true_grads.cpu().numpy() - grads.cpu().numpy()

        assert np.abs(diff).mean() <= 1e-5
        assert np.square(diff).mean() <= 1e-10