        def test_body(M, N, L, K):
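            """Trace a three-input add over broadcasting shapes, check the
            result against NumPy, and verify CUDA codegen was created and run."""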
            if not torch.cuda.is_available():
                return
            cuda_cg_executed = CudaCodeGenExecuted()
            cuda_cg_created = CudaCodeGenCreated()

            def test(x, y, z):
                v1 = torch.add(x, y)
                v2 = torch.add(v1, z)
                return v2

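            # The three shapes broadcast pairwise:
            # [M, N] + [L, M, 1] + [K, L, 1, 1] -> [K, L, M, N].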
            a_shape = [M, N]
            b_shape = [L, M, 1]
            c_shape = [K, L, 1, 1]
            traced = torch.jit.trace(
                test,
                (
                    torch.rand(*a_shape, device="cuda"),
                    torch.rand(*b_shape, device="cuda"),
                    torch.rand(*c_shape, device="cuda"),
                ),
            )

            a = torch.rand(*a_shape, device="cuda")
            b = torch.rand(*b_shape, device="cuda")
            c = torch.rand(*c_shape, device="cuda")
            x = traced(a, b, c)
            npr = a.cpu().numpy() + b.cpu().numpy() + c.cpu().numpy()
            np.testing.assert_allclose(npr, x.cpu().numpy())
            assert cuda_cg_executed.elapsed_value() >= 1
            assert cuda_cg_created.elapsed_value() >= 1

    def test_three_arg_cuda(self):
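        """Fuse a chain of two adds on same-shape CUDA tensors and check the
        result against NumPy and the codegen counters."""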
        if not torch.cuda.is_available():
            return
        cuda_cg_executed = CudaCodeGenExecuted()
        cuda_cg_created = CudaCodeGenCreated()

        def test(x, y, z):
            v1 = torch.add(x, y)
            v2 = torch.add(v1, z)
            return v2

        M = 32
        N = 32
        traced = torch.jit.trace(
            test,
            (
                torch.rand(M, N, device="cuda"),
                torch.rand(M, N, device="cuda"),
                torch.rand(M, N, device="cuda"),
            ),
        )

        a = torch.rand(M, N, device="cuda")
        b = torch.rand(M, N, device="cuda")
        c = torch.rand(M, N, device="cuda")
        x = traced(a, b, c)
        npr = a.cpu().numpy() + b.cpu().numpy() + c.cpu().numpy()
        np.testing.assert_allclose(npr, x.cpu().numpy())
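        # The fuser should have both created and executed a CUDA kernel.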
        assert cuda_cg_executed.elapsed_value() >= 1
        assert cuda_cg_created.elapsed_value() >= 1

    def test_dynamic_shape(self):
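        """Check that a scripted pointwise op compiles once and then handles
        dynamic shapes, broadcasting, and mismatched shapes correctly."""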
        if not torch.cuda.is_available():
            return
        with num_profiled_runs(2):

            @torch.jit.script
            def test(x, y, z):
                return x * y * z

            cuda = CudaCodeGenCreated()
            x, y, z = [torch.rand(4, 8).cuda() for _ in range(3)]
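            # The first two calls are profiling runs (num_profiled_runs(2));
            # the third triggers optimization, so codegen should fire exactly once.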
            ref = test(x, y, z)
            _ = test(*[torch.rand(6, 8).cuda() for _ in range(3)])
            res = test(x, y, z)
            np.testing.assert_allclose(ref.cpu().numpy(), res.cpu().numpy())
            assert cuda.elapsed_value() == 1

            # A wild broadcast appears.
            x = torch.rand(4, 8).cuda()
            y = torch.rand(1, 8).cuda()
            z = torch.rand(4, 1).cuda()
            res = test(x, y, z)
            xn, yn, zn = [t.cpu().numpy() for t in (x, y, z)]
            np.testing.assert_allclose(res.cpu().numpy(), xn * yn * zn)
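            # The previously generated kernel handles the broadcast; no recompile.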
            assert cuda.elapsed_value() == 1

            # Mismatched shapes shouldn't reach codegen.
            x = torch.rand(4, 8).cuda()
            y = torch.rand(4, 8).cuda()
            z = torch.rand(5, 8).cuda()
            with self.assertRaisesRegex(
                RuntimeError, r"The size of tensor a \(4\) must match"
            ):
                test(x, y, z)
            assert cuda.elapsed_value() == 1