def test_body(M, N, L, K):
    if not torch.cuda.is_available():
        return
    cuda_cg_executed = CudaCodeGenExecuted()
    cuda_cg_created = CudaCodeGenCreated()

    def test(x, y, z):
        v1 = torch.add(x, y)
        v2 = torch.add(v1, z)
        return v2

    # Shapes are chosen so that each add broadcasts:
    # (M, N) + (L, M, 1) + (K, L, 1, 1) yields a (K, L, M, N) result.
    a_shape = [M, N]
    b_shape = [L, M, 1]
    c_shape = [K, L, 1, 1]
    traced = torch.jit.trace(
        test,
        (
            torch.rand(*a_shape, device="cuda"),
            torch.rand(*b_shape, device="cuda"),
            torch.rand(*c_shape, device="cuda"),
        ),
    )

    a = torch.rand(*a_shape, device="cuda")
    b = torch.rand(*b_shape, device="cuda")
    c = torch.rand(*c_shape, device="cuda")
    x = traced(a, b, c)
    npr = a.cpu().numpy() + b.cpu().numpy() + c.cpu().numpy()
    np.testing.assert_allclose(npr, x.cpu().numpy())
    # The fused CUDA kernel must have been both generated and executed.
    assert cuda_cg_executed.elapsed_value() >= 1
    assert cuda_cg_created.elapsed_value() >= 1
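
# test_body is a parameterized helper; the driver that sweeps shape
# configurations is not part of this excerpt. A minimal sketch of such a
# driver follows — the shape tuples are illustrative assumptions, not the
# configurations used by the original harness:
def _run_broadcast_configs():
    # Hypothetical (M, N, L, K) combinations exercising different
    # broadcast extents.
    for M, N, L, K in [(5, 2, 2, 3), (8, 8, 8, 8)]:
        test_body(M, N, L, K)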
def test_three_arg_cuda(self):
    if not torch.cuda.is_available():
        return
    cuda_cg_executed = CudaCodeGenExecuted()
    cuda_cg_created = CudaCodeGenCreated()

    # Two chained adds over identically shaped inputs should fuse into a
    # single CUDA kernel.
    def test(x, y, z):
        v1 = torch.add(x, y)
        v2 = torch.add(v1, z)
        return v2

    M = 32
    N = 32
    traced = torch.jit.trace(
        test,
        (
            torch.rand(M, N, device="cuda"),
            torch.rand(M, N, device="cuda"),
            torch.rand(M, N, device="cuda"),
        ),
    )

    a = torch.rand(M, N, device="cuda")
    b = torch.rand(M, N, device="cuda")
    c = torch.rand(M, N, device="cuda")
    x = traced(a, b, c)
    npr = a.cpu().numpy() + b.cpu().numpy() + c.cpu().numpy()
    np.testing.assert_allclose(npr, x.cpu().numpy())
    assert cuda_cg_executed.elapsed_value() >= 1
    assert cuda_cg_created.elapsed_value() >= 1
def test_dynamic_shape(self):
    # Profile two runs with different shapes so the fuser must generalize
    # over dynamic shapes rather than specializing to one static shape.
    with num_profiled_runs(2):
        @torch.jit.script
        def test(x, y, z):
            return x * y * z

        cuda = CudaCodeGenCreated()
        x, y, z = [torch.rand(4, 8).cuda() for _ in range(3)]
        ref = test(x, y, z)
        _ = test(*[torch.rand(6, 8).cuda() for _ in range(3)])
        res = test(x, y, z)
        np.testing.assert_allclose(ref.cpu().numpy(), res.cpu().numpy())
        # Both the 4x8 and 6x8 runs should be served by one generated kernel.
        assert cuda.elapsed_value() == 1

        # A wild broadcast appears.
        x = torch.rand(4, 8).cuda()
        y = torch.rand(1, 8).cuda()
        z = torch.rand(4, 1).cuda()
        res = test(x, y, z)
        xn, yn, zn = [t.cpu().numpy() for t in (x, y, z)]
        np.testing.assert_allclose(res.cpu().numpy(), xn * yn * zn)
        assert cuda.elapsed_value() == 1

        # Mismatched shapes shouldn't reach codegen.
        x = torch.rand(4, 8).cuda()
        y = torch.rand(4, 8).cuda()
        z = torch.rand(5, 8).cuda()
        try:
            test(x, y, z)
        except RuntimeError as e:
            assert "The size of tensor a (4) must match" in e.args[0]
        else:
            # Without this, a missing exception would pass silently.
            raise AssertionError("expected a RuntimeError for mismatched shapes")
        assert cuda.elapsed_value() == 1