Example #1
def test_shape_func():
    if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist:
        return

    mod = tvm.IRModule()
    data_shape = (relay.Any(), )
    x = relay.var("x", shape=data_shape)
    y = relay.op.vm.shape_of(x)
    z = relay.nn.relu(y)
    p0 = relay.var("p0", shape=data_shape)
    fn = relay.Function([p0], z)
    out = relay.var("out", shape=(1, ), dtype="int64")
    ins = relay.Tuple([y])
    outs = relay.Tuple([out])
    is_inputs = [False]
    shape_func = relay.op.vm.shape_func(fn, ins, outs, is_inputs)
    mod["main"] = relay.Function([x, out], shape_func)
    ca = context_analysis(mod, tvm.cuda())
    main = mod["main"]

    cpu_dev = tvm.cpu().device_type
    gpu_dev = tvm.cuda().device_type
    assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev
    # The output of shape func should be on cpu.
    assert main.params[1] in ca and ca[main.params[1]][0].value == cpu_dev
    # shape func is the body and it should be on cpu
    assert main.body in ca and ca[main.body][0].value == cpu_dev
Example #2
    def run_test(tvm_intrin, np_func, dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return
        # The following set of intrinsics does not support fp16 yet.
        skip_set = {
            tvm.tir.abs,
            tvm.tir.round,
            tvm.tir.tan,
            tvm.tir.atan,
            tvm.tir.tanh,
            tvm.tir.cosh,
            tvm.tir.sinh,
        }
        if dtype == "float16" and tvm_intrin in skip_set:
            print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
            return

        n = 128
        A = te.placeholder((n,), dtype=dtype, name="A")
        B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B")
        s = sched(B)
        f = tvm.build(s, [A, B], "cuda")
        dev = tvm.cuda(0)
        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
        f(a, b)
        tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3)
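For context, a helper like this is typically driven by pairs of a TVM intrinsic and its NumPy reference chosen by the enclosing test; the schedule helper sched and the exact intrinsic list are assumed. A minimal, hypothetical driver sketch:

    # Hypothetical driver loop for the run_test helper above.
    for dtype in ["float32", "float16"]:
        run_test(tvm.tir.exp, np.exp, dtype)
        run_test(tvm.tir.sqrt, np.sqrt, dtype)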
Example #3
    def check_cuda(dtype, n, l, padding, lanes):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        dev = tvm.cuda(0)
        A = tvm.te.placeholder((n, l), name="A", dtype=dtype)
        B = tvm.te.compute(
            (n // lanes, l + 2 * padding, lanes),
            lambda i, j, k: tvm.te.if_then_else(
                tvm.te.any(j < padding, j >= l + padding),
                tvm.runtime.convert(0).astype(dtype),
                A[i * lanes + k, j - padding],
            ),
            name="B",
        )
        s = te.create_schedule(B.op)
        block, thread, vectorize = s[B].op.axis
        s[B].bind(block, bx)
        s[B].bind(thread, tx)
        s[B].vectorize(vectorize)
        fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
        np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype)
        a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a)
        b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev)
        fun(a, b)
        np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1)
        ref = np.pad(
            np_a_reshape, ((0, 0), (padding, padding), (0, 0)), mode="constant", constant_values=0
        )
        tvm.testing.assert_allclose(b.numpy(), ref)
Example #4
def test_multi_targets():
    # Build an IRModule.
    n = 10
    x = relay.var("x", shape=(n,))
    y = relay.var("y", shape=(n,))
    z = relay.var("z", shape=(n,))
    f = relay.Function([x, y, z], x + relay.op.annotation.on_device(y + z, tvm.cpu()))
    mod = IRModule.from_expr(f)

    # Compile to VMExecutable.
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.fallback_device_type": tvm.cuda().device_type}
    ):
        exe = relay.vm.compile(
            mod, target={"cpu": tvm.target.Target("llvm"), "cuda": tvm.target.Target("cuda")}
        )

    # Run
    vm = runtime.vm.VirtualMachine(exe, [tvm.cuda(), tvm.cpu()])
    x_data = np.random.rand(
        n,
    ).astype("float32")
    y_data = np.random.rand(
        n,
    ).astype("float32")
    z_data = np.random.rand(
        n,
    ).astype("float32")
    actual_result = vm.invoke("main", x_data, y_data, z_data)

    # Test
    expected_result = x_data + y_data + z_data
    tvm.testing.assert_allclose(actual_result.numpy(), expected_result)
Example #5
def test_alloc_storage():
    if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist:
        return

    mod = tvm.IRModule()
    mod.import_from_std("core.rly")
    size = relay.Var("size", relay.scalar_type("int64"))
    alignment = relay.Var("alignment", relay.scalar_type("int64"))
    # Allocate a chunk of memory on the GPU.
    sto = relay.op.memory.alloc_storage(size, alignment, tvm.cuda())
    mod["main"] = relay.Function([size, alignment], sto)
    ca = context_analysis(mod, tvm.cuda())
    main = mod["main"]
    body = main.body

    cpu_dev = tvm.cpu().device_type
    gpu_dev = tvm.cuda().device_type
    # Inputs are unified with alloc storage inputs which are on cpu
    assert main.params[0] in ca and ca[main.params[0]][0].value == cpu_dev
    assert main.params[1] in ca and ca[main.params[1]][0].value == cpu_dev

    assert isinstance(body, relay.Call) and len(body.args) == 2
    # size of alloc_storage is on cpu
    assert body.args[0] in ca and ca[body.args[0]][0].value == cpu_dev
    # alignment of alloc_storage is on cpu
    assert body.args[1] in ca and ca[body.args[1]][0].value == cpu_dev
    # alloc_storage is on gpu as specified
    assert body in ca and ca[body][0].value == gpu_dev
Example #6
def test_alloc_tensor():
    if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist:
        return

    mod = tvm.IRModule()
    mod.import_from_std("core.rly")
    sto_type = relay.TypeCall(mod.get_global_type_var("Storage"), [])
    sto = relay.Var("x", sto_type)
    sh = relay.const(np.array([3, 2]), dtype="int64")
    at = relay.op.memory.alloc_tensor(sto, relay.const(0, dtype="int64"), sh)
    mod["main"] = relay.Function([sto], at)
    ca = context_analysis(mod, tvm.cuda())
    main = mod["main"]
    body = main.body

    cpu_dev = tvm.cpu().device_type
    gpu_dev = tvm.cuda().device_type
    # Input of the function falls back to the default device gpu
    assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev

    assert isinstance(body, relay.Call) and len(body.args) == 3
    # storage of alloc_tensor falls back to the default device gpu
    assert body.args[0] in ca and ca[body.args[0]][0].value == gpu_dev
    # shape of alloc_tensor is on cpu
    assert body.args[1] in ca and ca[body.args[1]][0].value == cpu_dev
    # alloc_tensor keeps the same device context as its storage, which is on the GPU.
    assert body in ca and ca[body][0].value == gpu_dev
Example #7
    def check(t0, t1, factor):
        if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        # compute
        n = 128
        A = te.placeholder((n,), dtype=t0, name="A")
        B = te.placeholder((n,), dtype=t1, name="B")
        C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")

        # schedule
        s = tvm.te.create_schedule(C.op)
        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
        s[C].vectorize(ib)
        s[C].bind(ob, tx)
        func = tvm.build(s, [A, B, C], "cuda")

        # correctness
        dev = tvm.cuda(0)
        low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10)
        a_np = np.random.randint(low, high, size=n).astype(A.dtype)
        b_np = np.random.randint(low, high, size=n).astype(B.dtype)
        c_np = (a_np + b_np).astype(A.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        b_nd = tvm.nd.array(b_np, dev)
        c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev)
        func(a_nd, b_nd, c_nd)
        tvm.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-3)
Example #8
    def check_cuda(dtype, n, lanes):
        if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version):
            print("skip because gpu does not support int8")
            return
        A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
        B = te.placeholder((n,), name="B", dtype="%sx%d" % (dtype, lanes))
        C = te.placeholder((n,), name="C", dtype="int32")
        # __dp4a computes a 4-element int8 dot product and accumulates it into an int32.
        D = te.compute(
            (n,), lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name="D"
        )
        s = te.create_schedule(D.op)
        xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
        s[D].bind(xo, bx)
        s[D].bind(xi, tx)
        fun = tvm.build(s, [A, B, C, D], "cuda")
        np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
        np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
        np_c = np.random.randint(low=0, high=127, size=(n,))
        np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
        dev = tvm.cuda(0)
        a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a)
        b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b)
        c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c)
        d = tvm.nd.empty((n,), D.dtype, dev)
        fun(a, b, c, d)
        tvm.testing.assert_allclose(d.numpy(), np_d)
Example #9
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        n, m = 16, 16
        A = te.placeholder(
            (
                n,
                m,
            ),
            name="A",
            dtype=dtype,
        )
        B = te.compute(
            (
                n,
                m,
            ),
            lambda j, i: A[j, (i + 1) % m],
            name="B",
        )

        cuda_target = tvm.target.Target("cuda")
        assert cuda_target.thread_warp_size == 2 * m
        with cuda_target:
            s = te.create_schedule(B.op)
            tx = te.thread_axis("threadIdx.x")
            ty = te.thread_axis("threadIdx.y")
            bx = te.thread_axis("blockIdx.x")

            # Cache A in "warp" storage scope: the cached elements are distributed
            # across the lanes of a warp, and cross-lane reads are lowered to warp
            # shuffle instructions.
            AA = s.cache_read(A, "warp", [B])
            y, x = B.op.axis
            z, y = s[B].split(y, nparts=2)
            s[B].bind(x, tx)
            s[B].bind(y, ty)
            s[B].bind(z, bx)
            s[AA].compute_at(s[B], y)
            _, x = AA.op.axis
            s[AA].bind(x, tx)

            dev = tvm.cuda(0)
            func = tvm.build(s, [A, B], "cuda")
            A_np = np.array([list(range(i, m + i)) for i in range(n)],
                            dtype=dtype)
            B_np = np.array(
                [list(range(1 + i, m + i)) + [i] for i in range(n)],
                dtype=dtype)
            A_nd = tvm.nd.array(A_np, dev)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
Example #10
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = (
        "bcast_binary_"
        + typ
        + "_lhs"
        + "_".join([str(ele) for ele in lhs_shape])
        + "rhs"
        + "_".join([str(ele) for ele in rhs_shape])
    )
    A = te.placeholder(shape=lhs_shape, name="A")
    B = te.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)
    lhs_nd = tvm.nd.array(lhs_npy, tvm.cuda())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.cuda())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.cuda())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
Example #11
def test_inject_async_copy_shared_dyn():
    f = ptx_global_to_shared_dyn_copy_fp16x8

    # Lower the copy kernel (flatten buffers, vectorize, merge dynamic shared
    # memory allocations) so that InjectPTXAsyncCopy can rewrite the resulting
    # global-to-shared copies into PTX cp.async instructions.
    mod = tvm.IRModule.from_expr(f)
    mod = tvm.tir.transform.FlattenBuffer()(mod)
    mod = tvm.tir.transform.VectorizeLoop()(mod)
    mod = tvm.tir.transform.MergeDynamicSharedMemoryAllocations()(mod)
    mod = tvm.tir.transform.InjectPTXAsyncCopy()(mod)

    assert count_cp_async(mod["main"].body) == 2

    if not tvm.testing.is_ampere_or_newer():
        return

    with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}):
        mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda")

    A_np = np.random.rand(32, 128).astype("float16")
    B_np = np.random.rand(32, 128).astype("float16")
    C_np = np.zeros((32, 128)).astype("float16")
    dev = tvm.cuda(0)
    A_nd = tvm.nd.array(A_np, device=dev)
    B_nd = tvm.nd.array(B_np, device=dev)
    C_nd = tvm.nd.array(C_np, device=dev)
    mod(A_nd, B_nd, C_nd)
    tvm.testing.assert_allclose(C_nd.numpy(), A_np + B_np)
Example #12
    def build_export_vm(device):
        """relay build & export graph"""
        x = relay.var("x", shape=(10, 5))
        y = relay.var("y", shape=(1, 5))
        z = relay.add(x, y)
        z = relay.exp(z)
        func = relay.Function([x, y], z)
        x_data = np.random.rand(10, 5).astype("float32")
        y_data = np.random.rand(1, 5).astype("float32")

        pt_device = torch.device(device)
        if pt_device.type == "cuda":
            target = "cuda"
            ctx = tvm.cuda(pt_device.index)
        else:
            target = "llvm"
            ctx = tvm.cpu(0)
        exe = relay.vm.compile(tvm.IRModule.from_expr(func),
                               target=target,
                               params={})
        code, lib = exe.save()
        export_dir = tempfile.mkdtemp("tvm_export")
        # export to tempdir
        lib.export_library(os.path.join(export_dir, TVM_ASSETS[0]))
        with open(os.path.join(export_dir, TVM_ASSETS[1]), "wb") as fout:
            fout.write(code)
        vm = tvm.runtime.vm.VirtualMachine(exe, ctx)
        res = vm.run(x_data, y_data)
        ref_res = np.exp(y_data + x_data)
        tvm.testing.assert_allclose(res.numpy(), ref_res, atol=1e-5, rtol=1e-5)
        return export_dir
Example #13
def test_gemm_mma_m8n8k4_row_row_fp16fp16fp32():
    sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp32)
    arch = tvm.contrib.nvcc.get_target_compute_version()
    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
    if major < 7:
        # Require at least SM70
        return
    cuda_mod = tvm.build(sch.mod, target="cuda")

    A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16")
    B_np = np.random.uniform(-1, 1, [4, 16]).astype("float16")
    C_np = np.zeros([16, 16]).astype("float32")

    ctx = tvm.cuda()
    A_tvm = tvm.nd.array(A_np, ctx)
    B_tvm = tvm.nd.array(B_np, ctx)
    C_tvm = tvm.nd.array(C_np, ctx)

    cuda_mod(A_tvm, B_tvm, C_tvm)

    golden = np.matmul(A_np.astype("float32"), B_np.astype("float32"))

    C_numpy = C_tvm.numpy()

    tvm.testing.assert_allclose(golden, C_numpy, atol=1e-3, rtol=1e-3)
Example #14
def test_inject_async_copy():
    for dtype, vec_size in [("float16", 8), ("float16", 4), ("float32", 4),
                            ("float32", 1)]:
        if vec_size == 1:
            f = ptx_global_to_shared_copy_fp32x1
        else:
            f = generate_global_to_shared_vectorized_copy(dtype, vec_size)

        mod = tvm.IRModule.from_expr(f)
        mod = tvm.tir.transform.FlattenBuffer()(mod)
        if vec_size > 1:
            mod = tvm.tir.transform.VectorizeLoop()(mod)
        mod = tvm.tir.transform.InjectPTXAsyncCopy()(mod)

        assert count_cp_async(mod["main"].body) == 1

        if not tvm.testing.is_ampere_or_newer():
            continue

        with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}):
            mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda")

        A_np = np.random.rand(32, 128).astype(dtype)
        B_np = np.zeros((32, 128)).astype(dtype)
        dev = tvm.cuda(0)
        A_nd = tvm.nd.array(A_np, device=dev)
        B_nd = tvm.nd.array(B_np, device=dev)
        mod(A_nd, B_nd)
        tvm.testing.assert_allclose(B_nd.numpy(), A_np)
Example #15
def test_cuda_graph_executor():
    mod, params = relay.testing.synthetic.get_workload()
    with tvm.transform.PassContext(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod, "cuda", params=params)
    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")

    dev = tvm.cuda()
    try:
        gmod = compiled_graph_lib["cuda_graph_create"](dev)
    except:
        print("Skip because cuda_graph not enabled")
        return
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    set_input("data", tvm.nd.array(data))
    run()
    out = get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # cuda graph executor wrapper
    cu_gmod = cuda_graph_executor.GraphModuleCudaGraph(gmod)
    cu_gmod.set_input("data", data)
    cu_gmod.run()
    out = cu_gmod.get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
Example #16
def test_gemm_mma_m16n8k16_row_col_s8u8s32():
    sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8u8s32)
    arch = tvm.contrib.nvcc.get_target_compute_version()
    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
    if major < 8:
        # Require at least SM80
        return
    cuda_mod = tvm.build(sch.mod, target="cuda")

    A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8")
    B_np = np.random.uniform(-10, 10, [8, 16]).astype("uint8")
    C_np = np.zeros([16, 8]).astype("int32")

    ctx = tvm.cuda()
    A_tvm = tvm.nd.array(A_np, ctx)
    B_tvm = tvm.nd.array(B_np, ctx)
    C_tvm = tvm.nd.array(C_np, ctx)

    cuda_mod(A_tvm, B_tvm, C_tvm)

    golden = np.matmul(A_np.astype("int32"), B_np.astype("int32").T)

    C_numpy = C_tvm.numpy()

    tvm.testing.assert_allclose(golden, C_numpy, atol=1e-3, rtol=1e-3)
Example #17
def get_tvm_elementwise_output(
    graph,
    model_path,
    input1: flow.tensor,
    input2: flow.tensor,
    target="llvm",
    dtype="float32",
):
    """Generic function to execute and get tvm elementwise output"""
    input1_numpy = input1.numpy()
    input2_numpy = input2.numpy()
    if target == "llvm":
        device = tvm.cpu(0)
    elif target == "cuda":
        device = tvm.cuda(0)

    mod, params = relay.frontend.from_oneflow(graph, model_path)
    with tvm.transform.PassContext(opt_level=10):
        intrp = relay.build_module.create_executor("graph", mod, device,
                                                   target)
    tvm_output = intrp.evaluate()(
        tvm.nd.array(input1_numpy.astype(dtype)),
        tvm.nd.array(input2_numpy.astype(dtype)),
        **params,
    ).numpy()
    return tvm_output
Example #18
def test_fp16_build():
    dtype = "float16"

    dev = tvm.cuda(0)
    if dtype == "float16" and not have_fp16(dev.compute_version):
        print("skip because gpu does not support fp16")
        return

    x = relay.var("x", dtype=dtype, shape=(4, 4))
    y = relay.var("y", dtype=dtype, shape=(4, 4))
    z = x + y
    func = relay.Function([x, y], z)
    X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    params = {
        "x": X,
        "y": Y,
    }

    # build
    g_json, mmod, params = relay.build(func, "cuda", params=params)

    # test
    rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
    rt.load_params(runtime.save_param_dict(params))
    rt.run()
    out = rt.get_output(0)

    np.testing.assert_allclose(out.numpy(), X.numpy() + Y.numpy(), atol=1e-5, rtol=1e-5)
Example #19
def test_gpu():
    mod, params = relay.testing.synthetic.get_workload()
    with tvm.transform.PassContext(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod,
                                                      "cuda",
                                                      params=params)
    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
    dev = tvm.cuda()

    # raw api
    gmod = compiled_graph_lib["default"](dev)
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    set_input("data", tvm.nd.array(data))
    run()
    out = get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # graph executor wrapper
    gmod = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
    gmod.set_input("data", data)
    gmod.run()
    out = gmod.get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
Example #20
def eval_acc(model,
             dataset,
             batch_fn,
             target=tvm.target.cuda(),
             device=tvm.cuda(),
             log_interval=100):
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, params = relay.build(model, target)
    # create runtime module
    m = tvm.contrib.graph_executor.create(graph, lib, device)
    m.set_input(**params)

    # setup evaluation metrics
    dataset.reset()
    batch_size = dataset.batch_size
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)
    acc_top1.reset()
    acc_top5.reset()
    # Execute
    for i, batch in enumerate(dataset):
        data, label = batch_fn(batch, [mx.cpu(0)])
        m.run(data=data[0].asnumpy())
        out_arr = m.get_output(0)
        acc_top1.update(label, [mx.nd.array(out_arr.asnumpy())])
        acc_top5.update(label, [mx.nd.array(out_arr.asnumpy())])

        if not (i + 1) % log_interval:
            _, top1 = acc_top1.get()
            _, top5 = acc_top5.get()
            nsamples = (i + 1) * batch_size
            logging.info("[%d samples] validation: acc-top1=%f acc-top5=%f",
                         nsamples, top1, top5)
    logging.info("[final] validation: acc-top1=%f acc-top5=%f", top1, top5)
    return top1
Example #21
def test_bias_add():
    for dtype in ["float16", "float32"]:
        xshape = (10, 2, 3, 4)
        bshape = (2,)
        rtol = 1e-2 if dtype == "float16" else 1e-5
        x = relay.var("x", shape=xshape, dtype=dtype)
        bias = relay.var("bias", dtype=dtype)
        z = relay.nn.bias_add(x, bias)
        zz = run_infer_type(z)
        assert "axis=" not in zz.astext()
        assert zz.args[1].checked_type == relay.TensorType(bshape, dtype)

        func = relay.Function([x, bias], z)
        x_data = np.random.uniform(size=xshape).astype(dtype)
        y_data = np.random.uniform(size=bshape).astype(dtype)
        ref_res = x_data + y_data.reshape((2, 1, 1))
        for target, dev in tvm.testing.enabled_targets():
            if (
                dtype == "float16"
                and target == "cuda"
                and not have_fp16(tvm.cuda(0).compute_version)
            ):
                continue
            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
                x_data, y_data
            )
            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol)
Example #22
def test_vm_shape_of():
    if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist:
        return

    mod = tvm.IRModule()
    data_shape = (relay.Any(), )
    x = relay.var("x", shape=data_shape)
    y = relay.op.vm.shape_of(x)
    mod["main"] = relay.Function([x], y)
    ca = context_analysis(mod, tvm.cuda())
    main = mod["main"]

    cpu_dev = tvm.cpu().device_type
    gpu_dev = tvm.cuda().device_type
    assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev
    assert main.body in ca and ca[main.body][0].value == cpu_dev
Example #23
def test_cuda_lib():
    dev = tvm.cuda(0)
    for device in ["llvm", "cuda"]:
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled..." % device)
            return
    nn = 12
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n, ), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=4)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))

    from tvm.contrib import utils

    temp = utils.tempdir()
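    # The composite target compiles device code for CUDA and the host glue with
    # LLVM, which is what allows the module to be exported as a shared library.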
    fn_add = tvm.build(s, [A, B], target="cuda --host=llvm", name="add")
    path_lib = temp.relpath("deploy_lib.so")
    fn_add.export_library(path_lib)
    m = tvm.runtime.load_module(path_lib)
    a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev)
    b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev)
    m["add"](a, b)
    np.testing.assert_equal(b.numpy(), a.numpy() + 1)
Example #24
    def check_single_op(opfunc, ref, dtype):
        shape = (10, 4)
        dtype = dtype
        tp = relay.TensorType(shape)
        x = relay.var("x", tp, dtype=dtype)
        y = opfunc(x)
        # test printer
        assert ("{}(%x)".format(y.op.name)) in y.astext()
        # test type inference
        yy = run_infer_type(y)
        assert yy.checked_type == tp

        if ref is not None:
            data = np.random.rand(*shape).astype(dtype)
            ref_res = ref(data)
            func = relay.Function([x], y)
            for target, dev in tvm.testing.enabled_targets():
                # Use the graph executor by default for testing, as we need to
                # create the function explicitly to avoid constant folding.
                if (dtype == "float16" and target == "cuda"
                        and not have_fp16(tvm.cuda(0).compute_version)):
                    continue
                intrp = relay.create_executor("graph",
                                              device=dev,
                                              target=target)
                op_res = intrp.evaluate(func)(data)
                np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
Example #25
def verify_batch_matmul(Ashape,
                        Bshape,
                        Cshape,
                        in_dtype,
                        out_dtype,
                        rtol=1e-5):
    A = te.placeholder(Ashape, name="A", dtype=in_dtype)
    B = te.placeholder(Bshape, name="B", dtype=in_dtype)
    C = cublas.batch_matmul(A, B, dtype=out_dtype)
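    # cublas.batch_matmul is an extern op offloaded to cuBLAS, so the default
    # schedule is sufficient; no GPU thread binding is needed before building.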
    s = te.create_schedule(C.op)

    dev = tvm.cuda(0)
    f = tvm.build(s, [A, B, C], "cuda")

    if "int" in in_dtype:
        a = tvm.nd.array(
            np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
        b = tvm.nd.array(
            np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev)
    else:
        a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)

    c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev)
    f(a, b, c)
    tvm.testing.assert_allclose(
        c.numpy(),
        np.matmul(a.numpy().astype(C.dtype),
                  b.numpy().astype(C.dtype)).astype(C.dtype),
        rtol=rtol,
    )
Example #26
def skip_runtime_test():
    if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist:
        print("Skip because CUDA is not enabled.")
        return True
    if not tensorrt.is_tensorrt_runtime_enabled():
        print("Skip because TensorRT runtime is not available.")
        return True
    return False
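A guard like this is intended to be called at the top of each TensorRT runtime test so the test exits cleanly when CUDA or the TensorRT runtime is unavailable. A minimal, hypothetical usage sketch (the test body is illustrative only):

def test_tensorrt_example():
    # Hypothetical test; only the guard pattern is the point here.
    if skip_runtime_test():
        return
    # ... build a TensorRT-offloaded module and run it on tvm.cuda(0) ...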
Example #27
def test_concatenate():
    for dtype in ["float16", "float32"]:
        n, t, d = te.size_var("n"), te.size_var("t"), 100
        x = relay.var("x", shape=(n, t, d))
        y = relay.var("y", shape=(n, t, d))
        z = relay.concatenate((x, y), axis=-1)
        assert "axis=" in z.astext()
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        x = relay.exp(x)
        z = relay.concatenate((x, y), axis=2)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        z = relay.concatenate((x, y), axis=1)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t + t, 100))

        # Check shape mismatches (the following case is expected to raise tvm._ffi.base.TVMError).
        try:
            x = relay.var("p1", shape=(2, 5))
            y = relay.var("p2", shape=(2, 3))
            c = relay.concatenate([x, y], axis=0)
            func = relay.Function([x, y], c)
            zz = run_infer_type(func)
        except tvm._ffi.base.TVMError:
            pass
        else:
            assert False

        x = relay.var("x", shape=(10, 5), dtype=dtype)
        y = relay.var("y", shape=(10, 5), dtype=dtype)
        t = relay.var("z", shape=(), dtype=dtype)
        z = relay.concatenate((x, y), axis=1)
        z = relay.add(z, t)
        # Check result.
        func = relay.Function([x, y, t], z)
        x_data = np.random.rand(10, 5).astype(dtype)
        y_data = np.random.rand(10, 5).astype(dtype)
        t_data = np.random.uniform(size=()).astype(dtype)
        ref_res = np.concatenate((x_data, y_data), axis=1) + t_data

        for target, dev in tvm.testing.enabled_targets():
            if (
                dtype == "float16"
                and target == "cuda"
                and not have_fp16(tvm.cuda(0).compute_version)
            ):
                continue
            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01)
            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01)
Example #28
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_d = 3
    filter_h = 3
    filter_w = 3
    pad_d = 1
    pad_h = 1
    pad_w = 1
    stride_d = 1
    stride_h = 1
    stride_w = 1
    dilation_d = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    depth = 32
    height = 32
    width = 32

    # schedule
    xshape = [batch, in_channel, depth, height, width]
    wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]

    X = te.placeholder(xshape, name="X", dtype=data_dtype)
    W = te.placeholder(wshape, name="W", dtype=data_dtype)
    Y = cudnn.conv_forward(
        X,
        W,
        [pad_d, pad_h, pad_w],
        [stride_d, stride_h, stride_w],
        [dilation_d, dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=-1,
        conv_dtype=conv_dtype,
        groups=groups,
    )
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    dev = tvm.cuda(0)
    f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, dev)
    w = tvm.nd.array(w_np, dev)
    y = tvm.nd.array(y_np, dev)
    if tensor_format == 0:
        c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
    else:
        raise AssertionError(
            "For now, conv3d tensor format only supports 0 (NCDHW)")

    f(x, w, y)
    tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)
Example #29
def test_cuda_tensor_core(model_name, input_shape):
    """Integration tests of auto tensorization with CUDA tensor core"""
    target = tvm.target.Target("nvidia/geforce-rtx-3070")
    dev = tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape)
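    # Cast the fp32 workload to mixed precision so that fp16 tensor-core kernels
    # can be auto-tensorized during tuning.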
    seq = tvm.transform.Sequential(
        [
            relay.transform.ToMixedPrecision(),
        ]
    )

    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)

    def convert_layout(mod):
        seq = tvm.transform.Sequential(
            [relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]})]
        )
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)
        return mod

    with tempfile.TemporaryDirectory() as work_dir:
        with ms.Profiler() as profiler:
            rt_mod1: tvm.runtime.Module = ms.tune_relay(
                mod=convert_layout(mod),
                params=params,
                target=target,
                config=ms.TuneConfig(
                    num_trials_per_iter=32,
                    max_trials_per_task=200,
                    max_trials_global=3000,
                ),
                sch_rules=ms.default_config._DefaultCUDATensorCore.schedule_rules,
                postprocs=ms.default_config._DefaultCUDATensorCore.postprocs,
                work_dir=work_dir,
            )
        print(profiler.table())

        # Compile without MetaSchedule for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-2, atol=2e-2)
Example #30
def verify_conv2d_backward_filter(data_dtype,
                                  conv_dtype,
                                  tensor_format=0,
                                  tol=1e-5):
    batch = 3
    in_channel = 4
    out_channel = 16
    filter_h, filter_w = 3, 3
    pad_h, pad_w = 1, 1
    stride_h, stride_w = 1, 1
    height, width = 32, 32

    if tensor_format == 0:
        x_shape = [batch, in_channel, height, width]
        dy_shape = [batch, out_channel, height, width]
    else:
        x_shape = [batch, height, width, in_channel]
        dy_shape = [batch, height, width, out_channel]

    x_np = np.random.uniform(-1, 1, x_shape).astype(data_dtype)
    dy_np = np.random.uniform(-1, 1, dy_shape).astype(data_dtype)

    dw_np = tvm.topi.testing.conv2d_backward_weight_python(
        dy_np,
        x_np,
        (filter_h, filter_w),
        (stride_h, stride_w),
        (pad_h, pad_w),
        "NCHW" if tensor_format == 0 else "NHWC",
    )

    x = te.placeholder(x_shape, name="x", dtype=data_dtype)
    dy = te.placeholder(dy_shape, name="dy", dtype=data_dtype)
    dw = cudnn.conv_backward_filter(
        dy,
        x,
        (filter_h, filter_w),
        [pad_h, pad_w],
        [stride_h, stride_w],
        [1, 1],
        conv_mode=1,
        tensor_format=tensor_format,
        conv_dtype=conv_dtype,
    )

    s = te.create_schedule(dw.op)

    dev = tvm.cuda(0)
    f = tvm.build(s, [dy, x, dw],
                  "cuda --host=llvm",
                  name="conv2d_backward_filter")

    x = tvm.nd.array(x_np, dev)
    dy = tvm.nd.array(dy_np, dev)
    dw = tvm.nd.array(np.zeros_like(dw_np), dev)  # output buffer; do not pre-fill with the reference

    f(dy, x, dw)
    tvm.testing.assert_allclose(dw.numpy(), dw_np, atol=tol, rtol=tol)