def check_cuda(dtype, n, lanes):
    """Build a CUDA kernel calling the __dp4a extern on vectorized operands.

    Computes D[i] = dot(A[i], B[i]) + C[i] over %sx%d-typed lanes and checks
    the device result against a NumPy reference.
    """
    # Skip when the device lacks int8 support (probed via have_int8).
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    vec_type = "%sx%d" % (dtype, lanes)
    A = te.placeholder((n,), name="A", dtype=vec_type)
    B = te.placeholder((n,), name="B", dtype=vec_type)
    C = te.placeholder((n,), name="C", dtype="int32")
    D = te.compute(
        (n,),
        lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]),
        name="D",
    )

    sch = te.create_schedule(D.op)
    outer, inner = sch[D].split(D.op.axis[0], factor=num_thread)
    sch[D].bind(outer, bx)
    sch[D].bind(inner, tx)
    kernel = tvm.build(sch, [A, B, C, D], "cuda")

    # NOTE(review): np.random.randint's `high` is exclusive, so 127 never
    # appears in the operands — confirm this matches the intended range.
    host_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    host_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    host_c = np.random.randint(low=0, high=127, size=(n,))
    expected = [sum(x * y) + z for x, y, z in zip(host_a, host_b, host_c)]

    dev = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(host_a)
    b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(host_b)
    c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(host_c)
    d = tvm.nd.empty((n,), D.dtype, dev)
    kernel(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), expected)
def check_cuda(dtype, n, lanes):
    """Compile a vectorized element-wise B[i] = A[i] + 1 kernel for CUDA and verify it."""
    # Guard clauses: require a CUDA device, the CUDA runtime, and dtype support.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    vec_type = "%sx%d" % (dtype, lanes)
    A = tvm.placeholder((n,), name='A', dtype=vec_type)
    B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')

    sch = tvm.create_schedule(B.op)
    outer, inner = sch[B].split(B.op.axis[0], factor=num_thread)
    sch[B].bind(outer, bx)
    sch[B].bind(inner, tx)
    kernel = tvm.build(sch, [A, B], "cuda")

    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes)))
    out = tvm.nd.empty((n,), B.dtype, ctx)
    kernel(a, out)
    tvm.testing.assert_allclose(out.asnumpy(), a.asnumpy() + 1)
def check_cuda(dtype, n, lanes):
    """Build a CUDA kernel calling the __dp4a extern on vectorized int8 operands.

    Computes D[i] = dot(A[i], B[i]) + C[i] and checks the device result
    against a NumPy reference.
    """
    # Guard clauses: require a CUDA device, the CUDA runtime, and int8 support.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,), lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')

    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")

    # Random operands (np.random.randint's `high` bound is exclusive) and
    # a per-element NumPy reference: dot product of the lanes plus bias.
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]

    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    # Use tvm.testing.assert_allclose for consistency with the sibling
    # check_cuda variants in this file (was np.testing.assert_allclose).
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
def check_cuda(dtype, n, lanes):
    """Lower and run a vectorized __dp4a dot-product kernel on CUDA.

    Verifies D[i] = dot(A[i], B[i]) + C[i] against a host-side reference.
    """
    # Bail out early when CUDA (or int8 on this device) is unavailable.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    lane_type = "%sx%d" % (dtype, lanes)
    A = tvm.placeholder((n,), name='A', dtype=lane_type)
    B = tvm.placeholder((n,), name='B', dtype=lane_type)
    C = tvm.placeholder((n,), name='C', dtype="int32")

    def dp4a(i):
        return tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i])

    D = tvm.compute((n,), lambda i: dp4a(i), name='D')

    sch = tvm.create_schedule(D.op)
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    outer, inner = sch[D].split(D.op.axis[0], factor=num_thread)
    sch[D].bind(outer, block_x)
    sch[D].bind(inner, thread_x)
    kernel = tvm.build(sch, [A, B, C, D], "cuda")

    host_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    host_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    host_c = np.random.randint(low=0, high=127, size=(n,))
    # Host reference: lane-wise multiply-accumulate plus the int32 bias.
    expected = [sum(x * y) + z for x, y, z in zip(host_a, host_b, host_c)]

    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(host_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(host_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(host_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    kernel(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), expected)
def check_cuda(dtype, n, lanes):
    """Verify a vectorized B[i] = A[i] + 1 kernel on the CUDA target."""
    # Skip dtypes the current device cannot handle (probed via have_fp16 /
    # have_int8 against the device compute version).
    if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version):
        print("skip because gpu does not support int8")
        return

    lane_type = "%sx%d" % (dtype, lanes)
    A = te.placeholder((n,), name="A", dtype=lane_type)
    B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")

    sch = te.create_schedule(B.op)
    outer, inner = sch[B].split(B.op.axis[0], factor=num_thread)
    sch[B].bind(outer, bx)
    sch[B].bind(inner, tx)
    kernel = tvm.build(sch, [A, B], "cuda")

    dev = tvm.cuda(0)
    a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
    out = tvm.nd.empty((n,), B.dtype, dev)
    kernel(a, out)
    tvm.testing.assert_allclose(out.numpy(), a.numpy() + 1)
def check_cuda(dtype, n, lanes):
    """Run a vectorized add-one kernel (B[i] = A[i] + 1) on CUDA and check it."""
    # Early exits: CUDA must be present/enabled and the dtype supported.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')

    sch = tvm.create_schedule(B.op)
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    outer, inner = sch[B].split(B.op.axis[0], factor=num_thread)
    sch[B].bind(outer, block_x)
    sch[B].bind(inner, thread_x)
    kernel = tvm.build(sch, [A, B], "cuda")

    ctx = tvm.gpu(0)
    host_input = np.random.uniform(size=(n, lanes))
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(host_input)
    out = tvm.nd.empty((n,), B.dtype, ctx)
    kernel(a, out)
    tvm.testing.assert_allclose(out.asnumpy(), a.asnumpy() + 1)