def check_cuda(dtype, n, lanes):
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
    B = te.placeholder((n,), name="B", dtype="%sx%d" % (dtype, lanes))
    C = te.placeholder((n,), name="C", dtype="int32")
    D = te.compute((n,),
                   lambda i: tvm.tir.call_pure_extern(
                       "int32", "__dp4a", A[i], B[i], C[i]),
                   name="D")
    s = te.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, bx)
    s[D].bind(xi, tx)
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    dev = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, dev)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
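These check_cuda helpers are inner functions from TVM unit tests, so the names num_thread, bx, tx, have_fp16, and have_int8 come from the enclosing test rather than from the snippets themselves. A minimal sketch of the surrounding context they assume, where the value of num_thread and the example call at the end are assumptions, not taken from the snippets:

import numpy as np
import tvm
import tvm.testing                  # provides tvm.testing.assert_allclose
from tvm import te
from tvm.contrib.nvcc import have_fp16, have_int8  # compute-capability checks

num_thread = 8                      # assumed threads per block for split/bind
bx = te.thread_axis("blockIdx.x")   # block index the outer loop is bound to
tx = te.thread_axis("threadIdx.x")  # thread index the inner loop is bound to

# __dp4a consumes int8x4 operands, so the dp4a variants need lanes == 4;
# n = 64 is an assumed problem size that divides evenly by num_thread.
check_cuda("int8", 64, 4)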
Example #2
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,),
                    lambda i: A[i] + tvm.const(1, A.dtype),
                    name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, bx)
    s[B].bind(xi, tx)
    fun = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, ctx)
    fun(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #3
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern(
                        "int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    np.testing.assert_allclose(d.asnumpy(), np_d)
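The np_d reference above mirrors the semantics of the CUDA __dp4a intrinsic: a 4-way int8 dot product accumulated into a 32-bit integer. A standalone NumPy sketch of that reference computation, with operand values chosen purely for illustration:

import numpy as np

# __dp4a(a, b, c) semantics: dot(int8x4 a, int8x4 b) + int32 c
a = np.array([1, -2, 3, 4], dtype=np.int8)
b = np.array([5, 6, -7, 8], dtype=np.int8)
c = np.int32(10)
result = int(np.dot(a.astype(np.int32), b.astype(np.int32))) + int(c)
print(result)  # 1*5 + (-2)*6 + 3*(-7) + 4*8 + 10 = 14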
Example #4
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern(
                        "int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
Example #5
def check_cuda(dtype, n, lanes):
    if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
    B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, bx)
    s[B].bind(xi, tx)
    fun = tvm.build(s, [A, B], "cuda")
    dev = tvm.cuda(0)
    a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, dev)
    fun(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
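Example #5 is the same vectorized-add check written against the newer device and NDArray API (tvm.cuda(0), .numpy()); it is driven the same way as the earlier variants. A hedged sketch of a typical driver loop, with the dtype/lane pairs and problem size chosen only for illustration:

# Assumed driver; num_thread, bx, and tx as in the context sketch above.
for dtype, lanes in [("float32", 2), ("float16", 2), ("int8", 4)]:
    check_cuda(dtype, 64, lanes)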
Example #6
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, ctx)
    fun(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)