def test_matmul_add():
    """Check ``mps.matmul`` followed by a scalar bias add.

    Two graphs are built from the same placeholders ``A`` (n x l) and
    ``B`` (l x m):

    * ``D1 = matmul(A, B) + bias``
    * ``D2 = matmul(B, A, True, True) + bias``

    Each graph is compiled and compared against a NumPy reference.
    """
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C1 = mps.matmul(A, B)
    # Both transpose flags set with swapped operands: presumably this is
    # B^T . A^T == (A . B)^T, i.e. an (m, n) result -- confirm against the
    # transpose semantics of mps.matmul.
    C2 = mps.matmul(B, A, True, True)
    D1 = tvm.compute(C1.shape, lambda i, j: C1[i, j] + bias, name="D1")
    D2 = tvm.compute(C2.shape, lambda i, j: C2[i, j] + bias, name="D2")
    s1 = tvm.create_schedule(D1.op)
    s2 = tvm.create_schedule(D2.op)

    def verify(A, B, D, s, bias, target="llvm", transposed=False):
        """Build ``D`` and compare its output with the NumPy reference.

        Parameters
        ----------
        A, B : placeholders used as the function inputs.
        D : output tensor of the graph being checked.
        s : schedule created for ``D``.
        bias : scalar variable added to every output element.
        target : build target string passed to ``tvm.build``.
        transposed : when True, the expected result is ``(a . b)^T + bias``
            and the output buffer is allocated as (m, n) instead of (n, m).
        """
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
            print("skip because extern function is not available")
            return
        # NOTE(review): the arrays live on the CPU context even though the
        # extern is an MPS routine -- confirm this is the intended device.
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        # The output buffer must match the shape of the tensor it is bound to.
        out_shape = (m, n) if transposed else (n, m)
        d = tvm.nd.array(np.zeros(out_shape, dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        expected = np.dot(a.asnumpy(), b.asnumpy())
        if transposed:
            expected = expected.T
        np.testing.assert_allclose(d.asnumpy(), expected + bb, rtol=1e-5)

    verify(A, B, D1, s1, bias)
    verify(A, B, D2, s2, bias, transposed=True)
def test_matmul():
    """Check ``mps.matmul`` fused with an element-wise ``+ 1.0`` on Metal.

    The add stage ``D`` is tiled 16 x 16 and bound to GPU block/thread
    indices before being built and compared against a NumPy reference.

    NOTE(review): a later definition named ``test_matmul`` in this module
    rebinds the name at import time, so this version may never be collected.
    """
    n = 1024
    l = 128
    m = 256
    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = mps.matmul(A, B)
    D = te.compute(C.shape, lambda *i: C(*i) + 1.0)
    s = te.create_schedule(D.op)

    # Split each output axis by 16 and map the outer/inner parts onto the
    # block/thread indices.
    yo, xo = D.op.axis
    block_y = te.thread_axis("blockIdx.y")
    block_x = te.thread_axis("blockIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    by, ty = s[D].split(yo, factor=16)
    bx, tx = s[D].split(xo, factor=16)
    s[D].bind(by, block_y)
    s[D].bind(bx, block_x)
    s[D].bind(ty, thread_y)
    s[D].bind(tx, thread_x)

    def verify(A, B, D, s, target="metal"):
        """Build ``D`` for ``target`` and compare with ``a . b + 1``.

        Returns early (after printing a message) when the MPS matmul
        packed function is not registered.
        """
        if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.metal(0)
        # Honour the ``target`` argument instead of a hard-coded string.
        f = tvm.build(s, [A, B, D], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        # The buffer is bound to D, so take its dtype from D.
        c = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5
        )

    verify(A, B, D, s)
def test_matmul():
    """Check ``mps.matmul`` fused with an element-wise ``+ 1.`` on Metal.

    The test is skipped (with a printed message) when the ``metal`` module
    is not enabled or the MPS matmul packed function is not registered.
    """
    if not tvm.module.enabled("metal"):
        print("skip because %s is not enabled..." % "metal")
        return
    n = 1024
    l = 128
    m = 256
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = mps.matmul(A, B)
    D = tvm.compute(C.shape, lambda *i: C(*i) + 1.)
    s = tvm.create_schedule(D.op)

    # Split each output axis by 16 and map the outer/inner parts onto the
    # block/thread indices.
    yo, xo = D.op.axis
    block_y = tvm.thread_axis("blockIdx.y")
    block_x = tvm.thread_axis("blockIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    by, ty = s[D].split(yo, factor=16)
    bx, tx = s[D].split(xo, factor=16)
    s[D].bind(by, block_y)
    s[D].bind(bx, block_x)
    s[D].bind(ty, thread_y)
    s[D].bind(tx, thread_x)

    def verify(A, B, D, s, target="metal"):
        """Build ``D`` for ``target`` and compare with ``a . b + 1``.

        Returns early (after printing a message) when the MPS matmul
        packed function is not registered.
        """
        if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.metal(0)
        # Honour the ``target`` argument instead of a hard-coded string.
        f = tvm.build(s, [A, B, D], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        # The buffer is bound to D, so take its dtype from D.
        c = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5)

    verify(A, B, D, s)