def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
    else:
        if out_dtype is None:
            out_dtype = data.dtype
        batch, in_dim = get_const_tuple(data.shape)
        out_dim, _ = get_const_tuple(weight.shape)
        # create tuning space
        cfg.define_split("tile_x", out_dim, num_outputs=2)
        cfg.define_split("tile_y", batch, num_outputs=2)
        cfg.define_split("tile_k", in_dim, num_outputs=2)
        if cfg.is_fallback:
            _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

        vec = cfg["tile_k"].size[-1]
        k = tvm.reduce_axis((0, in_dim // vec), "k")
        CC = tvm.compute((batch, out_dim, vec),
                         lambda z, y, x: tvm.sum(
                             data[z, k * vec + x].astype(out_dtype) *
                             weight[y, k * vec + x].astype(out_dtype), axis=k))

        kk = tvm.reduce_axis((0, vec), "kk")
        C = tvm.compute((batch, out_dim),
                        lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                        tag="dense_nopack")
    if bias is not None:
        # Use C.shape so the bias add also works on the cblas path,
        # where batch/out_dim are not defined.
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
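# The split reduction above is easier to follow against a NumPy reference:
# the reduction axis is tiled by `vec`, per-lane partial sums are kept in CC,
# and a second reduction over `kk` folds the lanes. A minimal sketch with
# hypothetical sizes (not part of the TVM source):
import numpy as np

batch, in_dim, out_dim, vec = 4, 8, 6, 2   # hypothetical; vec must divide in_dim
data = np.random.rand(batch, in_dim).astype("float32")
weight = np.random.rand(out_dim, in_dim).astype("float32")

# Stage 1: CC[z, y, x] = sum_k data[z, k*vec + x] * weight[y, k*vec + x]
CC = np.zeros((batch, out_dim, vec), dtype="float32")
for x in range(vec):
    CC[:, :, x] = data[:, x::vec] @ weight[:, x::vec].T

# Stage 2: fold the vec lanes; the result is the usual dense output data @ weight.T
C = CC.sum(axis=2)
np.testing.assert_allclose(C, data @ weight.T, rtol=1e-5)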
def verify_matmul_add(m, l, n, transa=False, transb=False, dtype=tvm.float32):
    bias = tvm.var('bias', dtype=dtype)
    ashape = (l, n) if transa else (n, l)
    bshape = (m, l) if transb else (l, m)
    A = tvm.placeholder(ashape, name='A', dtype=dtype)
    B = tvm.placeholder(bshape, name='B', dtype=dtype)
    C = cblas.matmul(A, B, transa, transb)
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def get_numpy(a, b, bb, transa, transb):
        if transa:
            a = a.transpose()
        if transb:
            b = b.transpose()
        return np.dot(a, b) + bb

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), bb, transa, transb),
            rtol=1e-5)
    verify()
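# A short usage sketch (these sizes are illustrative, not taken from the
# original test file): verify_matmul_add(m, l, n, transa, transb) multiplies
# an (n, l) [or (l, n)] A by an (l, m) [or (m, l)] B, so calling it once per
# transpose combination covers all four GEMM layouts.
verify_matmul_add(235, 128, 1024)
verify_matmul_add(235, 128, 1024, True, False)
verify_matmul_add(235, 128, 1024, False, True)
verify_matmul_add(235, 128, 1024, True, True)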
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(),
            np.dot(a.asnumpy(), b.asnumpy()) + bb,
            rtol=1e-5)
    verify()
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
    else:
        if out_dtype is None:
            out_dtype = data.dtype
        batch, in_dim = get_const_tuple(data.shape)
        out_dim, _ = get_const_tuple(weight.shape)
        # create tuning space
        cfg.define_split("tile_y", batch, num_outputs=3)
        cfg.define_split("tile_x", out_dim, num_outputs=3)
        cfg.define_split("tile_k", in_dim, num_outputs=2)
        if cfg.is_fallback:
            _default_dense_pack_config(cfg, batch, out_dim, in_dim)

        packw_bn = cfg["tile_x"].size[-1]
        packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
        packw = tvm.compute(packw_shape,
                            lambda z, y, x: weight[z * packw_bn + x, y],
                            name="packed_weight")

        k = tvm.reduce_axis((0, in_dim), name="k")
        C = tvm.compute((batch, out_dim),
                        lambda y, x: tvm.sum(
                            data[y, k].astype(out_dtype) *
                            packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                            axis=k),
                        tag="dense_pack")
    if bias is not None:
        # Use C.shape so the bias add also works on the cblas path,
        # where batch/out_dim are not defined.
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
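# The packed layout above can also be checked against NumPy: packw stores bn
# consecutive output channels contiguously in its innermost axis, and the
# dense result is unchanged. A minimal sketch with hypothetical sizes (not
# part of the TVM source):
import numpy as np

batch, in_dim, out_dim, bn = 4, 8, 6, 3    # hypothetical; bn must divide out_dim
data = np.random.rand(batch, in_dim).astype("float32")
weight = np.random.rand(out_dim, in_dim).astype("float32")

# packw[z, y, x] = weight[z * bn + x, y]
packw = weight.reshape(out_dim // bn, bn, in_dim).transpose(0, 2, 1)

# C[y, x] = sum_k data[y, k] * packw[x // bn, k, x % bn]
C = np.zeros((batch, out_dim), dtype="float32")
for x in range(out_dim):
    C[:, x] = data @ packw[x // bn, :, x % bn]

# Packing is only a relayout of the weight: the result is still data @ weight.T
np.testing.assert_allclose(C, data @ weight.T, rtol=1e-5)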
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(),
            np.dot(a.asnumpy(), b.asnumpy()) + bb,
            rtol=1e-5)
    verify()
def dense_cblas(cfg, data, weight, bias=None, out_dtype=None):
    """Compute dense using cblas library"""
    M, K = get_const_tuple(data.shape)
    N, _ = get_const_tuple(weight.shape)
    cfg.add_flop(M * K * N * 2)
    C = cblas.matmul(data, weight, False, True)
    if bias is not None:
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
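# The cfg.add_flop(M * K * N * 2) call above is what lets AutoTVM report
# GFLOPS for this op: one multiply and one add per (i, j, k) triple.
# A quick sanity check of the count with illustrative sizes:
M, K, N = 1024, 1024, 1024          # illustrative sizes, not from the source
flops = 2 * M * K * N               # multiply-add pairs for a full GEMM
print(flops / 1e9)                  # ~2.15 GFLOP for a 1024^3 dense layer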
def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
        if bias is not None:
            C = tvm.compute(C.shape,
                            lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                            tag=tag.BROADCAST)
        return C

    M, _ = get_const_tuple(data.shape)
    # For small batch sizes, don't pack weight into cache-friendly layout
    # because of overhead in packing and limited reuse from batch dimension
    # TODO(icemelon9): use a more systematic way to determine which schedule to use
    if M <= 16:
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)
    return _declaration_dense_pack(cfg, data, weight, bias, out_dtype)
def dense_cblas(cfg, data, weight, bias=None, out_dtype=None):
    """Compute dense using cblas library"""
    M, K = get_const_tuple(data.shape)
    N, _ = get_const_tuple(weight.shape)
    cfg.add_flop(M * K * N * 2)
    if data.dtype == 'uint8' and weight.dtype == 'int8' and out_dtype == 'int32':
        C = cblas.matmul_u8s8s32(data, weight, False, True, dtype=out_dtype)
    elif data.dtype == 'float32':
        C = cblas.matmul(data, weight, False, True)
    else:
        raise NotImplementedError(f"Dense with cblas for {data.dtype} is not supported")
    if bias is not None:
        C = te.compute(C.shape,
                       lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                       tag=tag.BROADCAST)
    return C
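# The quantized branch above dispatches to a u8 x s8 -> s32 GEMM. A NumPy
# sketch of the arithmetic that call is expected to produce (sizes are
# hypothetical; this is a reference for the result layout and dtype, not
# the MKL binding itself):
import numpy as np

M, K, N = 4, 16, 8                                                  # hypothetical sizes
data = np.random.randint(0, 256, size=(M, K), dtype="uint8")        # uint8 activations
weight = np.random.randint(-128, 128, size=(N, K), dtype="int8")    # int8 weights

# C[i, j] = sum_k data[i, k] * weight[j, k], accumulated in int32
C = data.astype("int32") @ weight.astype("int32").T
print(C.shape, C.dtype)  # (4, 8) int32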
def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
        if bias is not None:
            C = tvm.compute(C.shape,
                            lambda i, j: C[i, j] + bias[j],
                            tag=tag.BROADCAST)
        return C

    M, _ = get_const_tuple(data.shape)
    # Always use dense_nopack for dynamic input.
    # This is a temporary solution for CV models.
    # TODO(kevinthesun): use kernel dispatcher instead.
    if isinstance(M, tvm.expr.Var):
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)

    # For small batch sizes, don't pack weight into cache-friendly layout
    # because of overhead in packing and limited reuse from batch dimension
    # TODO(icemelon9): use a more systematic way to determine which schedule to use
    if M <= 16:
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)
    return _declaration_dense_pack(cfg, data, weight, bias, out_dtype)
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
f(a, b, d, bb)
tvm.testing.assert_allclose(
    d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)

######################################################################
# Extern Contrib Wrappers
# -----------------------
# TVM also provides extern contrib wrappers for useful extern calls;
# the following line is equivalent to the previous example.
#
from tvm.contrib import cblas

C = cblas.matmul(A, B)
D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
s = te.create_schedule(D.op)

######################################################################
# Hook Python Function as Extern
# ------------------------------
# Since we can call into any PackedFunc in TVM, we can use the extern
# function to call back into Python.
#
# The following example registers a Python function into the TVM runtime
# system and uses it to complete one stage of the computation.
# This makes TVM much more flexible. For example, we can insert front-end
# callbacks to inspect the intermediate results or mix customized code
# with TVM.
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
f(a, b, d, bb)
tvm.testing.assert_allclose(
    d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)

######################################################################
# Extern Contrib Wrappers
# -----------------------
# TVM also provides extern contrib wrappers for useful extern calls;
# the following line is equivalent to the previous example.
#
from tvm.contrib import cblas

C = cblas.matmul(A, B)
D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
s = tvm.create_schedule(D.op)

######################################################################
# Hook Python Function as Extern
# ------------------------------
# Since we can call into any PackedFunc in TVM, we can use the extern
# function to call back into Python.
#
# The following example registers a Python function into the TVM runtime
# system and uses it to complete one stage of the computation.
# This makes TVM much more flexible. For example, we can insert front-end
# callbacks to inspect the intermediate results or mix customized code
# with TVM.
#
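# A minimal sketch of the kind of example the comment above refers to
# (the registered name "tvm.contrib.my_tvm_addone" and the add-one
# computation are illustrative, not part of the excerpt above):

@tvm.register_func("tvm.contrib.my_tvm_addone")
def my_tvm_addone(x, y):
    # x and y arrive as TVM NDArrays; write the result into y in place.
    tvm.nd.array(x.asnumpy() + 1).copyto(y)

X = tvm.placeholder((n,), name="X")
Y = tvm.extern(X.shape, [X],
               lambda ins, outs: tvm.call_packed(
                   "tvm.contrib.my_tvm_addone", ins[0], outs[0]),
               name="Y")
s2 = tvm.create_schedule(Y.op)
f2 = tvm.build(s2, [X, Y], "llvm")
x = tvm.nd.array(np.random.uniform(size=(n,)).astype(X.dtype), ctx)
y = tvm.nd.array(np.zeros((n,), dtype=Y.dtype), ctx)
f2(x, y)
tvm.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1, rtol=1e-5)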