def test_multiple_kernels():
    N = 1024

    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])

    s = tvm.create_schedule([C.op])

    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
def test_local_memory():
    N = 1024
    M = 128

    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    AA = s.cache_read(A, "local", [B])
    o, i = s[B].split(s[B].op.axis[0], M)
    s[AA].compute_at(s[B], o)
    s[B].bind(o, tvm.thread_axis("blockIdx.x"))

    # local memory usage: M * 4B
    # thread usage: M

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_local_memory_per_block=4 * M - 1,
                                    max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_local_memory_per_block=4 * M,
                                    max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
def test_num_thread():
    N = 1024
    M = 128

    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    o, i = s[B].split(s[B].op.axis[0], M)

    s[B].bind(o, tvm.thread_axis('threadIdx.x'))
    s[B].bind(i, tvm.thread_axis("threadIdx.y"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N,
                                    max_thread_y=M - 1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N,
                                    max_thread_y=M))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W' + t.name,
                               scope=scope_ubuf,
                               offset_factor=16)

    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd',
                                ins[0].access_ptr("r"),
                                ins[1].access_ptr('r'),
                                outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A", offset_factor=1, strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B", offset_factor=1, strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
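#####################################################################
# For orientation: a minimal sketch of how an intrinsic like the one above is
# typically consumed. The second output axis is split to match the intrinsic's
# `m`, and `tensorize` replaces the matched inner region with the
# `gemv_update` extern call. The matrix sizes and the split factor below are
# illustrative assumptions, not values taken from the surrounding code; `tvm`
# (0.x API) and the `intrin_gemv` above are assumed to be in scope.

def _tensorize_gemv_sketch():
    N, M, L = 1024, 512, 64
    A = tvm.placeholder((N, L), name='A')
    B = tvm.placeholder((M, L), name='B')
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')

    s = tvm.create_schedule(C.op)
    factor = 16
    x, y = C.op.axis
    yo, yi = s[C].split(y, factor=factor)
    # the inner region is now a (factor,)-wide GEMV over the full reduction,
    # which matches intrin_gemv(factor, L)
    s[C].tensorize(yi, intrin_gemv(factor, L))
    print(tvm.lower(s, [A, B, C], simple_mode=True))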
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype = "float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure no if-then-else statements are left in it.
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce,
                                 lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of the generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)

    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def test_llvm_madd_pipeline():
    def check_llvm(nn, base, stride):
        if not tvm.module.enabled("llvm"):
            return
        n = tvm.convert(nn)
        A = tvm.placeholder((n + base, stride), name='A')
        C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C')
        s = tvm.create_schedule(C.op)
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        s[C].parallel(xo)
        s[C].vectorize(xi)
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy()[base:] + 1)

    check_llvm(64, 0, 2)
    check_llvm(4, 0, 1)
    with tvm.build_config(restricted_func=False):
        check_llvm(4, 0, 3)
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])

    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16, offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})
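#####################################################################
# The body/reset/update triple above matters once the reduction axis itself is
# tiled: `fill_zero` resets the accumulator at the init position and
# `gemv_add` is emitted for each reduction tile. A sketch of that pattern,
# assuming the old tvm 0.x API and the `intrin_gemv` defined directly above;
# the sizes and split factors are illustrative assumptions.

def _tensorize_split_reduction_sketch():
    M, K = 1024, 256
    W = tvm.placeholder((M, K), name='W')
    X = tvm.placeholder((K,), name='X')
    k = tvm.reduce_axis((0, K), name='k')
    Z = tvm.compute((M,), lambda i: tvm.sum(W[i, k] * X[k], axis=k), name='Z')

    s = tvm.create_schedule(Z.op)
    io, ii = s[Z].split(Z.op.axis[0], factor=16)
    ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=64)
    s[Z].reorder(io, ko, ii, ki)
    # the intrinsic covers a 16 x 64 tile; with the reduction split, tensorize
    # places `reset` before the ko loop and `update` inside it
    s[Z].tensorize(ii, intrin_gemv(16, 64))
    print(tvm.lower(s, [W, X, Z], simple_mode=True))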
def test_fold_const():
    c_data = np.array([1, 2, 3]).astype("float32")

    def before():
        c = relay.const(c_data)
        x = relay.var("x")
        y = relay.add(c, c)
        y = relay.multiply(y, relay.const(2, "float32"))
        y = relay.add(x, y)
        z = relay.add(y, c)
        return relay.Function([x], z)

    def expected():
        x = relay.var("x")
        c_folded = (c_data + c_data) * 2
        y = relay.add(x, relay.const(c_folded))
        z = relay.add(y, relay.const(c_data))
        return relay.Function([x], z)

    def fail(x):
        raise RuntimeError()

    # the fold constant should work on any context.
    with tvm.build_config(add_lower_pass=[(0, fail)]):
        with tvm.target.create("cuda"):
            zz = relay.ir_pass.fold_constant(before())
    zexpected = expected()
    assert relay.ir_pass.alpha_equal(zz, zexpected)
def dp4a(x_scope='local', y_scope='local', z_scope='local'):
    """
    Int8 dot product reduced by every 4 elements using __dp4a

    Parameters
    ----------
    x_scope : str, optional
        The storage scope of buffer for lhs
    y_scope : str, optional
        The storage scope of buffer for rhs
    z_scope : str, optional
        The storage scope of buffer for result

    Returns
    -------
    intrin : TensorIntrin
        The dp4a TensorIntrin that can be used in tensorizing schedule.
    """
    n = 4  # dp4a requires operands packed by 4
    x = tvm.placeholder((n,), name='x', dtype='int8')
    y = tvm.placeholder((n,), name='y', dtype='int8')

    k = tvm.reduce_axis((0, n), name='rc')

    z = tvm.compute((1,), lambda i: tvm.sum(
        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))

    def _intrin_func(ins, outs):
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()

        return _instr(0), _instr(1), _instr(2)  # body, reset, update

    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
        scopes = {x: x_scope, y: y_scope, z: z_scope}
        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
                                    data_alignment=cfg.data_alignment,
                                    offset_factor=cfg.offset_factor,
                                    scope=scopes[t]) for t in [x, y, z]}

        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
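#####################################################################
# A NumPy reference for what the __dp4a instruction computes (not how it is
# generated): four int8 pairs are multiplied, widened to int32, summed, and
# added to an int32 accumulator. The function name and example values are
# illustrative only.

import numpy as np

def dp4a_reference(x, y, acc=0):
    """x, y: length-4 int8 sequences; acc: int32 accumulator."""
    x = np.asarray(x, dtype=np.int8).astype(np.int32)
    y = np.asarray(y, dtype=np.int8).astype(np.int32)
    return np.int32(acc) + np.sum(x * y, dtype=np.int32)

# accumulate two packed groups of 4
acc = dp4a_reference([1, -2, 3, 4], [5, 6, -7, 8])
acc = dp4a_reference([1, 1, 1, 1], [2, 2, 2, 2], acc)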
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--target', type=str, required=True, choices=['cuda', 'rocm', 'opencl', 'metal'], help="Compilation target.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.") parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.") args = parser.parse_args() opt_level = args.opt_level num_iter = args.num_iter ctx = tvm.context(args.target, 0) batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size,) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload( batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) if args.target == "cuda": unroll = 1400 else: unroll = 128 with nnvm.compiler.build_config(opt_level=opt_level): with tvm.build_config(auto_unroll_max_step=unroll, unroll_explicit=(args.target != "cuda")): graph, lib, params = nnvm.compiler.build( net, args.target, shape={"data": data_shape}, params=params) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") module = runtime.create(graph, lib, ctx) module.set_input(**params) module.set_input("data", data) module.run() out = module.get_output(0, tvm.nd.empty(out_shape)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(args.repeat): prof_res = ftimer() print(prof_res) # sleep for avoiding device overheat if i + 1 != args.repeat: time.sleep(45)
def intrin_vadd(n):
    x = tvm.placeholder((n,), name='vx')
    y = tvm.placeholder((n,), name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')

    def intrin_func(ins, outs):
        xx, yy = ins
        zz = outs[0]
        return tvm.call_packed("vadd", xx, yy, zz)

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func)
def op_intrin():
    bh = 9
    bw = 9
    x = tvm.placeholder((5, 5), name='A')
    y = tvm.compute((bh, bw), lambda i, j: x[j/3 + i%3, j%3 + i/3])

    def intrin_func(ins, outs):
        xx, = ins
        zz = outs[0]
        return tvm.call_packed("op", xx, zz)

    with tvm.build_config(offset_factor=2):
        return tvm.decl_tensor_intrin(y.op, intrin_func)
def test_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') AA = tvm.compute((n,), lambda *i: A(*i), name='A') BB = tvm.compute((n,), lambda *i: B(*i), name='B') T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') C = tvm.compute(A.shape, lambda *i: T(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, xo2 = s[C].split(xo, factor=13) s[C].parallel(xo2) s[C].pragma(xo1, "parallel_launch_point") s[C].pragma(xo2, "parallel_stride_pattern") s[C].pragma(xo2, "parallel_barrier_when_finish") s[C].vectorize(xi) def check_c(): if not tvm.module.enabled("llvm"): return # Specifically allow offset to test codepath when offset is available Ab = tvm.decl_buffer( A.shape, A.dtype, elem_offset=tvm.var('Aoffset'), offset_factor=8, name='A') binds = {A : Ab} # BUILD and invoke the kernel. f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) mhost = tvm.codegen.build_module(fsplits[0], "c") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.module.load(path_dso) fadd = m["fadd_pipeline"] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) with tvm.build_config(offset_factor=4): check_c()
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.context(device, 0)

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    with tvm.build_config(auto_unroll_max_step=128,
                          unroll_explicit=(device == 'rocm')):
        func1 = tvm.build(s1, [A, W, B], device)
        func1(a, w, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        func2 = tvm.build(s2, [A, W, C], device)
        func2(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def build_config(debug_flag=0, **kwargs):
    """Build a build config for VTA.

    Parameters
    ----------
    debug_flag : int
        The debug flag to be passed.

    kwargs : dict
        Additional configurations.

    Returns
    -------
    build_config: BuildConfig
        The build config that can be used in TVM.

    Example
    --------
    .. code-block:: python

      # build a vta module.
      with vta.build_config():
          vta_module = tvm.build(s, ...)
    """
    env = get_env()

    def add_debug(stmt):
        debug = tvm.call_extern(
            "int32", "VTASetDebugMode",
            env.dev.command_handle,
            debug_flag)
        return tvm.make.stmt_seq(debug, stmt)

    pass_list = [(1, ir_pass.inject_dma_intrin),
                 (1, ir_pass.inject_skip_copy),
                 (1, ir_pass.annotate_alu_coproc_scope),
                 (1, lambda x: tvm.ir_pass.LiftAttrScope(x, "coproc_uop_scope", True)),
                 (1, lift_coproc_scope),
                 (1, ir_pass.inject_coproc_sync),
                 (1, early_rewrite)]
    if debug_flag:
        pass_list.append((1, add_debug))
    pass_list.append((2, ir_pass.inject_alu_intrin))
    pass_list.append((3, ir_pass.fold_uop_loop))
    pass_list.append((3, ir_pass.cpu_access_rewrite))
    return tvm.build_config(add_lower_pass=pass_list, **kwargs)
def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
    with tvm.build_config(instrument_bound_checkers=True,
                          partition_const_loop=True):
        n = 21
        A = tvm.placeholder((n, ), name='A')
        B = tvm.placeholder((n, ), name='B')
        T = tvm.compute((n, ), lambda i: A[i + index_a] + B[i + index_b])
        s = tvm.create_schedule(T.op)
        xo, xi = s[T].split(T.op.axis[0], factor=4)
        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
        print(lowered_func.body)
        ctx = tvm.cpu(0)

        f = tvm.build(s, [A, B, T], "llvm")
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
        t = tvm.nd.empty((n,), T.dtype, ctx)
        f(a, b, t)
def intrin_test():
    m1 = tvm.var("m1")
    n1 = tvm.var("n1")
    a = tvm.placeholder((m1, n1), name='a')
    c = tvm.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name='c')

    Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
    Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

    def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]

        def _body():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("int32", "test",
                                    cc.access_ptr("w"),
                                    aa.access_ptr("r")))
            return ib.get()

        return _body()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})
def test_llvm_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') AA = tvm.compute((n,), lambda *i: A(*i), name='A') BB = tvm.compute((n,), lambda *i: B(*i), name='B') T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') C = tvm.compute(A.shape, lambda *i: T(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, xo2 = s[C].split(xo, factor=13) s[C].parallel(xo2) s[C].pragma(xo1, "parallel_launch_point") s[C].pragma(xo2, "parallel_stride_pattern") s[C].pragma(xo2, "parallel_barrier_when_finish") s[C].vectorize(xi) def check_llvm(): if not tvm.module.enabled("llvm"): return # Specifically allow offset to test codepath when offset is available Ab = tvm.decl_buffer( A.shape, A.dtype, elem_offset=tvm.var('Aoffset'), offset_factor=8, name='A') binds = {A : Ab} # BUILD and invoke the kernel. f = tvm.build(s, [A, B, C], "llvm", binds=binds) ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) with tvm.build_config(offset_factor=4): check_llvm()
def check_llvm(n):
    if not tvm.module.enabled("llvm"):
        return
    with tvm.build_config(instrument_bound_checkers=True):
        A = tvm.placeholder((n, ), name='A')
        scale = tvm.placeholder((), name='scale')
        k = tvm.reduce_axis((0, n), name="k")
        C = tvm.compute((), lambda: tvm.sum(A[k] * scale, axis=k), name="C")
        D = tvm.compute((), lambda: C + 1)
        s = tvm.create_schedule(D.op)
        # build and invoke the kernel.
        f = tvm.build(s, [A, scale, D], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
        sc = tvm.nd.array(
            np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
        d = tvm.nd.empty((), D.dtype, ctx)
        f(a, sc, d)
        d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
        tvm.testing.assert_allclose(d.asnumpy(), d_np)
def test_wrong_bind():
    N = 1024

    A = tvm.placeholder((N, N-1), name='A')
    B = tvm.compute((N, N-1), lambda i, j: A[i, j])

    s = tvm.create_schedule([B.op])

    # bind a thread axis to two loop axes with different lengths
    s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid, max_threads_per_block=N*N))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        s1 = topi.generic.schedule_conv2d_nchw([B])
        s2 = topi.generic.schedule_conv2d_nchw([C])
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    with tvm.build_config(auto_unroll_max_step=1400,
                          unroll_explicit=(device != "cuda")):
        func1 = tvm.build(s1, [A, W, B], device,
                          name="conv2d_%d_%d_%d_%d_%d_%d_%d" %
                          (batch, in_channel, in_size, num_filter, kernel, stride, padding))
        func2 = tvm.build(s2, [A, W, C], device,
                          name="relu_%d_%d_%d_%d_%d_%d_%d" %
                          (batch, in_channel, in_size, num_filter, kernel, stride, padding))
        func1(a, w, b)
        func2(a, w, c)
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def initialize_variables(ishape, idtype): """ Initialize variables stored in _all_var_init dictionary. Parameters ---------- ishape : dict of str to tuple of int The input shape to the graph idtype : str or dict of str to str The input types to the graph Returns ------- init_var : dict of str to tvm.ndarray """ symbol_init_dict = {} const_init_dict = {} init_var = {} for key, value in _all_var_init.items(): if isinstance(value, sym.Symbol): symbol_init_dict[key] = value else: const_init_dict[key] = tvm.nd.array(value) # Make sure variables are initialized only once. _all_var_init.clear() if symbol_init_dict: # Create dummy params to run initialization graph params = {} for name, shape in ishape.items(): dtype = idtype if isinstance(idtype, str) else idtype[name] params[name] = tvm.nd.empty(shape, dtype, ctx=tvm.cpu()) init_group_sym = sym.Group(symbol_init_dict.values()) graph = _graph.create(init_group_sym) with tvm.build_config(auto_unroll_max_step=0): init_values = _run_graph(graph, params) init_var.update(dict(zip(symbol_init_dict.keys(), init_values))) init_var.update(const_init_dict) for name, data in init_var.items(): ishape[name] = data.shape return init_var
def intrin_multivadd(n):
    n_a = tvm.var("n_a")
    Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a])
    n_b = tvm.var("n_b")
    Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b])
    n_c = tvm.var("n_c")
    Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c])

    z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd',
                                                    Ab.access_ptr("w", offset=n_a*i),
                                                    Bb.access_ptr("r", offset=n_b*i),
                                                    Cb.access_ptr("r", offset=n_c*i)))

    # replace the pattern with the multivadd call. I need to figure out
    # how to pass it the right parameters.
    def intrin_func(ins, outs):
        return tvm.call_packed("multivadd")

    with tvm.build_config():
        return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
def intrin_vadd(n):
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W' + t.name,
                               offset_factor=16)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern("float32", 'vadd',
                                ins[0].access_ptr("r"),
                                ins[1].access_ptr('r'),
                                outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func,
                                      binds={x: create_buffer(x),
                                             y: create_buffer(y),
                                             z: create_buffer(z)})
def check_device(target): with tvm.build_config( detect_global_barrier=detect_global_barrier, auto_unroll_max_step=128, unroll_explicit=False): f = tvm.build(s, [s_scan, Whh], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. res_np = np.zeros( (n_num_step, n_batch_size, n_num_hidden)).astype("float32") Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32") Whh_np[:] = 2.0 / n_num_hidden Whh_np[:, n_num_hidden//2:] = 0 res_a = tvm.nd.array(res_np, ctx) Whh_a = tvm.nd.array(Whh_np, ctx) # Skip first pass as it is compilation f(res_a, Whh_a) ctx.sync() # measure time cost of second step. tstart = time.time() f(res_a, Whh_a) ctx.sync() tgap = time.time() - tstart print("Time cost=%g" % tgap) # correctness if not SKIP_CHECK: res_gpu = res_a.asnumpy() res_cmp = np.ones_like(res_np).astype("float64") Whh_np = Whh_np.astype("float64") for t in range(1, n_num_step): res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np) for i in range(n_num_step): for j in range(n_num_hidden): if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5: print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j])) tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" % test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" % test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" % test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())
    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
def precompute_prune(graph, params):
    """Precompute the part of graph that can be pre-computed.

    This will create a new graph that only contains the ops
    that need to be computed depending on input, as well as an
    updated version of the param dict that pre-computes some of
    the intermediate results.

    Parameters
    ----------
    graph : Graph
        The input graph

    params : dict of str -> tvm.NDArray
        The parameter dictionary of the graph

    Returns
    -------
    pruned_graph : Graph
        The pruned graph

    new_params : dict of str -> tvm.NDArray
        The updated dictionary of parameters.
    """
    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
    graph._set_json_attr("param_name_list", list(params.keys()), "list_str")
    graph = graph.apply("PrecomputePrune")
    pre_graph = graph_attr._move_out_graph(graph, "precompute_graph")
    if pre_graph is None:
        return graph, params
    out_names = pre_graph.json_attr("output_names")
    if not pre_graph.symbol.list_output_names():
        return graph, params
    with tvm.build_config(auto_unroll_max_step=0):
        out_arrs = _run_graph(pre_graph, params)
    return graph, dict(zip(out_names, out_arrs))
if "vta" in target: sym = vta.graph.pack(sym, shape_dict, factor) graph_attr.set_shape_inputs(sym, shape_dict) sym = sym.apply("InferShape") graph_attr.set_dtype_inputs(sym, dtype_dict) sym = sym.apply("InferType") timers['execution_time_prepare_graph'] = time.time() - dt with nnvm.compiler.build_config(opt_level=3): bdict = {} if "vta" not in target: bdict = {"add_lower_pass": []} else: bdict = {"add_lower_pass": vta.debug_mode(0)} with tvm.build_config(**bdict): graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, dtype_dict, params=params) print("connecting ...") dt = time.time() remote = rpc.connect(host, port) temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) timers['execution_time_upload_graph'] = time.time() - dt lib = remote.load_module("graphlib.o") ctx = remote.ext_dev(0) if "vta" in target else remote.cpu(0)
def lstm(): if not PERSIST_KERNEL: raise ValueError("Non persist LSTM not yet supported") num_thread_y = 8 num_thread_x = 16 * 3 // 2 num_sm = 24 n_num_step = 128 num_step = tvm.te.var('num_step') num_hidden = 1152 // 2 batch_size = 1 # Global transition matrix # Input hidden channel can be pre-caculated by a gemm Xi2h = tvm.te.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h") # Only handle hidden transition, saves space. Wh2h = tvm.te.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. s_state_h = tvm.te.placeholder((num_step, batch_size, num_hidden)) s_state_c = tvm.te.placeholder((num_step, batch_size, num_hidden)) s_init_c = tvm.te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") s_init_h = tvm.te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # LSTM transition k = tvm.te.reduce_axis((0, num_hidden), name="ki2h") s_h2h = tvm.te.compute( (num_step, batch_size, 4, num_hidden), lambda t, i, x, j: tvm.te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules gates = tvm.te.compute(Xi2h.shape, lambda *i: Xi2h(*i) + s_h2h(*i), name="gates") gshape = (num_step, batch_size, num_hidden) in_gate = tvm.te.compute(gshape, lambda t, i, j: tvm.te.sigmoid(gates[t, i, 0, j]), name="in_gate") in_transform = tvm.te.compute( gshape, lambda t, i, j: tvm.te.tanh(gates[t, i, 1, j]), name="in_transform") forget_gate = tvm.te.compute( gshape, lambda t, i, j: tvm.te.sigmoid(gates[t, i, 2, j]), name="forget_gate") out_gate = tvm.te.compute( gshape, lambda t, i, j: tvm.te.sigmoid(gates[t, i, 3, j]), name="out_gate") next_c = tvm.te.compute( gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[ t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") next_h = tvm.te.compute( gshape, lambda t, i, j: out_gate[t, i, j] * tvm.te.tanh(next_c[t, i, j]), name="next_h") update_c = tvm.te.compute(gshape, lambda *i: next_c(*i), name="update_c") update_h = tvm.te.compute(gshape, lambda *i: next_h(*i), name="update_h") # schedule scan_h, scan_c = tvm.te.scan([s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[Xi2h], name="lstm_scan") # schedule s = tvm.te.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() s[in_transform].compute_inline() s[forget_gate].compute_inline() s[out_gate].compute_inline() block_x = tvm.te.thread_axis((0, num_sm), "blockIdx.x") thread_x = tvm.te.thread_axis((0, num_thread_x), "threadIdx.x") thread_y = tvm.te.thread_axis((0, num_thread_y), "threadIdx.y") s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h]) print(s[s_state_h_S].op.axis, s[s_state_h_S].op.reduce_axis) s_state_c_S = s.cache_read(s_state_c, "shared", [next_c]) Wh2hL = s.cache_read(Wh2h, "local", [s_h2h]) ko, ki = s[s_h2h].split(s[s_h2h].op.reduce_axis[0], nparts=num_thread_y) s_h2h_rf = s.rfactor(s_h2h, ko) print(s[s_h2h_rf].op.axis, s[s_h2h_rf].op.reduce_axis) print(s[s_h2h].op.axis, s[s_h2h].op.reduce_axis) print(s[s_h2h_rf].op.input_tensors) s[s_h2h].bind(s[s_h2h].op.reduce_axis[0], thread_y) s[s_h2h_rf].compute_at(s[s_h2h], s[s_h2h].op.reduce_axis[0]) if PERSIST_KERNEL: s[scan_h.op].env_threads([block_x, thread_y, thread_x]) s[Wh2hL].compute_at(s[scan_h.op], thread_x) else: s[Wh2hL].compute_at(s[s_h2h], s[s_h2h].op.axis[3]) if UNROLL_WLOAD: s[Wh2hL].unroll(Wh2hL.op.axis[0]) s[Wh2hL].unroll(Wh2hL.op.axis[2]) s[s_state_h_S].compute_at(s[s_h2h_rf], s[s_h2h_rf].op.axis[3]) s[s_state_c_S].compute_at(s[scan_h.op], 
s[scan_h].op.scan_axis) print(s[s_state_h_S].op.axis, s[s_state_h_S].op.reduce_axis) for ss in [s_state_h_S]: xo, xi = s[ss].split(ss.op.axis[2], factor=num_thread_x * num_thread_y) ty, xi = s[ss].split(xi, nparts=num_thread_y) tx, xi = s[ss].split(xi, nparts=num_thread_x) s[ss].bind(ty, thread_y) s[ss].bind(tx, thread_x) for init in [s_init_c, s_init_h]: bx, xi = s[init].split(init.op.axis[2], nparts=num_sm) tx, xi = s[init].split(xi, nparts=num_thread_x) s[init].bind(bx, block_x) s[init].bind(tx, thread_x) # s[next_c].set_store_predicate(thread_y.equal(0)) # s[next_h].set_store_predicate(thread_y.equal(0)) for update in [update_c, update_h]: bx, xi = s[update].split(s[update].op.axis[2], nparts=num_sm) tx, xi = s[update].split(xi, nparts=num_thread_x) s[update].bind(bx, block_x) s[update].bind(tx, thread_x) # s[update].set_store_predicate(thread_y.equal(0)) # verify we can lower correctly def check_device(target): num_step = n_num_step print(tvm.lower(s, [Xi2h, Wh2h, scan_h, scan_c], simple_mode=True)) flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. scan_h_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") scan_c_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") Xi2h_np = np.random.normal(size=(num_step, batch_size, 4, num_hidden)).astype("float32") Wh2h_np = np.random.normal(size=(4, num_hidden, num_hidden)).astype("float32") scan_h_a = tvm.nd.array(scan_h_np, ctx) scan_c_a = tvm.nd.array(scan_c_np, ctx) Xi2h_a = tvm.nd.array(Xi2h_np, ctx) Wh2h_a = tvm.nd.array(Wh2h_np, ctx) flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) ctx.sync() # measure time cost of second step. evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) print("Time cost=%g" % eval_result.mean) # set unroll_explicit for more readable code. with tvm.build_config(detect_global_barrier=DETECT_GLOBAL_BARRIER, auto_unroll_max_step=128, unroll_explicit=False): check_device("cuda")
def intrin_col2im(input_shape, output_shape, kernel, stride, pad, dtype): ''' Compute col2im via cce col2im intrin function call directly Args: input_shape: the shape of the image output_shape: the shape of the result of im2col given the input image kernel: kernel sizes for im2col stride: stride sizes for im2col pad: padding sizes for im2col, including padding top, bottom, left, and right dtype: type of the data Return: cce intrin function call for col2im ''' _, _, _, _, WINDOW_H, WINDOW_W, _ = input_shape _, _, H, W, _ = output_shape kernel_h, kernel_w = kernel stride_h, stride_w = stride pad_t, pad_b, pad_l, pad_r = pad assert ( WINDOW_H * WINDOW_W ) % 16 == 0, "Number of windows over the input must be divisible by 16 (col2im repeat)" assert ( H * W * 16) % 64 == 0, "Input size must be divisible by 64 (vector_dup repeat)" # FCOL2IMG ------------------------------------------- INPUT_W = W INPUT_H = H PAD_LEFT = pad_l PAD_RIGHT = pad_r PAD_TOP = pad_t PAD_BOTTOM = pad_b # --------------------------------------------------- # Xm ------------------------------------------------ W_IDX_KERNEL = 0 H_IDX_KERNEL = 0 H_IDX = (-pad_l) & 0xFFFF # fix negative numbers W_IDX = (-pad_t) & 0xFFFF C1_IDX = 0 # --------------------------------------------------- # Xt ------------------------------------------------ STRIDE_H = stride_h STRIDE_W = stride_w KERNEL_H = kernel_h KERNEL_W = kernel_w DILATION_H = 1 DILATION_W = 1 JUMP_OFFSET = 0 REPEAT_MODE = 1 REPEAT_TIME = (WINDOW_H * WINDOW_W) // 16 # --------------------------------------------------- INPUT_B = 1 INPUT_C1 = 1 INPUT_C0 = 16 input_data = tvm.placeholder( (INPUT_B, INPUT_C1, KERNEL_H, KERNEL_W, WINDOW_H, WINDOW_W, INPUT_C0), dtype=dtype) result = tvm.compute( (INPUT_B, INPUT_C1, INPUT_H, INPUT_W, INPUT_C0), lambda b, c1, h, w, c0: input_data[b, c1, h % KERNEL_H, w % KERNEL_W, h % WINDOW_H, w % WINDOW_W, c0], name="col2im_intrinsic", ) input_data_buff = tvm.decl_buffer(input_data.shape, input_data.dtype, name="input_data_buff", offset_factor=1, scope="local.UB") result_buff = tvm.decl_buffer(result.shape, result.dtype, name="result_buff", offset_factor=1, scope="local.UB") def pack_args(sp): assert len(sp) == 20 fcol2img = (akg.tvm.const(sp[0], "uint64") + akg.tvm.const(sp[1] * 2**16, "uint64") + akg.tvm.const(sp[2] * 2**32, "uint64") + akg.tvm.const(sp[3] * 2**40, "uint64") + akg.tvm.const(sp[4] * 2**48, "uint64") + akg.tvm.const(sp[5] * 2**56, "uint64")) Xm = (akg.tvm.const(sp[6] * 2**16, "uint64") + akg.tvm.const(sp[7] * 2**24, "uint64") + akg.tvm.const(sp[8] * 2**32, "uint64") + akg.tvm.const(sp[9] * 2**48, "uint64") + akg.tvm.const(sp[10], "uint64")) Xt = (akg.tvm.const(sp[11], "uint64") + akg.tvm.const(sp[12] * 2**6, "uint64") + akg.tvm.const(sp[13] * 2**12, "uint64") + akg.tvm.const(sp[14] * 2**20, "uint64") + akg.tvm.const(sp[15] * 2**28, "uint64") + akg.tvm.const(sp[16] * 2**36, "uint64") + akg.tvm.const(sp[17] * 2**44, "uint64") + akg.tvm.const(sp[18] * 2**52, "uint64") + akg.tvm.const(sp[19] * 2**56, "uint64")) return (fcol2img, Xm, Xt) def intrin_func(ins, outs): sp = [ INPUT_W, INPUT_H, PAD_LEFT, PAD_RIGHT, PAD_TOP, PAD_BOTTOM, # FMATRIX W_IDX_KERNEL, H_IDX_KERNEL, W_IDX, H_IDX, C1_IDX, # Xm STRIDE_W, STRIDE_H, KERNEL_W, KERNEL_H, DILATION_W, DILATION_H, JUMP_OFFSET, REPEAT_MODE, REPEAT_TIME, # Xt ] aa = ins[0] bb = outs[0] ib = tvm.ir_builder.create() fcol2img, Xm, Xt = pack_args(sp) ib.emit(tvm.call_extern(dtype, "set_fcol2img", fcol2img)) ib.emit( tvm.call_extern(dtype, "vector_dup", bb.access_ptr("w"), 0, (INPUT_H * INPUT_W * 
16) // 64, 1, 1, 8, 8)) c = 0 for kh in range(KERNEL_H): for kw in range(KERNEL_W): sp[6] = kw sp[7] = kh fcol2img, Xm, Xt = pack_args(sp) ib.emit( tvm.call_extern( dtype, "col2img", bb.access_ptr("rw"), aa.access_ptr("r", offset=c * 16 * INPUT_C0 * REPEAT_TIME), Xm, Xt, )) c += 1 return ib.get() with tvm.build_config(offset_factor=1): return tvm.decl_tensor_intrin(result.op, intrin_func, binds={ input_data: input_data_buff, result: result_buff })
def dot_16x1x16_int8_int8_int32(): """ Int8 dot product by every 4 elements using AVX2 Skylake instructions. This function takes two arrays of int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ out[i] = 0; for (int k = 0; k < 4; k++){ out[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data') kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') k = tvm.reduce_axis((0, num_int8_elements), name='k') C = tvm.compute((int32_lanes,), lambda i: tvm.sum(data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k), name="C") a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", offset_factor=1, strides=[tvm.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vec_a, vec_b) quad_reduction = tvm.call_llvm_intrin('int32x16', 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--target', type=str, required=True, choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'], help="Compilation target.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.") parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.") args = parser.parse_args() opt_level = args.opt_level num_iter = args.num_iter ctx = tvm.context(args.target, 0) batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size, ) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload(batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) if args.target == "cuda": unroll = 1400 else: unroll = 128 with nnvm.compiler.build_config(opt_level=opt_level): with tvm.build_config(auto_unroll_max_step=unroll, unroll_explicit=(args.target != "cuda")): graph, lib, params = nnvm.compiler.build( net, args.target, shape={"data": data_shape}, params=params) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") module = runtime.create(graph, lib, ctx) module.set_input(**params) module.set_input("data", data) module.run() out = module.get_output(0, tvm.nd.empty(out_shape)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(args.repeat): prof_res = ftimer() print(prof_res) # sleep for avoiding device overheat if i + 1 != args.repeat: time.sleep(45)
def intrin_libxsmm_tuned(ofmblock, ofw, ifmblock, stride_width, ifw, rco, ifh, r, s, ifh_stride, ifw_stride, in_channel): last_input_width_index = (ofw - 1) * stride_width + s - 1 A = tvm.placeholder((rco, r, s, ifmblock, ofmblock), name='w') B = tvm.placeholder((rco, r, last_input_width_index + 1, ifmblock), name='b') k = tvm.reduce_axis((0, ifmblock), name='k') k_outer = tvm.reduce_axis((0, rco), name='k_outer') ry = tvm.reduce_axis((0, r), name='ry') rx = tvm.reduce_axis((0, s), name='rx') C = tvm.compute((ofw, ofmblock), lambda m, n: tvm.sum(A[k_outer, ry, rx, k, n] * B[ k_outer, ry, rx + m * stride_width, k], axis=[k_outer, ry, rx, k]), name='out') s1 = tvm.create_schedule(C.op) w, ofm = s1[C].op.axis kco, ky, kx, kci = s1[C].op.reduce_axis s1[C].reorder(kco, ky, kx, w, ofm, kci) xx_ptr = tvm.decl_buffer(A.shape, A.dtype, name="W", offset_factor=1, data_alignment=64) yy_ptr = tvm.decl_buffer( B.shape, B.dtype, name="some", offset_factor=1, strides=[tvm.var("s3"), tvm.var("s2"), ifmblock, 1], data_alignment=64) zz_ptr = tvm.decl_buffer(C.shape, C.dtype, name="OUT", offset_factor=1, data_alignment=64) def intrin_func(ins, outs): # tvm call extern is used to interface to libxsmm batch reduce kernel gemm implementation # rco*r*s is the number of batches init_and_compute = tvm.call_extern ("int32","batch_reduce_kernel_init_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"),\ rco*r*s,ofmblock,ifmblock,r,s,ifh_stride,ifw_stride, ofw, stride_width) reset = tvm.call_extern("int32", "batch_reduce_kernel_init", outs[0].access_ptr("w"), ofmblock, ofw) body = tvm.call_extern ("int32","batch_reduce_kernel_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"), rco*r*s,ofmblock,\ ifmblock,ofw, stride_width,r,s, ifh_stride,ifw_stride) if math.ceil(in_channel / ifmblock) == rco: return init_and_compute, None, init_and_compute else: return init_and_compute, reset, body with tvm.build_config(data_alignment=64): return tvm.decl_tensor_intrin(C.op, intrin_func, name="GEMM", binds={ A: xx_ptr, B: yy_ptr, C: zz_ptr })
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the
# latency of the convolution. Since TensorCores are only supported in NVIDIA
# GPUs with Compute Capability 7.0 or higher, it may not be possible to run
# this on our build server.

ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    with tvm.build_config(auto_unroll_max_step=16):
        func = tvm.build(s, [A, W, Conv], 'cuda')
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

###############################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# call TensorCores on specific GPUs.
def dot_int8_int8_int32(int32_lanes, dtype='uint'): """ Int8 dot product by every 4 elements using ARM v8.2 udot. This function takes two arrays of int8 datatype -- data[4] and kernel[int32_lanes][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[int32_lanes] of uint32 datatype. The pseudo code is as follows. .. code-block:: c void dot_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < int32_lanes; i++){ out[i] = 0; for (int k = 0; k < 4; k++){ out[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in a vector register and the data[4] is broadcasted to another vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Parameters ---------- int32_lanes: int How many int32/uint32 to produce dtype: str, optional, {"uint", "int"} Whether it works on unsigned int or signed int Returns ------- intrin : TensorIntrin The ARM uint8 TensorIntrin that can be used in tensorizing schedule """ num_int8_elements = 4 # 4 int8 elements in int32 data = tvm.placeholder((num_int8_elements, ), dtype='%s8' % dtype, name='data') kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel') k = tvm.reduce_axis((0, num_int8_elements), name='k') C = tvm.compute((int32_lanes, ), lambda i: tvm.sum(data[k].astype('%s32' % dtype) * kernel[ i, k].astype('%s32' % dtype), axis=k), name="C") a_buffer = tvm.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer", offset_factor=1, strides=[tvm.var('s'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.ir_builder.create() if index == 1: ib.emit(outs[0].vstore( 0, tvm.const(0, '%s32x%d' % (dtype, int32_lanes)))) return ib.get() dtype_a = '%s8x%d' % (dtype, num_int8_elements) dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements) dtype_c = '%s32x%d' % (dtype, int32_lanes) a_int8 = ins[0].vload([0], dtype_a) re_int32 = tvm.call_pure_intrin('%s32' % dtype, 'reinterpret', a_int8) # broadcast a vec_ai32 = re_int32.astype(dtype_c) vec_a = tvm.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], dtype_b) vec_c = outs[0].vload([0], dtype_c) inst = 'udot' if dtype == 'uint' else 'sdot' inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % ( inst, int32_lanes, int32_lanes * num_int8_elements) vdot = tvm.call_llvm_intrin(dtype_c, inst, tvm.const(2, 'uint32'), vec_c, vec_a, vec_b) ib.emit(outs[0].vstore(0, vdot)) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer })
def tune_and_evaluate(M, N, L, dtype, layout): task = autotvm.task.create(test_gemm, args=(N, L, M, dtype, layout), target='cuda') print(task.config_space) logging.getLogger('autotvm').setLevel(logging.DEBUG) logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout)) measure_option = autotvm.measure_option( builder='local', runner=autotvm.LocalRunner(number=5)) tuner = autotvm.tuner.XGBTuner(task) tuner.tune(n_trial=1000, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('matmul.log')]) dispatch_context = autotvm.apply_history_best("matmul.log") best_config = dispatch_context.query(task.target, task.workload) print("\nBest config:") print(best_config) with autotvm.apply_history_best('matmul.log'): with tvm.target.create("cuda"): with tvm.build_config(): s, arg_bufs = test_gemm(N, L, M, dtype, layout) print(tvm.lower(s, arg_bufs, simple_mode=True)) func = tvm.build(s, arg_bufs) dev_module = func.imported_modules[0] print(dev_module.get_source()) # check correctness if (layout == "NN"): shape_a = (N, L) shape_b = (L, M) elif (layout == "NT"): shape_a = (L, N) shape_b = (L, M) elif (layout == "TN"): shape_a = (N, L) shape_b = (M, L) elif (layout == "TT"): shape_a = (L, N) shape_b = (M, L) a_np = None b_np = None c_np = None c_np_type = None if dtype == 'float16': c_np_type = np.float32 a_np = np.random.uniform(size=shape_a).astype(np.float16) b_np = np.random.uniform(size=shape_b).astype(np.float16) if (layout == "NN"): c_np = np.dot(a_np, b_np) elif (layout == "NT"): c_np = np.dot(a_np.T, b_np) elif (layout == "TN"): c_np = np.dot(a_np, b_np.T) elif (layout == "TT"): c_np = np.dot(a_np.T, b_np.T) elif dtype == 'int8': c_np_type = np.int32 a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8) b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8) if (layout == "NN"): c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32)) elif (layout == "NT"): c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32)) elif (layout == "TN"): c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T) elif (layout == "TT"): c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T) c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx) a_tvm = tvm.nd.array(a_np, ctx=ctx) b_tvm = tvm.nd.array(b_np, ctx=ctx) func(a_tvm, b_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3) evaluator = func.time_evaluator(func.entry_name, ctx, number=100) print('Time cost of this operator: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)
#####################################################################
# In TVM, there is a property called ``BuildConfig``. You can use this property
# to customize your own lowering options. Here we inject the pass written above
# into the TVM standard lowering flow by passing **a list of tuples** to
# ``add_lower_pass``; the first element of each tuple selects the lowering
# phase after which the pass runs. In TVM there are four phases of lowering,
# and user-customized passes are called after each phase is done.
#
# .. note::
#     Here are the essential transformations done by each phase:
#       - Phase 0 generates the raw IR and loop levels.
#       - Phase 1 flattens the array storage.
#       - Phase 2 transforms loops, like unroll, vectorization and thread-binding.
#       - Phase 3 does some cleanup work.
#
# Thus, a good place to put this transformation pass is just after Phase 1.
#

with tvm.build_config(add_lower_pass=[(1, vectorize)]) as cfg:
    print(tvm.lower(sch, [a, b, c], simple_mode=True))

#####################################################################
# Quick View
# ----------
# This tutorial gives a quick view of writing a customized IR transformation pass:
#
# - Use ``tvm.ir_pass.PostOrderVisit`` to gather information on each IR node.
# - Use ``tvm.ir_pass.IRTransform`` to transform IR nodes.
# - Wrap up the two above to write an IR-transformation function.
# - Use ``tvm.build_config`` to put this function into the TVM lowering passes.
#
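#####################################################################
# Registering passes at several phases follows the same (phase, function)
# tuple pattern. A sketch, reusing ``vectorize``, ``sch``, ``a``, ``b`` and
# ``c`` from above; ``loop_cleanup`` is a hypothetical placeholder for a user
# pass that runs after Phase 3.

def loop_cleanup(stmt):
    # hypothetical no-op pass: return the statement unchanged
    return stmt

with tvm.build_config(add_lower_pass=[(1, vectorize), (3, loop_cleanup)]):
    print(tvm.lower(sch, [a, b, c], simple_mode=True))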
def check_device(): A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size), name='W') out_dtype = 'float32' wkl = _get_workload(A, W, stride, padding, out_dtype) sch = Im2ColPack(7, 8, 1, 8, True) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() # device = 'llvm' device = 'llvm -mcpu=skylake-avx512' ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype) s = tvm.create_schedule(B.op) traverse(s, B.op) op = B.op output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data_col = data_vec.op.input_tensors[0] data = data_col.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col, data_vec, kernel, kernel_vec, conv_out, output, B) print(tvm.lower(s, [A, W, B], simple_mode=True)) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a, w, b).mean print('conv: %g secs/op' % cost) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print(b_np.shape)
def check_device(): A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') out_dtype = 'float32' wkl, sch_default = _spatial_get_sch(A, W, stride, padding, out_dtype) sch = sch_default if schedule is None else schedule a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() # device = 'llvm' device = 'llvm -mcpu=skylake-avx512' ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): print('--- schedule data packing ---') A_vec, s = _spatial_pack_data_only(wkl, sch, A) print(A_vec.shape) a_vec_shape = get_const_tuple(A_vec.shape) a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx) print(tvm.lower(s, [A, A_vec], simple_mode=True)) func = tvm.build(s, [A, A_vec], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a, a_vec).mean print('data -> data_vec: %g secs/op' % cost) print('--- schedule kernel packing ---') W_vec, s = _spatial_pack_kernel_only(wkl, sch, W) print(W_vec.shape) w_vec_shape = get_const_tuple(W_vec.shape) w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx) # print(tvm.lower(s, [W, W_vec], simple_mode=True)) func = tvm.build(s, [W, W_vec], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(w, w_vec).mean print('kernel -> kernel_vec: %g secs/op' % cost) print('--- schedule conv & unpack ---') A_vec = tvm.placeholder(a_vec_shape, name='A_vec') W_vec = tvm.placeholder(w_vec_shape, name='W_vec') B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) # print(tvm.lower(s, [A_vec, W_vec, B], simple_mode=True)) func = tvm.build(s, [A_vec, W_vec, B], target=device) func.save('conv_unpack.asm') time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a_vec, w_vec, b).mean print('conv & unpack: %g secs/op' % cost) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print(b_np.shape)
def test_gemm(): # graph nn = 2048 n = tvm.var('n') n = tvm.convert(nn) m, l = n, n A = tvm.placeholder((l, n), name='A') B = tvm.placeholder((l, m), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((m, n), lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k), name='C') # schedule s = tvm.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_thread) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis ko, ki = s[CC].split(k, factor=8) kt, ki = s[CC].split(ki, factor=1) s[CC].reorder(ko, kt, ki, yo, xo) s[AA].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko) s[CC].unroll(kt) s[AL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt) # Schedule for A's shared memory load ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].vectorize(xi) # Schedule for B' shared memory load ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].vectorize(xi) s[AA].double_buffer() s[BB].double_buffer() # correctness def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) f = tvm.build(s, [A, B, C], device) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) for i in range(2): f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS)) for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]: with tvm.build_config(auto_unroll_max_step=128, unroll_explicit=(device != "cuda")): check_device(device)
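# A small worked example (plain Python arithmetic, illustrative rather than TVM
# code) of the launch geometry the schedule in test_gemm above implies for
# nn = 2048: each block covers a block_factor x block_factor output tile, split
# over 2 x 2 virtual threads and 8 x 8 real threads.
def _gemm_launch_geometry(nn=2048, scale=8, num_thread=8, vthread=2):
    block_factor = scale * num_thread                    # 64 outputs per block per dim
    grid = nn // block_factor                            # 32 blocks along y and along x
    per_thread = block_factor // vthread // num_thread   # 4 contiguous outputs per thread per vthread
    assert grid == 32 and per_thread == 4
    print("grid %dx%d, %dx%d threads, %dx%d vthreads, %d elems/thread/vthread/dim"
          % (grid, grid, num_thread, num_thread, vthread, vthread, per_thread))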
def _intrin_popcount(m, k_i, w_b, x_b, unipolar): pack_dtype = 'uint8' w = tvm.placeholder((w_b, m, k_i), dtype=pack_dtype, name='w') x = tvm.placeholder(( x_b, k_i, ), dtype=pack_dtype, name='x') k = tvm.reduce_axis((0, k_i), name='k') bw = tvm.reduce_axis((0, w_b), name='bw') bx = tvm.reduce_axis((0, x_b), name='bx') if unipolar: dtype = 'int16' z = tvm.compute( (m, ), lambda i: tvm.sum((tvm.popcount(w[bw, i, k].astype(dtype) & x[ bx, k].astype(dtype)) - tvm.popcount(~w[bw, i, k].astype( dtype) & x[bx, k].astype(dtype))) << (bw + bx).astype(dtype), axis=[bw, bx, k]), name='z') else: dtype = 'uint16' z = tvm.compute((m, ), lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype( dtype) & x[bx, k].astype(dtype)) << (bw + bx).astype(dtype), axis=[bw, bx, k]), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", offset_factor=k_i, strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) # stride can be inferred Xb = tvm.decl_buffer(x.shape, x.dtype, name="X", offset_factor=k_i, strides=[tvm.var('ldw'), 1]) Zb = tvm.decl_buffer(z.shape, z.dtype, name="Z", offset_factor=1, strides=[1]) def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') if unipolar: vpadd = "llvm.arm.neon.vpadd.v8i8" vpadalu = "llvm.arm.neon.vpadals.v16i8.v8i16" full_dtype = 'int8x16' half_dtype = 'int8x8' return_dtype = 'int16x8' else: vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" full_dtype = 'uint8x16' half_dtype = 'uint8x8' return_dtype = 'uint16x8' def _instr(index): irb = tvm.ir_builder.create() if index == 1: # reduce reset irb.emit(zz.vstore(0, tvm.const(0, return_dtype))) return irb.get() # body and reduce update cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): w_ = ww.vload([bw, i, 0], 'uint8x16').astype(full_dtype) x_ = xx.vload([bx, 0], 'uint8x16').astype(full_dtype) if unipolar: cnts = tvm.popcount(w_ & x_) - tvm.popcount(~w_ & x_) else: cnts = tvm.popcount(w_ & x_) upper_half = tvm.call_pure_intrin( half_dtype, 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin( half_dtype, 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( half_dtype, vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( half_dtype, vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin(full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, pack_dtype) out = tvm.call_llvm_intrin(return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts) else: # ki == 8 for i in range(m): w_ = ww.vload([bw, i, 0], 'uint8x8').astype(half_dtype) x_ = xx.vload([bx, 0], 'uint8x8').astype(half_dtype) if unipolar: cnts8[i] = tvm.popcount( w_ & x_) - tvm.popcount(~w_ & x_) else: cnts8[i] = tvm.popcount(w_ & x_) for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( half_dtype, vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( half_dtype, vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin(full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, pack_dtype) out = tvm.call_llvm_intrin(return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, 
partition_const_loop=True): return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={ w: Wb, x: Xb, z: Zb })
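# Illustrative NumPy check (added; not original code, assumes numpy imported as
# np) of the identity the unipolar branch above relies on: with weight signs
# packed as bits (1 -> +1, 0 -> -1) and binary activations, the signed dot
# product equals popcount(w & x) - popcount(~w & x). A single bit-plane is used,
# so the (bw + bx) shift in the intrinsic is not exercised here.
def _unipolar_popcount_identity_check():
    rng = np.random.RandomState(0)
    w_bits = rng.randint(0, 2, 64).astype(np.uint8)   # weight sign bits
    x_bits = rng.randint(0, 2, 64).astype(np.uint8)   # activation bits
    w_packed = np.packbits(w_bits)
    x_packed = np.packbits(x_bits)

    def popcount8(v):
        # per-byte popcount of a uint8 array
        return np.unpackbits(v).reshape(-1, 8).sum(axis=1)

    ref = int((np.where(w_bits == 1, 1, -1) * x_bits).sum())
    got = int(popcount8(w_packed & x_packed).sum()) - \
          int(popcount8(~w_packed & x_packed).sum())
    assert ref == got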
def test_gemm(): # graph nn = 2048 n = tvm.var('n') n = tvm.convert(nn) m, l = n, n A = tvm.placeholder((l, n), name='A') B = tvm.placeholder((l, m), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((m, n), lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k), name='C') # schedule s = tvm.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_thread) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis ko, ki = s[CC].split(k, factor=8) kt, ki = s[CC].split(ki, factor=1) s[CC].reorder(ko, kt, ki, yo, xo) s[AA].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko) s[AL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt) # Schedule for A's shared memory load ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].vectorize(xi) # Schedule for B' shared memory load ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].vectorize(xi) # correctness def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return f = tvm.build(s, [A, B, C], device) ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) for i in range(2): f(a, b, c) np.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) with tvm.build_config(auto_unroll_max_step=32, auto_unroll_min_depth=0, unroll_explicit=False): check_device("cuda")
def test_depthwise_conv2d_nchw(): """You may test different settings.""" batch = 1 in_channel = 256 in_height = 96 in_width = 96 filter_channel = in_channel channel_multiplier = 1 filter_height = 3 filter_width = 3 stride_h = 1 stride_w = 1 padding = 'SAME' # or 'VALID' # Placeholder Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input') Filter = tvm.placeholder( (filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') Stride = [stride_h, stride_w] Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale') Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift') # Declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, Stride, padding) ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # Schedule s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d) s2 = schedule_depthwise_conv2d_nchw(ScaleShift) s3 = schedule_depthwise_conv2d_nchw(Relu) input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype( Input.dtype) filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype( Filter.dtype) scale_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Scale.dtype) shift_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Shift.dtype) def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # Measure time cost of kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) print("Stride = (%d, %d)" % (stride_h, stride_w)) print("padding = %s\n" % padding) print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape))) print("average time cost of 1000 runs (depthwise_conv2d) = %g us" % (tcost_1 * 1e6)) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" % (tcost_2 * 1e6)) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" % (tcost_3 * 1e6)) # correctness depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride=[stride_h, stride_w], padding=padding) 
scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape)) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[ c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print("success") for device in ['cuda', 'opencl', 'rocm']: with tvm.build_config(auto_unroll_max_step=128, unroll_explicit=device == 'rocm', detect_global_barrier=False, restricted_func=True): check_device(device)
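# Side note (illustrative and self-contained, not part of the test above): the
# per-channel Python loop that builds scale_shift_scipy is equivalent to a single
# NumPy broadcast once scale/shift are reshaped to (1, C, 1, 1).
def _scale_shift_broadcast_check():
    conv = np.random.uniform(size=(1, 4, 5, 5)).astype('float32')
    scale = np.random.uniform(size=4).astype('float32')
    shift = np.random.uniform(size=4).astype('float32')
    loop_ref = np.zeros_like(conv)
    for c in range(4):
        loop_ref[:, c, :, :] = conv[:, c, :, :] * scale[c] + shift[c]
    broadcast_ref = conv * scale.reshape(1, -1, 1, 1) + shift.reshape(1, -1, 1, 1)
    np.testing.assert_allclose(loop_ref, broadcast_ref, rtol=1e-6)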
    print(stmt)
    # build and invoke the kernel.
    f = tvm.build(s, [A, scale, D], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.array(np.random.randint(0, 2, size=(n, )).astype(A.dtype), ctx)
    sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
    d = tvm.nd.empty((), D.dtype, ctx)
    f(a, sc, d)
    d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
    tvm.testing.assert_allclose(d.asnumpy(), d_np)


if __name__ == "__main__":
    with tvm.build_config(instrument_bound_checkers=True):
        # zero scale
        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
        # in bound
        test_in_bounds_llvm()
        # upper bound
        test_out_of_bounds_llvm(1, 0)
        test_out_of_bounds_llvm(0, 1)
        test_out_of_bounds_llvm(1, 1)
        test_out_of_bounds_llvm(10000, 0)
        test_out_of_bounds_llvm(0, 10000)
        test_out_of_bounds_llvm(10000, 10000)
        # lower bound
        test_out_of_bounds_llvm(-1, 0)
        test_out_of_bounds_llvm(0, -1)
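# Minimal sketch (an assumption about the shape of the tests invoked above, not
# their actual bodies; helper name and details are illustrative only): with
# instrument_bound_checkers=True, a kernel that indexes past the end of its
# input should fail at run time instead of silently reading out of bounds.
def _bound_checker_sketch(n=1024, off=1):
    A = tvm.placeholder((n,), name='A')
    C = tvm.compute((n,), lambda i: A[i + off], name='C')  # reads A[n] at i = n - 1
    s = tvm.create_schedule(C.op)
    with tvm.build_config(instrument_bound_checkers=True):
        f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    try:
        f(a, c)
        raise AssertionError("expected the instrumented bound check to fire")
    except tvm.TVMError:
        pass  # the out-of-bounds load was trapped by the inserted check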
sch[conv].unroll(rwi) in_cache = sch.cache_read(image, 'global', [conv]) sch[in_cache].compute_at(sch[conv], w) axis = sch[in_cache].fuse(in_cache.op.axis[3], in_cache.op.axis[4]) sch[in_cache].vectorize(axis) #sch[conv].parallel(h) sch[conv].reorder(n, c0, h, rh, c1o, rco, rwo, w, rwi, c1i, rci) sch[conv].pragma(c1i, 'vnni') print(tvm.lower(sch, [image, kernel, conv], simple_mode=True)) answer_ref = tvm.build(sch, [image, kernel, conv]) import vnni with tvm.build_config(add_lower_pass=[(1, vnni.vnni_transformation)]): print(tvm.lower(sch, [image, kernel, conv], simple_mode=True)) module = tvm.build(sch, [image, kernel, conv], target='llvm -mcpu=cascadelake') shapes = [i.shape for i in [image, kernel]] shapes = [list(map(lambda x: x.value, i)) for i in shapes] out_shape = list(map(lambda x: x.value, conv.shape)) types = ['int8', 'int8', 'int32'] args = [ tvm.ndarray.array(np.random.randint(0, 127, i, j)) for i, j in zip(shapes, types) ] out = tvm.ndarray.array(np.zeros(out_shape).astype('int32')) ans = tvm.ndarray.array(np.zeros(out_shape).astype('int32'))
def dp4a(x_scope='local', y_scope='local', z_scope='local'): """ Int8 dot product reduced by every 4 elements using __dp4a Parameters ---------- x_scope : str, optional The storage scope of buffer for lhs y_scope : str, optional The storage scope of buffer for rhs z_scope : str, optional The storage scope of buffer for result Returns ------- intrin : TensorIntrin The dp4a TensorIntrin that can be used in tensorizing schedule. """ n = 4 # dp4a requires operands packed by 4 x = tvm.placeholder((n, ), name='x', dtype='int8') y = tvm.placeholder((n, ), name='y', dtype='int8') k = tvm.reduce_axis((0, n), name='rc') z = tvm.compute( (1, ), lambda i: tvm.sum(x[k].astype('int32') * y[k].astype('int32'), axis=[k])) def _intrin_func(ins, outs): def _instr(index): xx, yy = ins zz = outs[0] if index == 1: return zz.vstore(0, 0) ib = tvm.ir_builder.create() vec_x = xx.vload(0, dtype='int8x4') vec_y = yy.vload(0, dtype='int8x4') prev_z = 0 if index == 0 else zz.vload(0) new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) ib.emit(zz.vstore(0, new_z)) return ib.get() return _instr(0), _instr(1), _instr(2) # body, reset, update with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: scopes = {x: x_scope, y: y_scope, z: z_scope} binds = { t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, data_alignment=cfg.data_alignment, offset_factor=cfg.offset_factor, scope=scopes[t]) for t in [x, y, z] } return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
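# Plain NumPy illustration (not TVM code; assumes numpy imported as np) of what
# a single __dp4a step in the intrinsic above computes: four int8 x int8
# products accumulated onto a running int32 value.
def _dp4a_reference():
    x4 = np.array([1, -2, 3, -4], dtype=np.int8)
    y4 = np.array([5, 6, -7, 8], dtype=np.int8)
    acc = np.int32(100)
    acc = acc + (x4.astype(np.int32) * y4.astype(np.int32)).sum()
    assert acc == 40   # 100 + (5 - 12 - 21 - 32)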
        tvm.make.For(j, 0, 8, 3, 0,
                     tvm.make.Store(Ab.data,
                                    tvm.make.Load(dtype, Ab.data, i) + 1,
                                    j + 1)))
    assert isinstance(stmt, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True)
    assert not isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False)
    assert isinstance(ret, tvm.stmt.For)
    assert ret.for_type == tvm.stmt.For.Unrolled


if __name__ == "__main__":
    with tvm.build_config(dump_pass_ir=True):
        test_unroll_loop()

    def end_with(*suffix):
        ends = suffix

        def run(s):
            if any(s.endswith(e) for e in ends):
                return s

        return run

    file_list = os.listdir('./')
    cc_file = end_with('.cc')
    # list() is needed so len() works under Python 3, where filter() is lazy
    cc_file = list(filter(cc_file, file_list))
    assert len(cc_file) == 3
    for i in cc_file:
        os.remove(i)
from __future__ import absolute_import, print_function

import tvm
import numpy as np

tgt_host = "llvm"
# tgt = "llvm"
tgt = "c"

n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.placeholder((n, ), name='B')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
print(type(C))

s = tvm.create_schedule(C.op)
bx, tx = s[C].split(C.op.axis[0], factor=64)

with tvm.build_config(dump_pass_ir=True):
    fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

print(fadd.get_source())
print("finished.")
def intrinsic_gemm(i, j, k, il, jl, kl, ic, jc, kc): """ (i, k) * (k, j) i, j, k: normal iteration size il, jl, kl: last iteration size ic, jc, kc: last iteration condition """ assert i * k + k * j <= 256 * 1024, 'input too large for scratchpad' assert 4 * (i * j) <= 64 * 1024, 'input too large for accumulator' a = tvm.placeholder((i, k), name='a', dtype=dtype) b = tvm.placeholder((k, j), name='b', dtype=dtype) kk = tvm.reduce_axis((0, k), name='k') c = tvm.compute((i, j), lambda ii, jj: tvm.sum(a[ii, kk] * b[kk, jj], axis=kk), name='c') strideA = tvm.var("sA") Ab = tvm.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[strideA, 1]) strideB = tvm.var("sB") Bb = tvm.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[strideB, 1]) strideC = tvm.var("sC") Cb = tvm.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[strideC, 1]) II = i // DIM + (0 if i % DIM == 0 else 1) JJ = j // DIM + (0 if j % DIM == 0 else 1) KK = k // DIM + (0 if k % DIM == 0 else 1) pad_I = 0 if i % DIM == 0 else (DIM - i % DIM) pad_J = 0 if j % DIM == 0 else (DIM - j % DIM) pad_K = 0 if k % DIM == 0 else (DIM - k % DIM) IIl = il // DIM + (0 if il % DIM == 0 else 1) JJl = jl // DIM + (0 if jl % DIM == 0 else 1) KKl = kl // DIM + (0 if kl % DIM == 0 else 1) pad_Il = 0 if il % DIM == 0 else (DIM - il % DIM) pad_Jl = 0 if jl % DIM == 0 else (DIM - jl % DIM) pad_Kl = 0 if kl % DIM == 0 else (DIM - kl % DIM) II = tvm.if_then_else(ic, IIl, II) JJ = tvm.if_then_else(jc, JJl, JJ) KK = tvm.if_then_else(kc, KKl, KK) pad_I = tvm.if_then_else(ic, pad_Il, pad_I) pad_J = tvm.if_then_else(jc, pad_Jl, pad_J) pad_K = tvm.if_then_else(kc, pad_Kl, pad_K) # reset-update-finalize def intrin_func(ins, outs): aa, bb = ins cc, = outs def _body(): ib = tvm.ir_builder.create() # int32_t matmul_kernel(const elem_t *A, const elem_t *B, const acc_t *D, # elem_t *C, int32_t I, int32_t J, int32_t K, int32_t pad_I, # int32_t pad_J, int32_t pad_K, int32_t A_row_len, # int32_t B_row_len, int32_t D_row_len, int32_t C_row_len, # bool no_bias, bool repeating_bias); # D is set to a dummy address 1 to determine whether to overwrite # accumulator contents: on the first run, 1 will be retained and # overwrite the value in the accumulator; on subsequent runs D will be # replaced by NULL and C will accumulate on top of the accumulator's contents # This is controlled via bit 1 << (ADDR_LEN - 2) - see kernel source ib.emit( tvm.call_extern("int32", "matmul_kernel", aa.access_ptr("r"), bb.access_ptr("r"), 1, cc.access_ptr("rw"), II, JJ, KK, pad_I, pad_J, pad_K, strideA, strideB, 0, strideC, True, False)) return ib.get() def _reset(): ib = tvm.ir_builder.create() # int32_t matmul_reset(elem_t *C, int32_t I, int32_t J, int32_t pad_I, # int32_t pad_J, int32_t C_row_len); ib.emit( tvm.call_extern("int32", "matmul_reset", cc.access_ptr("w"), II, JJ, pad_I, pad_J, strideC)) return ib.get() def _finalize(): ib = tvm.ir_builder.create() # Move out C from accumulator # int32_t matmul_finalize(elem_t *C, int32_t I, int32_t J, int32_t pad_I, # int32_t pad_J, int32_t C_row_len); ib.emit( tvm.call_extern("int32", "matmul_finalize", cc.access_ptr("rw"), II, JJ, pad_I, pad_J, strideC)) return ib.get() # standalone (without reduce axis split), reset, update return None, _reset(), _body(), _finalize() with tvm.build_config(offset_factor=1): return tvm.decl_tensor_intrin(c.op, intrin_func, binds={ a: Ab, b: Bb, c: Cb }, name="sp_gemm")
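# Self-contained check (illustrative; DIM below is a stand-in for the accelerator
# tile size used by intrinsic_gemm above) of the tile-count and padding
# arithmetic: II is ceil(i / DIM) and pad_I rounds i up to a multiple of DIM.
def _tile_pad_arithmetic_check(DIM=16):
    for i in (1, 15, 16, 17, 33):
        II = i // DIM + (0 if i % DIM == 0 else 1)
        pad_I = 0 if i % DIM == 0 else (DIM - i % DIM)
        assert II == -(-i // DIM)                      # ceiling division
        assert (i + pad_I) % DIM == 0 and pad_I < DIM  # pad reaches the next tile boundary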
po, co, pi, ci = s[tiled_buf].op.axis
if fc >= 8:
    outer, co = s[tiled_buf].split(co, fc // 8)
else:
    outer, co = s[tiled_buf].split(co, 1)
c1, c2 = s[tiled_buf].split(outer, kw)
ph, pwo = s[tiled_buf].split(po, pw // 8)
s[tiled_buf].reorder(co, pwo, ph, pi, c1, c2, ci)
s[tiled_buf].pragma(pwo, 'nnpu.im2col')
# s[feature_buf].pragma(s[feature_buf].leaf_iter_vars[0], env.dma_copy_to_buf)
# s[tiled].pragma(s[tiled].leaf_iter_vars[0], env.dma_copy_from_buf)
# pw = s[tiled].fuse(pwo, pi)
# s[tiled].reorder(co, ph, pw, c1, c2, ci)

from nnpu import ir_pass

pass_list = [(2, ir_pass.im2col_transform)]
with tvm.build_config(add_lower_pass=pass_list):
    print(tvm.lower(s, [feature, tiled], simple_mode=True))
    func = tvm.build(s, [feature, tiled], 'llvm', 'llvm', 'im2col_func')

a_np = np.random.randint(size=(fh, fw, fc), dtype='int8', low=-128, high=127)
a_nd = tvm.nd.array(a_np)
gt_nd = tvm.nd.array(
    np.zeros((packed_shape[0] // 8, packed_shape[1] // 8, 8, 8), dtype='int8'))
gt_func(a_nd, gt_nd)
real_nd = tvm.nd.array(
    np.zeros((packed_shape[0] // 8, packed_shape[1] // 8, 8, 8), dtype='int8'))
func(a_nd, real_nd)