def test_annotate_none(): ctx1 = tvm.context(1) ctx2 = tvm.context(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) sub = relay.subtract(add, z) func = relay.Function([x, y, z], sub) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, ctx1.device_type) return func def expected(): add = relay.add(x, y) sub = relay.subtract(add, z) func = relay.Function([x, y, z], sub) return func annotated_func = relay.ir_pass.infer_type(annotated()) expected_func = relay.ir_pass.infer_type(expected()) assert relay.ir_pass.alpha_equal(annotated_func, expected_func)
def test_annotate_all(): ctx1 = tvm.context(1) ctx2 = tvm.context(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) _add = relay.annotation.on_device(add, ctx2) sub = relay.subtract(add, z) _sub = relay.annotation.on_device(sub, ctx2) func = relay.Function([x, y, z], relay.Tuple(tvm.convert([_add, _sub, sub]))) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, ctx1.device_type) func = relay.ir_pass.infer_type(func) return relay.Function(relay.ir_pass.free_vars(func.body[2]), func.body[2]) def expected(): add = relay.add(x, y) sub = relay.subtract(add, z) func = relay.Function([x, y, z], sub) return func annotated_func = relay.ir_pass.infer_type(annotated()) expected_func = relay.ir_pass.infer_type(expected()) assert relay.ir_pass.alpha_equal(annotated_func, expected_func)
def test_compile_engine(): engine = relay.backend.compile_engine.get() def get_func(shape): x = relay.var("x", shape=shape) y = relay.add(x, x) z = relay.add(y, x) f = relay.ir_pass.infer_type(relay.Function([x], z)) return f z1 = engine.lower(get_func((10,)), "llvm") z2 = engine.lower(get_func((10,)), "llvm") z3 = engine.lower(get_func(()), "llvm") assert z1.same_as(z2) assert not z3.same_as(z1) if tvm.context("cuda").exist: z4 = engine.lower(get_func(()), "cuda") assert not z3.same_as(z4) # Test JIT target for target in ["llvm"]: ctx = tvm.context(target) if ctx.exist: f = engine.jit(get_func((10,)), target) x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx) y = tvm.nd.empty((10,), ctx=ctx) f(x, y) tvm.testing.assert_allclose( y.asnumpy(), x.asnumpy() * 3) engine.dump()
def test_fuse_all(device, tgt): """Fuse all operators.""" fallback_device = tvm.context("cpu") target = {"cpu": "llvm", device: tgt} cpu_ctx = fallback_device dev_ctx = tvm.context(device) def annotated(): add = relay.add(x, y) _add = relay.annotation.on_device(add, dev_ctx) sqrt = relay.sqrt(add) _sqrt = relay.annotation.on_device(sqrt, dev_ctx) log = relay.log(add) _log = relay.annotation.on_device(log, dev_ctx) subtract = relay.subtract(sqrt, log) _subtract = relay.annotation.on_device(subtract, dev_ctx) exp = relay.exp(subtract) _exp = relay.annotation.on_device(exp, dev_ctx) func = relay.Function([x, y], relay.Tuple(tvm.convert([_add, _sqrt, _log, _subtract, _exp, exp]))) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, cpu_ctx.device_type) func = relay.ir_pass.infer_type(func) return relay.Function(relay.ir_pass.free_vars(func.body[5]), func.body[5]) annotated_func = annotated() expected_func = get_func() check_annotated_graph(annotated_func, expected_func) test_runtime(target, device, annotated_func, fallback_device)
def test_dynamic_tensor(): dtype = 'float32' stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') s = tvm.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) a = tvmsp.array(a, ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr f(a.data.shape[0], a.data, c.data) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
def test_double_splitting_with_indivisible_factors(): m = 48 dtype="float32" A = tvm.placeholder((m,), name='A', dtype=dtype) C = tvm.compute((m,), lambda i: A[i], name='C') D = tvm.compute((m,), lambda i: C[i], name='D') s = tvm.create_schedule(D.op) co, ci = s[C].split(C.op.axis[0], factor=10) do, di = s[D].split(D.op.axis[0], 32) s[C].compute_at(s[D], do) target = 'llvm' with tvm.build_config(partition_const_loop=True): f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) func = tvm.build(f, target=target) # Find the beginning of the Halide IR corresponding to kernel code # and make sure it doesn't have an if statements left top_produce = find_top_produce(f.body) assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) # check functional correctness of generated code ctx = tvm.context(target, 0) a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx) c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) func(a, c, d) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_group_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) s = topi.cpp.cuda.schedule_injective(target, [C]) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) if typ == "add": out_npy = lhs_npy + rhs_npy elif typ == "sub": out_npy = lhs_npy - rhs_npy elif typ == "div": rhs_npy = np.abs(rhs_npy) + 0.001 out_npy = lhs_npy / rhs_npy elif typ == "mul": out_npy = lhs_npy * rhs_npy elif typ == "maximum": out_npy = np.maximum(lhs_npy, rhs_npy) elif typ == "minimum": out_npy = np.minimum(lhs_npy, rhs_npy) elif typ == "pow": out_npy = lhs_npy ** rhs_npy else: raise NotImplementedError lhs_nd = tvm.nd.array(lhs_npy, ctx) rhs_nd = tvm.nd.array(rhs_npy, ctx) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) for _ in range(1): foo(lhs_nd, rhs_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) else: out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f = tvm.build(s, [data, valid_count, out], device) f(tvm_data, tvm_valid_count, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) f = tvm.build(indices_s, [data, valid_count, indices_out], device) f(tvm_data, tvm_valid_count, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), (dilation, dilation), layout='NCHW%dc'%ic_block, out_layout="NCHW%dc"%oc_block, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
def test_multiple_kernels(): N = 1024 A = tvm.placeholder((N, N), name='A') B = tvm.compute((N, N), lambda i, j: A[i, j]) C = tvm.compute((N, N), lambda i, j: B[i, j]) s = tvm.create_schedule([C.op]) s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x")) s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x")) # shared memory usage: 0 # thread usage: N for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1))]}): tvm.build(s, [A, C], target) assert not valid[0] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N))]}): tvm.build(s, [A, C], target) assert valid[0]
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_elemwise(B) k_ = 2 foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ) a_npy = np.random.uniform(size=shape).astype(A.dtype) if typ == "add": b_npy = a_npy + k_ elif typ == "sub": b_npy = a_npy - k_ elif typ == "mul": b_npy = a_npy * k_ elif typ == "div": b_npy = a_npy / k_ else: raise NotImplementedError() a_nd = tvm.nd.array(a_npy, ctx) b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) foo(a_nd, b_nd, k_, *shape) tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
def test_local_memory(): N = 1024 M = 128 A = tvm.placeholder((N,), name='A', dtype='float32') B = tvm.compute((N, ), lambda i: A[i], name='B') s = tvm.create_schedule([B.op]) AA = s.cache_read(A, "local", [B]) o, i = s[B].split(s[B].op.axis[0], M) s[AA].compute_at(s[B], o) s[B].bind(o, tvm.thread_axis("blockIdx.x")) # local memory usage: M * 4B # thread usage: M for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1))]}): tvm.build(s, [A, B], target) assert not valid[0] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M, max_threads_per_block=1))]}): tvm.build(s, [A, B], target) assert valid[0]
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) f = tvm.build(s, [A, B, C], device) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) for i in range(2): f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_broadcast(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) if typ == "add": out_npy = lhs_npy + rhs_npy elif typ == "sub": out_npy = lhs_npy - rhs_npy elif typ == "mul": out_npy = lhs_npy * rhs_npy elif typ == "div": rhs_npy = np.abs(rhs_npy) + 0.001 out_npy = lhs_npy / rhs_npy else: raise NotImplementedError() lhs_nd = tvm.nd.array(lhs_npy, ctx) rhs_nd = tvm.nd.array(rhs_npy, ctx) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) for _ in range(1): foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return temp = util.tempdir() name = "myadd_%s" % device if sys.platform == "darwin" or sys.platform.startswith('linux'): f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name) elif sys.platform == "win32": f = tvm.build(s, [A, B], device, "llvm", name=name) else: raise ValueError("Unsupported platform") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) f1 = tvm.module.load(path_dso) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) f1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) if sys.platform != "win32": f2 = tvm.module.system_lib() f2[name](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def _impl_v1(cls, inputs, attr, params): if 'shape' in attr: return _op.reshape(inputs[0], attr['shape']) if get_name(inputs[1]) in params: shape = tuple(params[inputs[1].name_hint].asnumpy()) out = _op.reshape(inputs[0], shape) else: # Try to infer shape by precompute prune if possible. # TODO: good to check inputs to be in params. # to be enhanced when relay support list_input_names API of NNVM logging.warning("Infering Reshape argument by precompute") func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) with tvm.relay.build_config(opt_level=0): graph, lib, params = tvm.relay.build(func, target="llvm", params=params) ctx = tvm.context("llvm", 0) from tvm.contrib import graph_runtime m = graph_runtime.create(graph, lib, ctx) m.set_input(**params) m.run() params_new = m.get_output(0) inputs.pop(1) out = _op.reshape(inputs[0], tuple(params_new.asnumpy().astype('int32').flatten())) return out
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_broadcast(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) if lhs_shape is None: lhs_npy = float(np.random.uniform(low=lhs_min, high=lhs_max)) if dtype.startswith('int'): lhs_npy = int(lhs_npy) lhs_nd = lhs_npy else: lhs_npy = np.random.uniform(low=lhs_min, high=lhs_max, size=lhs_shape).astype(A.dtype) lhs_nd = tvm.nd.array(lhs_npy, ctx) if rhs_shape is None: rhs_npy = float(np.random.uniform(low=rhs_min, high=rhs_max)) if dtype.startswith('int'): rhs_npy = int(rhs_npy) rhs_nd = rhs_npy else: rhs_npy = np.random.uniform(low=rhs_min, high=rhs_max, size=rhs_shape).astype(A.dtype) rhs_nd = tvm.nd.array(rhs_npy, ctx) out_npy = fnumpy(lhs_npy, rhs_npy) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
def annotated(): conv2d_1 = relay.nn.conv2d( data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_1 = relay.annotation.on_device(conv2d_1, dev2) conv2d_2 = relay.nn.conv2d( data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_2 = relay.annotation.on_device(conv2d_2, dev2) add = relay.add(conv2d_1, conv2d_2) _add = relay.annotation.on_device(add, dev1) conv2d_3 = relay.nn.conv2d( add, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2) func = relay.Function([data1, data2, weight], relay.Tuple(tvm.convert([_conv2d_1, _conv2d_2, _conv2d_3, _add, conv2d_3]))) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, tvm.context(3).device_type) func = relay.ir_pass.infer_type(func) return relay.Function(relay.ir_pass.free_vars(func.body[4]), func.body[4])
def ctx_list(): """Get context list for testcases""" device_list = os.environ.get("NNVM_TEST_TARGETS", "") device_list = (device_list.split(",") if device_list else ["llvm", "cuda"]) device_list = set(device_list) res = [(device, tvm.context(device, 0)) for device in device_list] return [x for x in res if x[1].exist]
def check_device(device, host="stackvm"): if not tvm.module.enabled(host): return ctx = tvm.context(device, 0) if not ctx.exist: return fexp = tvm.build(s, [A, B], device, host, name="myexp") ctx = tvm.context(device, 0) # launch the kernel. n = 1024 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) fexp(a, b) np.testing.assert_allclose( b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
def test_fuse_log_add(device, tgt): """ Only log and add are fused.""" fallback_device = tvm.context("cpu") target = {"cpu": "llvm", device: tgt} cpu_ctx = fallback_device dev_ctx = tvm.context(device) def annotated(): add = relay.add(x, y) sqrt = relay.sqrt(add) _sqrt = relay.annotation.on_device(sqrt, dev_ctx) log = relay.log(add) subtract = relay.subtract(sqrt, log) exp = relay.exp(subtract) _exp = relay.annotation.on_device(exp, dev_ctx) func = relay.Function([x, y], relay.Tuple(tvm.convert([_sqrt, _exp, exp]))) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, cpu_ctx.device_type) func = relay.ir_pass.infer_type(func) return relay.Function(relay.ir_pass.free_vars(func.body[2]), func.body[2]) def expected(): add = relay.add(x, y) copy_add_sqrt = relay.device_copy(add, cpu_ctx, dev_ctx) sqrt = relay.sqrt(copy_add_sqrt) log = relay.log(add) copy_sqrt_subtract = relay.device_copy(sqrt, dev_ctx, cpu_ctx) subtract = relay.subtract(copy_sqrt_subtract, log) copy_sub_exp = relay.device_copy(subtract, cpu_ctx, dev_ctx) exp = relay.exp(copy_sub_exp) func = relay.Function([x, y], exp) return func annotated_func = annotated() expected_func = expected() ctx = tvm.context(device, 0) dev_idx = ctx.device_type expected_index = [1, 1, 1, dev_idx, dev_idx, 1, 1, dev_idx, dev_idx] check_annotated_graph(annotated_func, expected_func) test_runtime(target, device, annotated_func, fallback_device, expected_index)
def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): if mod is None: mod = relay.Module() ctx = tvm.context("llvm", 0) intrp = create_executor(mod=mod, ctx=ctx, target="llvm") result = intrp.evaluate(expr)(*args) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--target', type=str, required=True, choices=['cuda', 'rocm', 'opencl', 'metal'], help="Compilation target.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.") parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.") args = parser.parse_args() opt_level = args.opt_level num_iter = args.num_iter ctx = tvm.context(args.target, 0) batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size,) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload( batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) if args.target == "cuda": unroll = 1400 else: unroll = 128 with nnvm.compiler.build_config(opt_level=opt_level): with tvm.build_config(auto_unroll_max_step=unroll, unroll_explicit=(args.target != "cuda")): graph, lib, params = nnvm.compiler.build( net, args.target, shape={"data": data_shape}, params=params) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") module = runtime.create(graph, lib, ctx) module.set_input(**params) module.set_input("data", data) module.run() out = module.get_output(0, tvm.nd.empty(out_shape)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(args.repeat): prof_res = ftimer() print(prof_res) # sleep for avoiding device overheat if i + 1 != args.repeat: time.sleep(45)
def check_device(device, host="stackvm"): if not tvm.module.enabled(host): return ctx = tvm.context(device, 0) if not ctx.exist: return func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") ctx = tvm.context(device, 0) # launch the kernel. n = 1024 a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), ctx) a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) func(a0, a1, c) tvm.testing.assert_allclose( c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()), rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.schedule_injective(target, [B]) else: s = topi.cpp.cuda.schedule_injective(target, [B]) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="tranpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) out_npy = data_npy.transpose(axes) data_nd = tvm.nd.array(data_npy, ctx) out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
def check_device(device, host="llvm"): if not tvm.module.enabled(host): return ctx = tvm.context(device, 0) if not ctx.exist: return fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd") ctx = tvm.context(device, 0) # launch the kernel. n = 1024 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx) d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx) fadd(a, b, c, d) tvm.testing.assert_allclose( d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): # declare DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, (stride_h, stride_w), padding_args, (dilation, dilation), in_layout, out_layout, dtype) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) # Prepare pod type for test data closure input_shape = (batch, in_channel, in_height, in_width) filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride, padding) relu_scipy = np.maximum(depthwise_conv2d_scipy, 0) return (_transform_data(input_np, ic_block), _transform_kernel(filter_np, oc_block), _transform_data(depthwise_conv2d_scipy, oc_block), _transform_data(relu_scipy, oc_block)) # Get the test data (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): # TODO(tqchen) add more types once the schedule register is fixed. for target in ["llvm"]: ctx = tvm.context(target, 0) if not ctx.exist: return intrp = create_executor(mod=mod, ctx=ctx, target=target) result = intrp.evaluate(expr)(*args) # use tvm.testing which also set atol tvm.testing.assert_allclose( result.asnumpy(), expected_result, rtol=rtol)
def main(): ctx = tvm.context('cpu', 0) model = tvm.module.load(osp.join(CWD, 'build', 'enclave.signed.so')) inp = tvm.nd.array(np.ones((1, 3, 224, 224), dtype='float32'), ctx) out = tvm.nd.array(np.empty((1, 1000), dtype='float32'), ctx) model(inp, out) if abs(out.asnumpy().sum() - 1) < 0.001: print('It works!') else: print('It doesn\'t work!') exit(1)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if default_schedule: device += " -libs=cudnn" print("Running on target: %s" % device) with tvm.target.create(device): # declare # (algo = 0) DepthwiseConv2d = topi.cuda.depthwise_conv2d.depthwise_conv2d_cuda( autotvm.get_config(), Input, Filter, (stride_w, stride_h), (1, 1), dilation=1, algo=cudnn_algo ) if default_schedule else topi.nn.depthwise_conv2d_nhwc( Input, Filter, stride=[stride_h, stride_w], padding=padding, dilation=1) # ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) # Relu = topi.nn.relu(ScaleShift) s1 = topi.cuda.depthwise_conv2d.schedule_depthwise_conv2d_nchw_cuda( autotvm.get_config(), [DepthwiseConv2d] ) if default_schedule else schedule_depthwise_conv2d_nhwc_reuse( [DepthwiseConv2d], Input) # s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) # s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift) # s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu) # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device, name="DepthwiseConv2d_%d_%d" % (in_height, in_width)) # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare pod type for test data closure dtype = Input.dtype input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) # scale_shape = get_const_tuple(Scale.shape) # shift_shape = get_const_tuple(Shift.shape) # scale_shift_shape = get_const_tuple(ScaleShift.shape) # prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) # scale_tvm = tvm.nd.array(scale_np, ctx) # shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=dtype), ctx) # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=dtype), ctx) # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=dtype), ctx) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10) # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10) # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean # relu_scipy = np.maximum(scale_shift_scipy, 0) np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), output_np, rtol=1e-5) # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print( "Depthwise convolution: average running time is {:.2f} us.".format( tcost_1 * 1e6))
def test_Alexnet(): def Conv(data, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)): if pad[0] != 0 or pad[1] != 0: data = nnvm.symbol.pad(data=data, pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0))) datas = nnvm.symbol.conv2d(data=data, kernel_size=kernel_size, channels=filter_nums, strides=stride, layout='NHWC', kernel_layout='HWOI') datas = nnvm.symbol.relu(data=datas) return datas def get_symbol(datas, num_classes): conv1 = Conv(data=datas, kernel_size=(11, 11), filter_nums=96, stride=(4, 4)) pool1 = nnvm.symbol.max_pool2d(data=conv1, pool_size=(3, 3), strides=(2, 2), layout='NHWC') conv2 = Conv(data=pool1, kernel_size=(5, 5), filter_nums=256, pad=(2, 2)) pool2 = nnvm.symbol.max_pool2d(data=conv2, pool_size=(3, 3), strides=(2, 2), layout='NHWC') conv3 = Conv(data=pool2, kernel_size=(3, 3), filter_nums=384, pad=(1, 1)) conv4 = Conv(data=conv3, kernel_size=(3, 3), filter_nums=384, pad=(1, 1)) conv5 = Conv(data=conv4, kernel_size=(3, 3), filter_nums=256, pad=(1, 1)) pool3 = nnvm.symbol.max_pool2d(data=conv5, pool_size=(3, 3), strides=(2, 2), layout='NHWC') datas = nnvm.symbol.flatten(data=pool3) fc1 = nnvm.symbol.dense(data=datas, units=1024) relu1 = nnvm.symbol.relu(data=fc1) drop1 = nnvm.symbol.dropout(data=relu1, rate=0.5) fc2 = nnvm.symbol.dense(data=drop1, units=1024) relu2 = nnvm.symbol.relu(data=fc2) drop2 = nnvm.symbol.dropout(data=relu2, rate=0.5) fc3 = nnvm.symbol.dense(data=drop2, units=16) symbol = nnvm.symbol.softmax(fc3) return symbol input_shape = (1, 128, 128, 16) target_host = "llvm" device = "nnpu" data = nnvm.symbol.Variable(name="data") target = tvm.target.create("llvm -device={}".format(device)) print("ok") num_runs = 1 z = get_symbol(datas=data, num_classes=16) compute_graph = nnvm.graph.create(z) print(compute_graph.ir()) with nnvm.compiler.build_config(opt_level=0): if target.device_name != "nnpu": deploy_graph, lib, params = nnvm.compiler.build( compute_graph, target, shape={"data": input_shape}, dtype="float32", target_host=target_host) else: with ScheduleProcHelper(): with nnpu.build_config(): nnpu.set_device(nnpu.get_env(), type='SC') deploy_graph, lib, params = nnvm.compiler.build( compute_graph, target, shape={"data": input_shape}, dtype="float32", target_host=target_host) ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context( str("llvm"), 0) module = runtime.create(deploy_graph, lib, ctx) a_np = np.random.randint(size=input_shape, low=-32, high=32) print(a_np) module.set_input(data=a_np) ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1) # module.run() out = module.get_output(0, out=tvm.nd.empty((1, 16))) print(out.asnumpy) print(deploy_graph.ir()) print(ftimer().mean * 10)
def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, activation_bits, weight_bits, unipolar): in_height = in_width = in_size input_type = 'uint32' out_dtype = 'int16' device = 'llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon' with tvm.target.create(device): A = te.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') W = te.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') B = topi.arm_cpu.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, 'uint8', out_dtype, unipolar) s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B]) func = tvm.build(s, [A, W, B], device) assembly = func.get_source('asm') matches = re.findall("vpadal", assembly) assert (len(matches) > 0) matches = re.findall("vcnt", assembly) assert (len(matches) > 0) matches = re.findall("vpadd", assembly) assert (len(matches) > 0) ctx = tvm.context(device, 0) if 'arm' not in os.uname()[4]: print("Skipped running code, not an arm device") return print("Running on target: %s" % device) def get_ref_data(): a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) if unipolar: w_ = np.copy(w_np).astype(out_dtype) for x in np.nditer(w_, op_flags=['readwrite']): x[...] = 1 if x == 1 else -1 b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) else: b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--target', type=str, required=True, choices=['cuda', 'rocm', 'opencl', 'metal'], help="Compilation target.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.") parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.") args = parser.parse_args() opt_level = args.opt_level num_iter = args.num_iter ctx = tvm.context(args.target, 0) batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size, ) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload(batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) with nnvm.compiler.build_config(opt_level=opt_level): with tvm.build_config(auto_unroll_max_step=128, unroll_explicit=(args.target != "cuda")): graph, lib, params = nnvm.compiler.build( net, args.target, shape={"data": data_shape}, params=params) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") module = runtime.create(graph, lib, ctx) module.set_input(**params) module.set_input("data", data) module.run() out = module.get_output(0, tvm.nd.empty(out_shape)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(args.repeat): prof_res = ftimer() print(prof_res) # sleep for avoiding device overheat if i + 1 != args.repeat: time.sleep(45)
from nnvm import compiler from nnvm.frontend import from_mxnet from tvm.contrib.download import download from tvm.contrib import graph_runtime from mxnet.model import load_checkpoint from utils import save_tvm_params, save_tvm_graph from load_deploy_model import load_mxnet_model from time import time tgt_host = "llvm" # Change it to respective GPU if gpu is enabled Ex: cuda, opencl tgt = "llvm" ctx = tvm.context(tgt, 0) target = 'llvm' shapes = dict() shapes['cls_prob'] = (1, 21, 5186) shapes['loc_preds'] = (1, 20744) shapes['anchor_boxes'] = (1, 5186, 4) net = load_nms('model/deploy_ssd_inceptionv3_512-nms-symbol') net, params = nnvm.frontend.from_mxnet(net) print("[*] Compile...")
################################################################# # Compile and Evaluate # -------------------- # After auto-tuning, we can compile the network with the best schedules we found. # All measurement records are dumped into the log file during auto-tuning, # so we can read the log file and load the best schedules. # Compile with the history best print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) # Create graph runtime ctx = tvm.context(str(target), 0) module = graph_runtime.GraphModule(lib["default"](ctx)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) ################################################################# # Other Tips # ---------- # 1. During the tuning, the auto-scheduler needs to compile many programs and
def test_db_filter(): logging.info("test db filter ...") # Pick a GPU target because there are more likely to be failures/invalid configs task, target = get_sample_task() ctx = tvm.context(str(target)) if not ctx.exist: logging.warning( "Skip this test because there is no supported device for test") batch_size = 2 measure_option = autotvm.measure_option(mode='local-nofork', timeout=2) measure_batch = autotvm.measure.create_measure_batch(task, measure_option) ct = 0 all_inputs = list() all_results = list() batches = list() tuner = autotvm.tuner.RandomTuner(task) while ct < TRIAL_LIMIT: inputs = list() for i in range(batch_size): cfg = tuner.next_batch(1)[0] inputs.append((MeasureInput(target, task, cfg))) all_inputs.append(inputs[-1]) batches.append(inputs) results = measure_batch(inputs) all_results += results ct += 1 del measure_batch db = database.DummyDatabase() db.flush() # First setting, memoize one input at a time, check that each is saved and replayed measure_option = autotvm.measure_option(mode='local-nofork', timeout=2, replay_db=db) measure_batch = autotvm.measure.create_measure_batch(task, measure_option) for i in range(len(all_inputs) + 1): db.flush() for j in range(i): db.save(all_inputs[j], all_results[j]) for k in range(len(batches)): batch = batches[k] batch_result = measure_batch(batch) for l in range(batch_size): all_idx = k * batch_size + l assert batch_result[l] is not None if all_idx < i: assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \ "(no retry) EXPECTED MATCH, GOT MISMATCH" else: assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \ "(no retry) EXPECTED MISMATCH, GOT MATCH" del measure_batch
ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm(cc_code, output=ll_path) return ll_code ###################################################################### # Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline. # The importing needs to happen before the tensorized GEMV being executed. # s[C].pragma(x, "import_llvm", gemv_impl()) func = tvm.build(s, [A, B, C], target="llvm", name="gemv") from topi.util import get_const_tuple dtype = A.dtype ctx = tvm.context("cpu", 0) a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### # We compare the tensorize version with that :code:`numpy.dot` produces, # ensure our implementation is correct. # # Reduce-update for Tensorize # ------------------------------------ # Let's then move one step forward. # Assume our accelerator could only multiply a vector by a square matrix, # in which the vector size needs to be no larger than 16.
def test_densenet(): def Conv(datas, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)): if pad[0] != 0 or pad[1] != 0: datas = nnvm.symbol.pad(data=datas, pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0))) conv = nnvm.symbol.conv2d(data=datas, kernel_size=kernel_size, channels=filter_nums, strides=stride, layout='NHWC', kernel_layout='HWOI') return conv def bottleneck_layer(datas, filters): bn1 = nnvm.symbol.batch_norm(data=datas, epsilon=2e-5, axis=3) relu1 = nnvm.symbol.relu(data=bn1) conv1 = Conv(datas=relu1, kernel_size=(1, 1), filter_nums=4 * filters) bn2 = nnvm.symbol.batch_norm(data=conv1, epsilon=2e-5, axis=3) relu2 = nnvm.symbol.relu(data=bn2) conv2 = Conv(datas=relu2, kernel_size=(3, 3), filter_nums=filters, pad=(1, 1)) return conv2 def transition_layer(datas, filters): conv = Conv(datas=datas, kernel_size=(1, 1), filter_nums=filters) pool = nnvm.symbol.avg_pool2d(data=conv, pool_size=(2, 2), strides=(2, 2), layout='NHWC') return pool def dense_block(datas, filters, layers): layers_concat = [] layers_concat.append(datas) b_l = bottleneck_layer(datas, filters) layers_concat.append(b_l) for i in range(layers - 1): x = nnvm.symbol.concatenate(*layers_concat, axis=3) x = bottleneck_layer(x, filters) layers_concat.append(x) return x def get_symbol(datas, num_classes=16): x = Conv(datas, kernel_size=(7, 7), filter_nums=32, stride=(2, 2)) x = nnvm.symbol.max_pool2d(x, pool_size=(3, 3), strides=(2, 2), layout='NHWC') b1 = dense_block(x, 32, 6) l1 = transition_layer(b1, 32) b2 = dense_block(l1, 32, 12) l2 = transition_layer(b2, 32) b3 = dense_block(l2, 32, 48) l3 = transition_layer(b3, 32) b4 = dense_block(l3, 32, 32) x = nnvm.symbol.global_avg_pool2d(data=b4, layout='NHWC') x = nnvm.symbol.flatten(data=x) fc = nnvm.symbol.dense(data=x, units=16) symbol = nnvm.symbol.softmax(data=fc) return symbol input_shape = (1, 229, 229, 16) target_host = "llvm" device = "nnpu" data = nnvm.symbol.Variable(name="data") target = tvm.target.create("llvm -device={}".format(device)) print("ok") num_runs = 3 z = get_symbol(datas=data, num_classes=16) compute_graph = nnvm.graph.create(z) with nnvm.compiler.build_config(opt_level=0): if target.device_name != "nnpu": deploy_graph, lib, params = nnvm.compiler.build( compute_graph, target, shape={"data": input_shape}, dtype="float32", target_host=target_host) else: with ScheduleProcHelper(): with nnpu.build_config(): nnpu.set_device(nnpu.get_env(), type='S0') deploy_graph, lib, params = nnvm.compiler.build( compute_graph, target, shape={"data": input_shape}, dtype="float32", target_host=target_host) ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context( str("llvm"), 0) module = runtime.create(deploy_graph, lib, ctx) a_np = np.random.random(size=input_shape) print(a_np) module.set_input(data=a_np) ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1) module.run() out = module.get_output(0, out=tvm.nd.empty((1, 16))) print(out.asnumpy) print(deploy_graph.ir()) print(ftimer().mean)
def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None): def tvm_val_2_py_val(val): val = tvm.ir_pass.Substitute(val, var_dict) val = tvm.ir_pass.Simplify(val) assert isinstance(val, (tvm.expr.IntImm, tvm.expr.UIntImm)) return val.value ctx = tvm.context(target, 0) op = None if sch is None: outs = func(*tuple( tvm.convert(i) if isinstance(i, list) else i for i in args)) op = outs[0].op if isinstance(outs, list) else outs.op sch = tvm.create_schedule(op) else: assert outs is not None assert isinstance(outs, list) op = outs[0].op emu_args = [] nd_args = [] for i in args: if isinstance(i, tvm.tensor.Tensor): shape = [tvm_val_2_py_val(j) for j in i.shape] emu_args.append(numpy.random.randn(*shape).astype(i.dtype)) nd_args.append(tvm.nd.array(emu_args[-1], ctx)) elif isinstance(i, tvm.expr.Var): emu_args.append(tvm_val_2_py_val(i)) nd_args.append(emu_args[-1]) else: assert isinstance(i, list) emu_args.append(numpy.array(i)) compile_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))] + \ (outs if isinstance(outs, list) else [outs]) module = tvm.build(sch, compile_args, target=target) assert module out_tensors = [] for i in range(op.num_outputs): output = op.output(i) shape = [tvm_val_2_py_val(j) for j in output.shape] nd_args.append( tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx)) out_tensors.append(nd_args[-1]) ref_data = func(*emu_args) if isinstance(ref_data, numpy.ndarray): ref_data = [ref_data] module(*nd_args) for nd, np in zip(out_tensors, ref_data): tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5) module_args = [ i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var)) ] module_outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs h_module = tvm.hybrid.build(sch, module_args, module_outs) return h_module, module_args, module_outs
import tvm.topi.testing from tvm.topi.util import get_const_int, get_const_tuple from tvm.topi.nn.bitserial_util import bitpack, binary_op_multiplier from tvm.topi.nn import bitserial_dense from tvm.topi.x86 import schedule_reduce M = 1024 K = 1024 N = 1024 dtype = "uint32" target = 'llvm -mcpu=core-avx2' # target = 'llvm' ctx = tvm.context(target, 0) _bitserial_dense_implement = { "generic": (topi.nn.bitserial_dense, topi.generic.schedule_bitserial_dense), "cpu": (topi.x86.bitserial_dense, topi.x86.schedule_bitserial_dense), "arm_cpu": (topi.arm_cpu.bitserial_dense, topi.arm_cpu.schedule_bitserial_dense), } a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx) b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx) c = tvm.nd.array(np.zeros((M, N), dtype='int16'), ctx) # numpy matrix multi np_repeat = 100
def check_device(): A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size), name='W') out_dtype = 'float32' wkl = _get_workload(A, W, stride, padding, out_dtype) sch = Im2ColPack(7, 8, 1, 8, True) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() # device = 'llvm' device = 'llvm -mcpu=skylake-avx512' ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype) s = tvm.create_schedule(B.op) traverse(s, B.op) op = B.op output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data_col = data_vec.op.input_tensors[0] data = data_col.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col, data_vec, kernel, kernel_vec, conv_out, output, B) print(tvm.lower(s, [A, W, B], simple_mode=True)) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a, w, b).mean print('conv: %g secs/op' % cost) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print(b_np.shape)
def run_unpropagatable_graph(dev, tgt): R""" The network is as following: a b c d \ / \ / add mul \ / subtract """ a = relay.var("a", shape=(10, 10)) b = relay.var("b", shape=(10, 10)) c = relay.var("c", shape=(10, 10)) d = relay.var("d", shape=(10, 10)) a_data = np.random.rand(10, 10).astype('float32') b_data = np.random.rand(10, 10).astype('float32') c_data = np.random.rand(10, 10).astype('float32') d_data = np.random.rand(10, 10).astype('float32') tmp_add = a_data + b_data tmp_mul = np.multiply(c_data, d_data) ref_res = np.subtract(tmp_add, tmp_mul) fallback_device = tvm.context("cpu") target = {"cpu": "llvm", dev: tgt} cpu_ctx = fallback_device dev_ctx = tvm.context(dev) def annotated(): add = relay.add(a, b) _add = relay.annotation.on_device(add, dev_ctx) mul = relay.multiply(c, d) _mul = relay.annotation.on_device(mul, cpu_ctx) sub = relay.subtract(_add, _mul) _sub = relay.annotation.on_device(sub, dev_ctx) func = relay.Function([a, b, c, d], _sub) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type) return func def expected(): add = relay.add(a, b) mul = relay.multiply(c, d) copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx) sub = relay.subtract(add, copy_mul_sub) func = relay.Function([a, b, c, d], sub) return func annotated_func = annotated() expected_func = expected() expected_index = [2, 2, 2, 1, 1, 1, 2, 2] check_annotated_graph(annotated_func, expected_func) params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data} config = {"opt_level": 0} config["fallback_device"] = fallback_device with relay.build_config(**config): graph, lib, params = relay.build(annotated_func, target, params=params) contexts = [tvm.cpu(0), tvm.context(dev)] graph_json = json.loads(graph) if "device_index" in graph_json["attrs"]: device_index = graph_json["attrs"]["device_index"][1] assert device_index == expected_index mod = graph_runtime.create(graph, lib, contexts) mod.set_input(**params) mod.run() res = mod.get_output(0).asnumpy() tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
def check_device(device, target_device): if not tvm.module.enabled(target_device): print("Skip test because {} is not enabled.".format(target_device)) return device_ctx = tvm.context(device) graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type) shape = (4, ) # Insert copy nodes for data transferring between add and sub nodes. # Transfers data from gpu to cpu. copy_add_sub = tvm.placeholder(shape, name="__copy0") # Transfers data from cpu to gpu. copy_sub_add = tvm.placeholder(shape, name="__copy1") # Create a module containing adds on the device. tensor_a = tvm.placeholder(shape, name="A") tensor_b = tvm.placeholder(shape, name="B") tensor_d = tvm.placeholder(shape, name="D") elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0") elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1") target = topi.cpp.TEST_create_target(device) add_schedule0 = topi.cpp.cuda.schedule_injective( target, [elemwise_add0]) lower_add0 = tvm.lower(add_schedule0, [tensor_a, tensor_b, elemwise_add0], name="elemwise_add0") add_schedule1 = topi.cpp.cuda.schedule_injective( target, [elemwise_add1]) lower_add1 = tvm.lower(add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1") # Create module for sub whose target is the host. tensor_c = tvm.placeholder(shape, name="C") elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub") sub_schedule = tvm.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub") target_flist = { target_device: [lower_add0, lower_add1], target_host: [lower_sub] } mhost = tvm.build(target_flist, target_host=target_host) ctx = [host_ctx, device_ctx] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype( tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype( tensor_b.dtype) params["C"] = tensor_c = np.random.uniform(size=shape).astype( tensor_c.dtype) params["D"] = tensor_d = np.random.uniform(size=shape).astype( tensor_d.dtype) def check_verify(): mod = graph_runtime.create(graph, mhost, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) def check_load_module(): temp = util.tempdir() path_lib = temp.relpath("deploy.so") mhost.export_library(path_lib) with open(temp.relpath("deploy.json"), "w") as out_file: out_file.write(graph) loaded_lib = tvm.module.load(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) check_verify() check_load_module()
def test_conv_network(): R""" The network is as following: data1 data2 | | conv2d conv2d \ / add | conv2d """ batch_size = 1 dshape = (batch_size, 64, 56, 56) weight = relay.var("weight", shape=(64, 64, 3, 3)) data1 = relay.var("data1", shape=dshape) data2 = relay.var("data2", shape=dshape) dev1 = tvm.context(1) dev2 = tvm.context(2) def original(): conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) conv2d_2 = relay.nn.conv2d(data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) add = relay.add(conv2d_1, conv2d_2) conv2d_3 = relay.nn.conv2d(add, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) func = relay.Function([data1, data2, weight], conv2d_3) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, tvm.context(3).device_type) return func def annotated(): conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_1 = relay.annotation.on_device(conv2d_1, dev2) conv2d_2 = relay.nn.conv2d(data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_2 = relay.annotation.on_device(conv2d_2, dev2) add = relay.add(_conv2d_1, _conv2d_2) _add = relay.annotation.on_device(add, dev1) conv2d_3 = relay.nn.conv2d(_add, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2) func = relay.Function([data1, data2, weight], _conv2d_3) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.rewrite_annotated_ops(func, tvm.context(3).device_type) return func class ScheduleConv2d(ExprMutator): def __init__(self, device): self.device = device super().__init__() def visit_call(self, expr): visit = super().visit_call(expr) if expr.op == tvm.relay.op.get("nn.conv2d"): return relay.annotation.on_device(visit, self.device) else: return visit def annotate_with_visitor(func): sched = ScheduleConv2d(dev2) func = sched.visit(func) func = relay.ir_pass.rewrite_annotated_ops(func, dev1.device_type) return func def expected(): conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) device_copy1 = relay.device_copy(conv2d_1, dev2, dev1) conv2d_2 = relay.nn.conv2d(data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) device_copy2 = relay.device_copy(conv2d_2, dev2, dev1) add = relay.add(device_copy1, device_copy2) device_copy3 = relay.device_copy(add, dev1, dev2) conv2d_3 = relay.nn.conv2d(device_copy3, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) func = relay.Function([data1, data2, weight], conv2d_3) return func def check_storage_and_device_types(): func = annotated() func = relay.ir_pass.rewrite_annotated_ops(func, 3) func = relay.ir_pass.infer_type(func) func = relay.ir_pass.fuse_ops(func, opt_level=2) func = relay.ir_pass.infer_type(func) smap = relay.backend._backend.GraphPlanMemory(func) storage_ids = [] device_types = [] for _, storage_dev_type in smap.items(): assert len(storage_dev_type) == 2 for sid in storage_dev_type[0]: storage_ids.append(sid.value) for did in storage_dev_type[1]: device_types.append(did.value) assert len(storage_ids) == 10 assert len(set(storage_ids)) == 8 assert len(set(device_types)) == 2 assert set(device_types) == {1, 2} def test_manual_annotation(): annotated_func = annotated() expected_func = expected() check_annotated_graph(annotated_func, expected_func) check_storage_and_device_types() def test_visitor_annotation(): annotated_func = annotate_with_visitor(original()) expected_func = expected() check_annotated_graph(annotated_func, expected_func) test_manual_annotation() test_visitor_annotation()
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) task = autotvm.task.create(schedule_depthwise_conv2d_nhwc_reuse_auto, args=(batch, in_channel, in_size, channel_multiplier, kernel, stride), target="cuda") print(task) print(task.config_space) # logging config (for printing tuning log to the screen) logging.getLogger('autotvm').setLevel(logging.DEBUG) logging.getLogger('autotvm').addHandler( logging.StreamHandler(sys.stdout)) # There are two steps for measuring a config: build and run. # By default, we use all cpu cores to compile program. Then measure them sequentially. # We measure 5 times and take average to reduce variance. measure_option = autotvm.measure_option( builder='local', runner=autotvm.LocalRunner(number=10)) tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=25, measure_option=measure_option, callbacks=[ autotvm.callback.log_to_file( 'depthwise_conv2d_nhwc_{}.log'.format(in_size)) ]) with autotvm.apply_history_best( 'depthwise_conv2d_nhwc_{}.log'.format(in_size)): with tvm.target.create(device): s1, [Input, Filter, DepthwiseConv2d ] = schedule_depthwise_conv2d_nhwc_reuse_auto( batch, in_channel, in_size, channel_multiplier, kernel, stride) # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device, name="ddd%dddd" % in_size) # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare pod type for test data closure dtype = Input.dtype input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) # scale_shape = get_const_tuple(Scale.shape) # shift_shape = get_const_tuple(Shift.shape) # scale_shift_shape = get_const_tuple(ScaleShift.shape) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) # scale_np = np.random.uniform(size=scale_shape).astype(dtype) # shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc( input_np, filter_np, stride=[stride_h, stride_w], padding=padding) # scale_shift_scipy = np.zeros(shape=scale_shift_shape) # for c in range(in_channel * channel_multiplier): # scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c] # relu_scipy = np.maximum(scale_shift_scipy, 0) # return (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) return (input_np, filter_np, depthwise_conv2d_scipy) # Get the test data # (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() (input_np, filter_np, depthwise_conv2d_scipy) = get_ref_data() # prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) # scale_tvm = tvm.nd.array(scale_np, ctx) # shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=10) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10) # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10) # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean # relu_scipy = np.maximum(scale_shift_scipy, 0) np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print( "Depthwise convolution: average running time is {:.2f} us.".format( tcost_1 * 1e6))
def check_eval(expr, expected_result, mod=None, rtol=1e-07): ctx = tvm.context("llvm", 0) intrp = create_executor(mod=mod, ctx=ctx, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32") for in_dtype in ["float32", "float64"]: data = np.random.randn(10, 10).astype(in_dtype) check_cumsum(np.cumsum(data), data) check_cumsum(np.cumsum(data, axis=0), data, axis=0) check_cumsum(np.cumsum(data, axis=1), data, axis=1) data = np.random.randn(10, 5, 10).astype(in_dtype) check_cumsum(np.cumsum(data), data) check_cumsum(np.cumsum(data, axis=0), data, axis=0) check_cumsum(np.cumsum(data, axis=1), data, axis=1) check_cumsum(np.cumsum(data, axis=-1), data, axis=-1) for in_dtype in ["int32", "int64"]: data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype) check_cumsum(np.cumsum(data, dtype=in_dtype), data) check_cumsum(np.cumsum(data), data, dtype="int64") check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0) check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1) data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) check_cumsum(np.cumsum(data), data, dtype="int64") if __name__ == "__main__": test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
def test_propogation(): R""" The network and device type is as following: x 1 | log 1 / \ log2 log10 2 \ / add 2 | tan 1 """ ctx1 = tvm.context(1) ctx2 = tvm.context(2) expected_dev_type = { "log": ctx1, "log2": ctx2, "log10": ctx2, "add": ctx2, "tan": ctx1 } x = relay.var("x", shape=(3, )) def annotated(): log = relay.log(x) _log = relay.annotation.on_device(log, expected_dev_type["log"]) log2 = relay.log2(_log) _log2 = relay.annotation.on_device(log2, expected_dev_type["log2"]) log10 = relay.log10(_log) _log10 = relay.annotation.on_device(log10, expected_dev_type["log10"]) add = relay.add(_log2, _log10) _add = relay.annotation.on_device(add, expected_dev_type["add"]) tan = relay.tan(_add) _tan = relay.annotation.on_device(tan, expected_dev_type["tan"]) func = run_opt_pass(_tan, transform.RewriteAnnotatedOps(ctx1.device_type)) return func def expected(): log = relay.log(x) _log_left = relay.device_copy(log, ctx1, ctx2) _log_right = relay.device_copy(log, ctx1, ctx2) log2 = relay.log2(_log_left) log10 = relay.log10(_log_right) add = relay.add(log2, log10) _add = relay.device_copy(add, ctx2, ctx1) tan = relay.tan(_add) func = run_opt_pass(tan, transform.InferType()) return func annotated_expr = annotated() expected_expr = expected() assert tvm.ir.structural_equal(annotated_expr, expected_expr) smap = relay.backend._backend.GraphPlanMemory(annotated_expr) for expr, storage_dev_type in smap.items(): # x is ctx1 as output is ctx1 if isinstance(expr, tvm.relay.expr.Var): assert storage_dev_type[1][0] == ctx1.device_type else: # device_copy op should be its dst_dev_type if isinstance(expr.attrs, tvm.relay.op.op_attrs.DeviceCopyAttrs): assert storage_dev_type[1][0] == expr.attrs.dst_dev_type else: assert storage_dev_type[1][0] == expected_dev_type[ expr.op.name].device_type
A = tvm.placeholder((rA, cA), dtype='float16') B = tvm.placeholder((rB, cB), dtype='float16') C = tvm.placeholder((rA, rB), dtype='float32') #C = tvm.placeholder((rA,rB),dtype = 'float16') assert (cA == cB) D = tvm.compute((rA//(16*block_tiles),rB//(16*block_tiles),(num_thread*block_warp),1),\ lambda i,j,w,warp:(A[i*16*block_tiles+(w//(num_thread)//block_row_warp)*16*warp_col_tile,0]+\ B[j*16*block_tiles+(w//(num_thread)%block_row_warp)*16*warp_row_tile,0]+\ C[i*16*block_tiles+(w//num_thread//block_row_warp)*16*warp_col_tile,j*16*block_tiles+(w//num_thread%block_row_warp)*16*warp_row_tile].astype('float16'))) s = schedule_gemm_fp16() print(tvm.lower(s, [A, B, C], name="matrix_dot", simple_mode=True)) f = tvm.build(s, [A, B, C], target='cuda', name='gemm_fp16') print("build finished") ctx = tvm.context('cuda', 4) a_np = np.float16(np.random.uniform(0., 1., size=(rA, cA))) b_np = np.float16(np.random.uniform(0., 1., size=(rB, cB))) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((rA, rB), dtype=C.dtype), ctx) #ci = tvm.nd.array(np.ones((rA//16, rB//16,16,16), dtype=C.dtype), ctx) f(a, b, c) ss = c.asnumpy() np.testing.assert_allclose(c.asnumpy(),\ np.dot(np.float32(a_np),\ np.float32(np.transpose(b_np))), rtol=1e-3)
def verify(target="llvm -mcpu=skylake-avx512"): if not tvm.module.enabled(target): print("skip because %s is not enabled..." % target) return ctx = tvm.context(target, 0) X = tvm.placeholder((m, k), name='X', dtype="uint8") W = tvm.placeholder((n, k), name='W', dtype="int8") pc = dot_16x1x16_int8_int8_int16() ak = tvm.reduce_axis((0, k), name='k') packedW = tvm.placeholder((n / 128, 128 * (k / 2), 2), name='packedW', dtype="int8") t_fc = tvm.compute( (m, n), lambda i, j: tvm.sum(X[i, ak].astype("int16") * packedW[j / 128, ( ak / 2) * 128 + j % 128, ak % 2].astype("int16"), axis=ak), name="F") t_sch = tvm.create_schedule(t_fc.op) a_x, a_y = t_fc.op.axis a_k, = t_fc.op.reduce_axis a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128) a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2) a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128) a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32) t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki) t_sch[t_fc].tensorize(a_yi, pc) # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True)) t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10) # generate the plain data a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") packW = np.random.uniform(1, 10, size=(n / 128, 128 * (k / 2), 2)).astype("int8") # This occurs in pre_compute stage for r_idx in range(n / 128): for s_idx in range(128 * (k / 2)): for t_idx in range(2): packW[r_idx][s_idx][t_idx] = b_[r_idx * 128 + s_idx % 128][s_idx / 128 * 2 + t_idx] x = tvm.nd.array(a_, ctx) w = tvm.nd.array(packW, ctx) y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx) result = t_evaluator(x, w, y) gops_per_sec = gops_per_mm / result.mean / 1e9 tvm.testing.assert_allclose(y.asnumpy(), np.dot(a_, b_.T), rtol=1e-5) print( 'Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}.' .format(result.mean * 1000, gops_per_sec, gops_per_sec / peak)) t_func.export_library("gemm_tensorize.o")
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) task = autotvm.task.create(schedule_conv2d_nhwc_auto, args=(batch, in_channel, in_size, num_filter, kernel, stride), target="cuda") print(task.config_space) # logging config (for printing tuning log to the screen) logging.getLogger('autotvm').setLevel(logging.DEBUG) logging.getLogger('autotvm').addHandler( logging.StreamHandler(sys.stdout)) # There are two steps for measuring a config: build and run. # By default, we use all cpu cores to compile program. Then measure them sequentially. # We measure 5 times and take average to reduce variance. measure_option = autotvm.measure_option( builder='local', runner=autotvm.LocalRunner(number=10)) tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=25, measure_option=measure_option, callbacks=[ autotvm.callback.log_to_file( 'conv2d_nhwc_{}.log'.format(in_size)) ]) with autotvm.apply_history_best('conv2d_nhwc_{}.log'.format(in_size)): with tvm.target.create(device): s, [A, W, B] = schedule_conv2d_nhwc_auto(batch, in_channel, in_size, num_filter, kernel, stride) func = tvm.build(s, [A, W, B], device, name=("ddd%dddd" % in_size)) @memoize("verify_nhwc") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding) return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(b_np.shape), dtype=dtype), ctx) func(a, w, b) timer_1 = func.time_evaluator(func.entry_name, ctx, number=10) tcost_1 = timer_1(a, w, b).mean np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print("1x1 convolution: average running time is {:.2f} us.".format( tcost_1 * 1e6))
def check_mod(mod, x_np, res_np): target = "vulkan" ctx = tvm.context(target, 0) ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) res = ex.evaluate()(x_np).asnumpy() tvm.testing.assert_allclose(res, res_np, atol=1e-5)
def test_simplex_data_transferring(): r""" Test the heterogeneous execution of a simple network where data transferring is from the target device to the host processor at runtime. The host processor is always assumed to be cpu, and the device varies. """ host = "cpu" target_host = "llvm" host_ctx = tvm.context(host) if not tvm.module.enabled(target_host): print("Skip test because llvm is not enabled.") return def check_device(device, target_device): if not tvm.module.enabled(target_device): print("Skip test because {} is not enabled.".format(target_device)) return device_ctx = tvm.context(device) graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type) shape = (4, ) # Create module for add whose target is the device. tensor_a = tvm.placeholder(shape, name="A") tensor_b = tvm.placeholder(shape, name="B") elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add") target = topi.cpp.TEST_create_target(device) schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add") # Insert copy. Neither compute nor schedule is required for the copy # node. The compute will be performed at runtime which is just data # copy from the input to the output. tensor_copy = tvm.placeholder(shape, name="__copy") # Create module for sub whose target is the host. tensor_c = tvm.placeholder(shape, name="C") elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub") schedule_sub = tvm.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub") target_flist = {target_device: [lower_add], target_host: [lower_sub]} mhost = tvm.build(target_flist, target_host=target_host) ctx = [host_ctx, device_ctx] mod = graph_runtime.create(graph, mhost, ctx) params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype( tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype( tensor_b.dtype) params["C"] = tensor_c = np.random.uniform(size=shape).astype( tensor_c.dtype) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal(out.asnumpy(), (tensor_a + tensor_b) - tensor_c) dev_tar = {"cuda": "cuda", "opencl": "opencl"} for device, target in dev_tar.items(): with tvm.target.create(device): check_device(device, target)
"--print_loss_val_each_epoch", help="Print loss value at the end of each epoch", action="store_true") args = parser.parse_args() models = [] executor_ctx = None print_loss_val_each_epoch = False if args.model == "logreg": models = [mnist_logreg] elif args.model == "mlp": models = [mnist_mlp] elif args.model == "all": models = [mnist_logreg, mnist_mlp] if args.executor_context == "cpu": tgt = "llvm" tgt_host = "llvm" elif args.executor_context == "gpu": tgt = "cuda" tgt_host = "llvm" assert False, "cuda codegen not ready" # create context object executor_ctx = tvm.context(tgt, 0) print_loss_val_each_epoch = True if args.print_loss_val_each_epoch \ else False num_epochs = args.num_epoch for m in models: m(executor_ctx, num_epochs, print_loss_val_each_epoch)
def compare_tflite_with_tvm(in_data, in_name, input_tensors, output_tensors, init_global_variables=False, out_names=None, quantized=False): """Generic function to generate and compare TFLite and TVM output""" in_data = convert_to_list(in_data) in_name = convert_to_list(in_name) out_names = convert_to_list(out_names) in_node = [0] * len(in_name) for i in range(len(in_name)): in_node[i] = in_name[i].split( ':')[0] if ":" in in_name[i] else in_name[i] with tf.Session() as sess: if init_global_variables: sess.run(variables.global_variables_initializer()) # convert to tflite model converter = interpreter_wrapper.TFLiteConverter.from_session( sess, input_tensors, output_tensors) if quantized: converter.inference_type = tf.lite.constants.QUANTIZED_UINT8 input_arrays = converter.get_input_arrays() input_stats = {} # hardcode the mean_values and std_dev_values (m,s) to be the same for all inputs # s = 255/(fmax-fmin); m = -fmin*s (the zero point) for i in input_arrays: input_stats[i] = (128., 1.275) converter.quantized_input_stats = input_stats tflite_model_buffer = converter.convert() tflite_output = run_tflite_graph(tflite_model_buffer, in_data) for device in ["llvm"]: ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) continue tvm_output = run_tvm_graph(tflite_model_buffer, in_data, in_node, target=device, num_output=len(out_names), out_names=out_names) if quantized: for i in range(len(tflite_output)): # allow absolute tolerance of 1 in the quantized results tvm.testing.assert_allclose(tflite_output[i], tvm_output[i], atol=1, rtol=1e-5) else: for i in range(len(tflite_output)): tvm.testing.assert_allclose(tflite_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
def test_num_thread(): N = 1024 M = 128 A = te.placeholder((N, ), name='A', dtype='float32') B = te.compute((N, ), lambda i: A[i], name='B') s = te.create_schedule([B.op]) o, i = s[B].split(s[B].op.axis[0], M) s[B].bind(o, te.thread_axis('threadIdx.x')) s[B].bind(i, te.thread_axis("threadIdx.y")) # shared memory usage: 0 # thread usage: N for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] with tvm.transform.PassContext( config={ "tir.add_lower_pass": [( 2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1))] }): tvm.build(s, [A, B], target) assert not valid[0] with tvm.transform.PassContext( config={ "tir.add_lower_pass": [( 2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N))] }): tvm.build(s, [A, B], target) assert valid[0] with tvm.transform.PassContext( config={ "tir.add_lower_pass": [( 2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N, max_thread_y=M - 1))] }): tvm.build(s, [A, B], target) assert not valid[0] with tvm.transform.PassContext( config={ "tir.add_lower_pass": [( 2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N, max_thread_y=M))] }): tvm.build(s, [A, B], target) assert valid[0]
def search_common( task=None, target="llvm", search_policy="sketch", runner="local", num_measure_trials=100, cost_model=auto_scheduler.RandomModel(), init_search_callbacks=None, ): if task is None: task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(64, 64, 64), target=target) target = task.target print("Test search policy '%s' for '%s'" % (search_policy, target)) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name init_search_callbacks = init_search_callbacks or [] init_search_callbacks.append( auto_scheduler.PreloadMeasuredStates(log_file)) if search_policy == "empty": search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == "sketch": search_policy = auto_scheduler.SketchPolicy( task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks) else: raise ValueError("Invalid policy: " + search_policy) # Tune tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, num_measures_per_round=2, early_stopping=1, runner=runner, measure_callbacks=[ auto_scheduler.RecordToFile(log_file), CustomMeasureCallback() ], ) task.tune(tuning_options=tuning_options, search_policy=search_policy) # Compile with the best schedule sch, args = task.apply_best(log_file) mod = tvm.build(sch, args, target) # Compile with naive schedule for correctness check sch, args = task.compute_dag.apply_steps_from_state( task.compute_dag.init_state) mod_ref = tvm.build(sch, args, "llvm") ctx = tvm.context(str(target), 0) np_arrays = [ np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args ] tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays] mod(*tvm_arrays) actual = [x.asnumpy() for x in tvm_arrays] tvm_arrays = [tvm.nd.array(x) for x in np_arrays] mod_ref(*tvm_arrays) expected = [x.asnumpy() for x in tvm_arrays] for x, y in zip(actual, expected): tvm.testing.assert_allclose(x, y, rtol=1e-5)
def check_device(): A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') out_dtype = 'float32' wkl, sch_default = _spatial_get_sch(A, W, stride, padding, out_dtype) sch = sch_default if schedule is None else schedule a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() # device = 'llvm' device = 'llvm -mcpu=skylake-avx512' ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): print('--- schedule data packing ---') A_vec, s = _spatial_pack_data_only(wkl, sch, A) print(A_vec.shape) a_vec_shape = get_const_tuple(A_vec.shape) a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx) print(tvm.lower(s, [A, A_vec], simple_mode=True)) func = tvm.build(s, [A, A_vec], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a, a_vec).mean print('data -> data_vec: %g secs/op' % cost) print('--- schedule kernel packing ---') W_vec, s = _spatial_pack_kernel_only(wkl, sch, W) print(W_vec.shape) w_vec_shape = get_const_tuple(W_vec.shape) w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx) # print(tvm.lower(s, [W, W_vec], simple_mode=True)) func = tvm.build(s, [W, W_vec], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(w, w_vec).mean print('kernel -> kernel_vec: %g secs/op' % cost) print('--- schedule conv & unpack ---') A_vec = tvm.placeholder(a_vec_shape, name='A_vec') W_vec = tvm.placeholder(w_vec_shape, name='W_vec') B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) # print(tvm.lower(s, [A_vec, W_vec, B], simple_mode=True)) func = tvm.build(s, [A_vec, W_vec, B], target=device) func.save('conv_unpack.asm') time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost = time_f(a_vec, w_vec, b).mean print('conv & unpack: %g secs/op' % cost) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print(b_np.shape)
(56, 56, 256, 128, 1, 1, 0, 0, 2, 2), (28, 28, 128, 512, 1, 1, 0, 0, 1, 1), (56, 56, 256, 512, 1, 1, 0, 0, 2, 2), (28, 28, 512, 128, 1, 1, 0, 0, 1, 1), (28, 28, 512, 256, 1, 1, 0, 0, 2, 2), (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1)] TARGET_NAME = 'llvm -mcpu=skylake-avx512' NUM_VEC_LANES = 16 CTX = tvm.context(TARGET_NAME, 0) def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, hstride, wstride, out_dtype): """ Finds out the shape of all data structures """ ## Find shapes data_shape = (1, in_filter // NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES) if out_dtype == 'int32': kernel_shape = (out_filter // NUM_VEC_LANES, in_filter // NUM_VEC_LANES, k_h, k_w, NUM_VEC_LANES // 4, NUM_VEC_LANES, 4)