def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return temp = util.tempdir() name = "myadd_%s" % device if sys.platform == "darwin" or sys.platform.startswith('linux'): f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name) elif sys.platform == "win32": f = tvm.build(s, [A, B], device, "llvm", name=name) else: raise ValueError("Unsupported platform") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) f1 = tvm.module.load(path_dso) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) f1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) if sys.platform != "win32": f2 = tvm.module.system_lib() f2[name](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_group_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) else: out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f = tvm.build(s, [data, valid_count, out], device) f(tvm_data, tvm_valid_count, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) f = tvm.build(indices_s, [data, valid_count, indices_out], device) f(tvm_data, tvm_valid_count, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), (dilation, dilation), layout='NCHW%dc'%ic_block, out_layout="NCHW%dc"%oc_block, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
def _build(funcs, target, target_host): tvm_t = tvm.target.create(target) if tvm_t.device_name == "vta": return tvm.build(funcs, target="ext_dev", target_host=target_host) elif tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": return tvm.build(funcs, target=target_host) return tvm.build(funcs, target=target)
def test_local_memory(): N = 1024 M = 128 A = tvm.placeholder((N,), name='A', dtype='float32') B = tvm.compute((N, ), lambda i: A[i], name='B') s = tvm.create_schedule([B.op]) AA = s.cache_read(A, "local", [B]) o, i = s[B].split(s[B].op.axis[0], M) s[AA].compute_at(s[B], o) s[B].bind(o, tvm.thread_axis("blockIdx.x")) # local memory usage: M * 4B # thread usage: M for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1))]}): tvm.build(s, [A, B], target) assert not valid[0] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M, max_threads_per_block=1))]}): tvm.build(s, [A, B], target) assert valid[0]
def test_multiple_kernels(): N = 1024 A = tvm.placeholder((N, N), name='A') B = tvm.compute((N, N), lambda i, j: A[i, j]) C = tvm.compute((N, N), lambda i, j: B[i, j]) s = tvm.create_schedule([C.op]) s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x")) s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x")) # shared memory usage: 0 # thread usage: N for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1))]}): tvm.build(s, [A, C], target) assert not valid[0] with tvm.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N))]}): tvm.build(s, [A, C], target) assert valid[0]
def main(): n = tvm.var('n') A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) s[C].parallel(s[C].op.axis[0]) print(tvm.lower(s, [A, B, C], simple_mode=True)) tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): # declare DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, (stride_h, stride_w), padding_args, (dilation, dilation), in_layout, out_layout, dtype) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) # Prepare pod type for test data closure input_shape = (batch, in_channel, in_height, in_width) filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride, padding) relu_scipy = np.maximum(depthwise_conv2d_scipy, 0) return (_transform_data(input_np, ic_block), _transform_kernel(filter_np, oc_block), _transform_data(depthwise_conv2d_scipy, oc_block), _transform_data(relu_scipy, oc_block)) # Get the test data (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') temp = util.tempdir() s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd") path_dso1 = temp.relpath("dev_lib.dylib") f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso1) s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso2 = temp.relpath("cpu_lib.dylib") f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso2) # Start RPC test server that contains the compiled library. server = xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, libs=[path_dso1, path_dso2]) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) ctx = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # CPU ctx = remote.cpu(0) f2 = remote.load_module("cpu_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, hstride, wstride): """ Runs the inference and checks the functional correctness between compute and schedule outputs """ (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, hstride, wstride, out_dtype) # Create TVM placeholders data = tvm.placeholder(data_shape, name='data', dtype=data_dtype) kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) # Create the numpy arrays to be used for executing conv models if data_dtype == 'float32': data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX) kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX) else: data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # c_orig will be used for declaration ouptut # c_sch will be used for scheduled computation output c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) with tvm.target.create(TARGET_NAME): conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride, padding=hpad, layout='NCHWc', out_layout='NCHWc', out_dtype=out_dtype) out = topi.nn.relu(conv) sch = tvm.create_schedule(out.op) func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out') func(data_array, kernel_array, c_orig) LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True)) # Generate and run the optimized schedule sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out]) func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv') func(data_array, kernel_array, c_sch) # Functional check if data_dtype == 'uint8': np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy()) else: assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(data_array, kernel_array, c_sch).mean
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return target = topi.cpp.TEST_create_target(device) ctx = tvm.context(device, 0) out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) f = tvm.build(s1, [A, B], device, name="full_like") f(tvm.nd.array(np.zeros(shape, dtype), ctx), out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5) f = tvm.build(s2, [C], device, name="full") f(out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, activation_bits, weight_bits, unipolar): in_height = in_width = in_size input_type = 'uint32' out_dtype = 'int16' device = 'llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon' with tvm.target.create(device): A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') B = topi.nn.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, pack_dtype='uint8', out_dtype='int16', unipolar=unipolar) s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) func = tvm.build(s, [A, W, B], device) assembly = func.get_source('asm') matches = re.findall("vpadal", assembly) assert (len(matches) > 0) matches = re.findall("vcnt", assembly) assert (len(matches) > 0) matches = re.findall("vpadd", assembly) assert (len(matches) > 0) ctx = tvm.context(device, 0) if 'arm' not in os.uname()[4]: print ("Skipped running code, not an arm device") return print("Running on target: %s" % device) def get_ref_data(): a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) if unipolar: w_ = np.copy(w_np).astype(out_dtype) for x in np.nditer(w_, op_flags=['readwrite']): x[...] = 1 if x == 1 else -1 b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) else: b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def prepare_test_libs(base_path): n = tvm.var("n") A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') s = tvm.create_schedule(B.op) # Compile library as dynamic library fadd_dylib = tvm.build(s, [A, B], "llvm", name="addone") dylib_path = os.path.join(base_path, "test_addone_dll.so") fadd_dylib.export_library(dylib_path) # Compile library in system library mode fadd_syslib = tvm.build(s, [A, B], "llvm --system-lib", name="addonesys") syslib_path = os.path.join(base_path, "test_addone_sys.o") fadd_syslib.save(syslib_path)
def build(*args, **kwargs): """Thin wrapper of tvm.build This wrapper automatically applies VTA's build_config if there is no user specified build_config in context. See Also -------- tvm.build : The original TVM's build function """ cfg = tvm.build_module.current_build_config() if not cfg.add_lower_pass: with build_config(): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs)
def test_min_repeat_ms(): tmp = tempdir() filename = tmp.relpath("log") @tvm.register_func def my_debug(filename): """one call lasts for 100 ms and writes one character to a file""" time.sleep(0.1) with open(filename, "a") as fout: fout.write("c") X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename)) s = tvm.create_schedule(X.op) func = tvm.build(s, [X]) x = tvm.nd.empty((), dtype="int32") ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1) ftimer(x) with open(filename, "r") as fin: ct = len(fin.readline()) assert ct == 2 ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1, min_repeat_ms=1000) ftimer(x) # make sure we get more than 10 calls with open(filename, "r") as fin: ct = len(fin.readline()) assert ct > 10 + 2
def test_double_splitting_with_indivisible_factors(): m = 48 dtype="float32" A = tvm.placeholder((m,), name='A', dtype=dtype) C = tvm.compute((m,), lambda i: A[i], name='C') D = tvm.compute((m,), lambda i: C[i], name='D') s = tvm.create_schedule(D.op) co, ci = s[C].split(C.op.axis[0], factor=10) do, di = s[D].split(D.op.axis[0], 32) s[C].compute_at(s[D], do) target = 'llvm' with tvm.build_config(partition_const_loop=True): f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) func = tvm.build(f, target=target) # Find the beginning of the Halide IR corresponding to kernel code # and make sure it doesn't have an if statements left top_produce = find_top_produce(f.body) assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) # check functional correctness of generated code ctx = tvm.context(target, 0) a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx) c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) func(a, c, d) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) func1 = tvm.build(s1, [A, W, B], device) func2 = tvm.build(s2, [A, W, C], device) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def test_sort_np(): dshape = (1, 2, 3, 4, 5, 6) axis = 4 reduced_shape = (1, 2, 3, 4, 6) is_descend = False data = tvm.placeholder(dshape, name='data') sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32") out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend), dtype='int32', name="sort_tensor") ctx = tvm.cpu(0) target = "llvm" s = tvm.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) sort_num_input = np.full(reduced_shape, dshape[axis]) a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) s = topi.cpp.cuda.schedule_injective(target, [C]) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) if typ == "add": out_npy = lhs_npy + rhs_npy elif typ == "sub": out_npy = lhs_npy - rhs_npy elif typ == "div": rhs_npy = np.abs(rhs_npy) + 0.001 out_npy = lhs_npy / rhs_npy elif typ == "mul": out_npy = lhs_npy * rhs_npy elif typ == "maximum": out_npy = np.maximum(lhs_npy, rhs_npy) elif typ == "minimum": out_npy = np.minimum(lhs_npy, rhs_npy) elif typ == "pow": out_npy = lhs_npy ** rhs_npy else: raise NotImplementedError lhs_nd = tvm.nd.array(lhs_npy, ctx) rhs_nd = tvm.nd.array(rhs_npy, ctx) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) for _ in range(1): foo(lhs_nd, rhs_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
def test_dynamic_tensor(): dtype = 'float32' stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') s = tvm.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) a = tvmsp.array(a, ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr f(a.data.shape[0], a.data, c.data) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"): A = tvm.placeholder(shape, dtype=dtype, name="A") C = topi.sum(A, axis=0, keepdims=False) s = tvm.create_schedule(C.op) f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name) return f
def test_sort(): n = 2 l = 5 m = 3 data = tvm.placeholder((n, l, m), name='data') sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32") axis = 1 is_descend = True out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend), dtype='int32', name="sort_tensor") input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]], [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]] sort_num_input = [[1, 2, 3], [4, 5, 5]] sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]], [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]] ctx = tvm.cpu(0) target = "llvm" s = tvm.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) a = tvm.nd.array(np.array(input).astype(data.dtype), ctx) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def dump_graph_lib(target_dir): dim = 4 A = tvm.placeholder((dim,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') sched = tvm.create_schedule(B.op) node0 = {"op": "null", "name": "x", "inputs": []} node1 = {"op": "tvm_op", "name": "add", "inputs": [[0, 0, 0]], "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs" : "1", "num_outputs" : "1"}} nodes = [node0, node1] arg_nodes = [0] node_row_ptr = [0, 1, 2] outputs = [[1, 0, 0]] shape = (4,) attrs = { "shape" : ["list_shape", [shape, shape]], "dltype" : ["list_str", ["float32", "float32"]], "storage_id" : ["list_int", [0, 1]], } graph = {"nodes": nodes, "arg_nodes": arg_nodes, "node_row_ptr": node_row_ptr, "heads": outputs, "attrs": attrs} graph = json.dumps(graph) mlib = tvm.build(sched, [A, B], "llvm", name="myadd") mlib.export_library(os.path.join(target_dir, "graph_addone_lib.so")) with open(os.path.join(target_dir, "graph_addone.json"), "w") as fo: fo.write(graph)
def verify(s, check_correctness): mod = tvm.build(s, [data, kernel, res], target_host=env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.cpu(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_orig, ctx) kernel_arr = tvm.nd.array(kernel_orig, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, res_arr) res_unpack = res_arr.asnumpy() if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def test_log_pow_llvm(): # graph n = tvm.var('n') A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: tvm.power(tvm.log(A(*i)), 2.0), name='B') s = tvm.create_schedule(B.op) # create iter var and assign them tags. bx, tx = s[B].split(B.op.axis[0], factor=32) # one line to build the function. if not tvm.module.enabled("llvm"): return flog = tvm.build(s, [A, B], "llvm", name="mylog") ctx = tvm.cpu(0) # launch the kernel. n = 1028 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) repeat = 10 ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat) res = ftimer(a, b) assert(len(res.results) == repeat) np.testing.assert_allclose( b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5)
def test_const_param(): @tvm.hybrid.script def add_something(a, b): c = output_tensor((11, ), 'int32') for i in range(11): c[i] = a[i] + b return c a = tvm.placeholder((11, ), dtype='int32', name='a') b = tvm.const(11, 'int32') c = add_something(a, b) sch = tvm.create_schedule(c.op) module = tvm.build(sch, [a, c], 'llvm') assert(module) np_a = numpy.arange(11).astype('int32') np_b = 11 np_c = numpy.zeros((11, )).astype('int32') nd_a = tvm.ndarray.array(np_a) nd_c = tvm.ndarray.array(numpy.zeros((11, )).astype('int32')) module(nd_a, nd_c) ref = add_something(np_a, 11) tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
def test_upstream(): @tvm.hybrid.script def upstream(a): b = output_tensor((20, ), 'float32') for i in range(20): b[i] = a[i] * i return b a = tvm.placeholder((20, ), 'float32') b = tvm.placeholder((20, ), 'float32') c = tvm.compute((20, ), lambda x: a[x] + b[x]) d = upstream(c) sch = tvm.create_schedule([c.op, d.op]) ir = tvm.lower(sch, [a, b, d], simple_mode=True) func = tvm.build(sch, [a, b, d]) assert(func) a = numpy.random.randn(20).astype('float32') b = numpy.random.randn(20).astype('float32') ref = numpy.zeros((20, ), 'float32') for i in range(20): ref[i] = (a[i] + b[i]) * i tvm_a = tvm.nd.array(a) tvm_b = tvm.nd.array(b) tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32')) func(tvm_a, tvm_b, tvm_d) tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
def test_downstream(): @tvm.hybrid.script def downstream(a): b = output_tensor((20, ), 'float32') for i in range(20): b[i] = a[i] * i return b a = tvm.placeholder((20, ), 'float32') b = downstream(a) c = tvm.compute((20, ), lambda x: b[x] + 1.0) sch = tvm.create_schedule(c.op) module = tvm.build(sch, [a, c]) assert module a = numpy.random.randn(20).astype('float32') ref = numpy.zeros((20, )).astype('float32') for i in range(20): ref[i] = (a[i] * i) + 1.0 tvm_a = tvm.nd.array(a) tvm_c = tvm.nd.array(numpy.zeros((20, )).astype('float32')) module(tvm_a, tvm_c) tvm.testing.assert_allclose(tvm_c.asnumpy(), ref, 1e-5, 1e-5)
def test_value_index(): @tvm.hybrid.script def kernel_a(a): b = output_tensor((16, ), 'int32') c = output_tensor((4, 4), 'int32') for i in range(16): b[i] = a[i] + 2 c[i // 4, i % 4] = a[i] + 1 return b, c @tvm.hybrid.script def kernel_b(b, a): c = output_tensor((4, 4), 'int32') for i in range(4): for j in range(4): c[i, j] = a[i * 4 + j] * b[i, j] return c a = tvm.placeholder((16, ), 'int32') b, c = kernel_a(a) d = kernel_b(c, b) sch = tvm.create_schedule(d.op) module = tvm.build(sch, [a, d]) assert module np_a = numpy.arange(16).astype('int32') np_b, np_c = kernel_a(np_a) ref = kernel_b(np_c, np_b) res = tvm.ndarray.array(numpy.zeros((4, 4)).astype('int32')) module(tvm.ndarray.array(np_a), res) tvm.testing.assert_allclose(res.asnumpy(), ref)
# ----------- # After we have finished specifying the schedule, we can compile it # into a TVM function. By default TVM compiles into a type-erased # function that can be directly called from python side. # # In the following line, we use tvm.build to create a function. # The build function takes the schedule, the desired signature of the # function(including the inputs and outputs) as well as target language # we want to compile to. # # The result of compilation fadd is a GPU device function(if GPU is involved) # that can as well as a host wrapper that calls into the GPU function. # fadd is the generated host wrapper function, it contains reference # to the generated device function internally. # fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") ###################################################################### # Run the Function # ---------------- # The compiled function TVM function is designed to be a concise C API # that can be invoked from any languages. # # We provide an minimum array API in python to aid quick testing and prototyping. # The array API is based on `DLPack <https://github.com/dmlc/dlpack>`_ standard. # # - We first create a gpu context. # - Then tvm.nd.array copies the data to gpu. # - fadd runs the actual computation. # - asnumpy() copies the gpu array back to cpu and we can use this to verify correctness #
def build_and_run(s, tensors, control_f, shape, time_count, count=10, device_id=0, target="llvm", timeout=10.0): """ Build and record the time of running. Args: ----------------------------- s: schedule.Schedule get form the student's auto_schedule tensors (list) the input tensors and the output tensor control_f the torch function shape time_count: used for record the running time count: the number rounds repeat testing device_id : the id of CPU ----------------------------- Returns: ----------------------------- [tvm_time, torch_time]: [float , flaot] which indicates the total time of running scheduled tvm calculation and the total time of running torch calculation ----------------------------- """ # Create ctx. try: ctx = tvm.cpu(device_id) except Exception as e: string = "Can not found device !!!\n" + str(e) time_count.put([string, -1]) return -1 try: output_tensor = tensors[-1] del tensors[-1] except Exception as e: string = "The input is not correct !!!" + str(e) time_count.put([string, -1]) return -1 # Craft input data. try: input_tvm = [] input_torch = [] for tensor in tensors: data = np.random.random( [int(j) for j in tensor.shape]).astype(np.float32) * 100 tvm_data = tvm.nd.array(data, ctx) torch_data = torch.tensor(data) input_tvm.append(tvm_data) input_torch.append(torch_data) output_holder = tvm.nd.array( np.zeros([int(j) for j in output_tensor.shape], dtype=output_tensor.dtype), ctx ) input_tvm = input_tvm + [output_holder] except Exception as e: string = "Can't prepare input data!!!\n" + str(e) time_count.put([string, -1]) return -1 torch_args = [] # TODO use shape length to distinguish conv2d and gemm is foolish # No bias if this is convolution if len(shape) > 8 and shape[8] == 0: torch_args.append(None) torch_args.extend(shape[9:]) # warm-up control_f(*(input_torch + torch_args)) begin = time.time() for i in range(0, count): control_f(*(input_torch + torch_args)) end = time.time() torch_time = (end - begin) * 1e3 / count # Build function form s and tensors. try: func = tvm.build(s, tensors + [output_tensor], target=target) except Exception as e: string = "Can not build successfully !!!" + str(e) time_count.put([string, torch_time]) return -1 signal.signal(signal.SIGALRM, handler) signal.alarm(ceil(timeout)) try: evaluator = func.time_evaluator(func.entry_name, ctx, number=count) tvm_time = evaluator(*input_tvm).mean * 1e3 except TimeoutError: string = "Timeout when evaluating, the limit is {}ms".format(timeout / count * 1e3) time_count.put([string, torch_time]) return -1 except Exception as e: string = "The culation is not correct !!!\n" + str(e) time_count.put([string, torch_time]) return -1 finally: # restore the default handler signal.signal(signal.SIGALRM,signal.SIG_IGN) time_count.put([tvm_time, torch_time]) return 0
bi = 0 if shape2[0] == 1 else x bj = 0 if shape2[1] == 1 else y return A[ai, aj] + B[bi, bj] C = tvm.compute((m, n), f, name='C') return A, B, C m, n = 3, 4 shape1 = (m, 1) shape2 = (m, n) A, B, C = broadcast_add(shape1, shape2) s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, B], simple_mode=True)) mod = tvm.build(s, [A, B, C]) def get_bcast_data(shape1, shape2, constructor=None): """Return random tensors a, b and empty tensor c to store broadcast results between a and b shape1, shape2: shapes of input tensors constructor : user-defined tensor constructor """ np.random.seed(0) a = np.random.normal(size=shape1).astype("float32") b = np.random.normal(size=shape2).astype("float32") out_shape = (shape1[0] if shape2[0] == 1 else shape2[0], shape1[1] if shape2[1] == 1 else shape2[1]) c = np.empty(out_shape, dtype='float32') if constructor:
# Create LLVM ir from c source code ll_code = clang.create_llvm(cc_code, output=ll_path) return ll_code ###################################################################### # Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline. # The importing needs to happen before the tensorized GEMV being executed. # s[C].pragma(x, "import_llvm", gemv_impl()) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### # Finally we compare the tensorize version with that :code:`numpy.dot` produces, # ensure our implementation is correct. # func = tvm.build(s, [A, B, C], target="llvm", name="gemv") from topi.util import get_const_tuple dtype = A.dtype ctx = tvm.context("cpu", 0) a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### # Reduce-update for Tensorize # --------------------------- # So far you have learned the basic idea of tensorize, # now let's move one step forward to a more complicated case.
import os input(os.getpid()) #np.set_printoptions(threshold=np.nan) import sys np.set_printoptions(threshold=np.sys.maxsize) batch_size = 4 data = relay.var("data", relay.TensorType((batch_size, 1000), "float32")) #simple_net = relay.nn.softmax(data) maxelem = relay.nn.softmaxMax(data) #maxelem = relay.reshape(maxelem,(batch_size,)) sumelem = relay.nn.softmaxSum(maxelem, data) simple_net = relay.nn.softmaxDiv(sumelem, maxelem, data) node = relay.analysis.free_vars(simple_net) print("**************test1*************") simple_net = relay.Function(node, simple_net) net, params = testing.create_workload(simple_net) print("----------TEST4----------") tg = "dpu" print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") mod, _ = relay.optimize(net, tg, params) graph0, func0, params0 = graph_runtime_codegen.GraphRuntimeCodegen( None, tg).codegen(mod["main"]) func = tvm.build(func0, tg, name="default_function") print(func.get_source()) #print(tvm.lower(lib,[data, conv1_weight, simple_net], simple_mode=True)) print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
data_expand = tvm.compute( (N, D + 1), lambda n, d: tvm.select( (d < D), data[n, d], tvm.const(1, dtype=data.dtype)), name='data_expand') rd = tvm.reduce_axis((0, D + 1), name='rd') dot = tvm.compute((N, ), lambda n: tvm.sum(weight[rd] * data_expand[n, rd], axis=rd), name='dot') pred = tvm.compute((N, ), lambda n: tvm.select( (dot[n] > 0), tvm.const(1, dtype=label.dtype), tvm.const(-1, dtype=label.dtype)), name='pred') rn = tvm.reduce_axis((0, N), name='rn') err = tvm.compute((1, ), lambda _: tvm.sum(1, rn, label[rn] != pred[rn]), name='err') # === End computation # Scheduling s = tvm.create_schedule([pred.op, err.op]) # Compilation calc = tvm.build(s, [data, label, weight, err]) assert calc
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): # schedule s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift) s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) ctx = tvm.context(device, 0) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare pod type for test data closure dtype = Input.dtype input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) scale_shape = get_const_tuple(Scale.shape) shift_shape = get_const_tuple(Shift.shape) scale_shift_shape = get_const_tuple(ScaleShift.shape) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) scale_np = np.random.uniform(size=scale_shape).astype(dtype) shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride=stride, padding=padding) scale_shift_scipy = np.zeros(shape=scale_shift_shape) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) return (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) # Get the test data (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
name="packedB") C = te.compute( (M, N), lambda x, y: te.sum( A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k), name="C", ) s = te.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) (k, ) = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=4) s[C].reorder(xo, yo, ko, xi, ki, yi) s[C].vectorize(yi) x, y, z = s[packedB].op.axis s[packedB].vectorize(z) s[packedB].parallel(x) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) evaluator = func.time_evaluator(func.entry_name, ctx, number=10) print("Opt4: %f" % evaluator(a, b, c).mean)
def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') a_np = np.random.uniform(size=1024).astype(A.dtype) temp = util.tempdir() # Establish remote connection with target hardware tracker = rpc.connect_tracker(tracker_host, tracker_port) remote = tracker.request(key, priority=0, session_timeout=60) # Compile the Graph for CPU target s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso_cpu = temp.relpath("cpu_lib.so") f.export_library(path_dso_cpu, ndk.create_shared) # Execute the portable graph on cpu target print('Run CPU test ...') ctx = remote.cpu(0) remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Compile the Graph for OpenCL target if test_opencl: s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") path_dso_cl = temp.relpath("dev_lib_cl.so") f.export_library(path_dso_cl, ndk.create_shared) print('Run GPU(OpenCL Flavor) test ...') ctx = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Compile the Graph for Vulkan target if test_vulkan: s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") f.export_library(path_dso_vulkan, ndk.create_shared) print('Run GPU(Vulkan Flavor) test ...') ctx = remote.vulkan(0) remote.upload(path_dso_vulkan) f1 = remote.load_module("dev_lib_vulkan.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
# ------------------------------------------------- # Here we will declare a simple kernel with TVM on the local machine: # n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') s = tvm.create_schedule(B.op) ###################################################################### # Then we cross compile the kernel: # # the target here should be 'llvm -target=armv7l-none-linux-gnueabihf', # and we use 'llvm' here to make example run locally, see the detailed # note in the following block f = tvm.build(s, [A, B], target='llvm', name='myadd') # save the lib at local temp folder temp = util.tempdir() path = temp.relpath('mylib.o') f.save(path) ###################################################################### # .. note:: # # the argument :code:`target` in :code:`build` should be replaced # :code:`'llvm'` with the target triple of your device, which might be # different for different device. For example, it is # :code:`'llvm -target=armv7l-none-linux-gnueabihf'` for my Raspberry # Pi. Here we use :code:`'llvm'` directly to make the tutorial runable. # # Usually, you can query the target by execute :code:`gcc -v` on your
def check_verify(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") def myadd(*args): to_return = mlib["myadd"](*args) time.sleep(0.25) return to_return mlib_proxy = tvm.support.FrontendTestModule() mlib_proxy["myadd"] = myadd try: mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0)) except ValueError: return a = np.random.uniform(size=(n, )).astype(A.dtype) mod.set_input(x=a) # verify dumproot created directory = mod._dump_path assert os.path.exists(directory) # verify graph is there GRAPH_DUMP_FILE_NAME = "_tvmdbg_graph_dump.json" assert len(os.listdir(directory)) == 1 # verify the file name is proper graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME) assert os.path.exists(graph_dump_path) # verify the graph contains some expected keys with open(graph_dump_path) as graph_f: dumped_graph = json.load(graph_f) assert isinstance(dumped_graph, dict) for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"): assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}" mod.run() # Verify the tensors are dumped assert len(os.listdir(directory)) > 1 debug_lines = mod.debug_datum.get_debug_result().split("\n") def split_debug_line(i): to_return = re.split(r" [ ]*", debug_lines[i]) assert to_return[-1] == "" to_return = to_return[:-1] # strip empty trailing part return to_return assert split_debug_line(0) == [ "Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs", ] myadd_lines = split_debug_line(2) assert myadd_lines[0] == "add" assert myadd_lines[1] == "myadd" runtime_sec = float(myadd_lines[2]) / 1e6 # printed in us # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude. # Here we just care that the prefix is correct. assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000 total_lines = split_debug_line(3) assert total_lines[0] == "Total_time" assert total_lines[2] == myadd_lines[2] CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json" assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME)) with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f: trace = json.load(f) assert trace["displayTimeUnit"] == "ns" events = trace["traceEvents"] assert len(events) == 4 assert all(event["ph"] in ("B", "E") for event in events) assert all(event["pid"] == 1 for event in events) assert all(event["tid"] == 1 for event in events) assert all(event["name"] == "x" for event in events[:2]) assert all(event["name"] == "add" for event in events[2:]) assert events[0]["ts"] == 0 assert events[0]["ph"] == "B" # verify the output is correct out = mod.get_output(0, tvm.nd.empty((n, ))) np.testing.assert_equal(out.numpy(), a + 1) mod.exit() # verify dump root delete after cleanup assert not os.path.exists(directory)
def tune_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, padding, stride, ctx, n_times=1, target_host=None, remote=None): in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=dtype, name='data') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=dtype, name='weight') # get verify data a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @memoize("topi.tests.test_topi_conv2d.verify_conv2d") def get_ref_data(): a_np = np.random.uniform(size=a_shape) w_np = np.random.uniform(size=w_shape) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() a = tvm.nd.array(a_np.astype(dtype), ctx) w = tvm.nd.array(w_np.astype(dtype), ctx) b = tvm.nd.array(np.zeros(b_np.shape).astype(dtype), ctx) # generate static config #tune_pack = generate_tune_packs([ # ["bn", [4]], # ["num_thread", [1, 2, 4, 8, 16]], # ["unroll_step", [1, 4, 16]], # ]) tune_pack = generate_tune_packs([ ["VH", [1, 2, 4]], ["VW", [1, 2, 4, 8]], ["VC", [1, 2, 4, 8]], ["num_thread", [1, 2, 4, 16, 32, 64]], ]) # search best_cost = 1e9 best_config = None for config in reversed(tune_pack): with tvm.target.mali(): tvm.target.current_target().tune_config = config B = topi.nn.conv2d(A, W, stride, padding) s = topi.generic.schedule_conv2d_nchw([B]) func = tvm.build(s, [A, W, B], target_host=target_host) if remote is not None: func = convert_to_remote(func, remote) time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) cost = time_f(a, w, b).mean try: np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-4) except Exception as e: pass gflops = 2.0 * np.prod( b.shape) * kernel * kernel * in_channel / (1e9) / cost print(config, cost, gflops) if cost < best_cost: best_cost = cost best_config = config return best_cost, 2.0 * np.prod(b.shape) * kernel * kernel * in_channel / ( 1e9) / best_cost, best_config
n, c, h, w = s[A_ch].op.axis hw = s[A_ch].fuse(h, w) co, ci = s[A_ch].split(c, factor=threaddim) hwo, hwi = s[A_ch].split(hw, factor=threaddim * div) hwio, hwii = s[A_ch].split(hwi, factor=div) hwiio, hwiii = s[A_ch].split(hwii, factor=4) s[A_ch].bind(n, block_z) s[A_ch].bind(co, block_y) s[A_ch].bind(hwo, block_x) s[A_ch].bind(ci, thread_y) s[A_ch].bind(hwio, thread_x) s[A_ch].reorder(n, co, hwo, ci, hwio, hwiio, hwiii) s[A_ch].vectorize(hwiii) print(tvm.lower(s, [A, A_ch], simple_mode=True)) func = tvm.build(s, [A, A_ch], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(batch, in_size, in_size, in_channel)).astype(A.dtype) a = tvm.nd.array(a_np, ctx) a_ch = tvm.nd.array( np.zeros((batch, in_channel, in_size, in_size), dtype=A_ch.dtype), ctx) func(a, a_ch) evaluator = func.time_evaluator(func.entry_name, ctx, number=1) conv_time = evaluator(a, a_ch).mean * 1e3 tot_byte = batch * in_channel * in_size * in_size * 4 / 1024 / 1024 / 1024 # GB print('Convolution: %f ms, Bandwidth: %f GB/s' % (conv_time, tot_byte / conv_time * 1000 * 2)) dev_module = func.imported_modules[0] print(dev_module)
shape_size1 = [dim0, dim1] shape_size2 = [dim0, dim2] dtype = "float32" A = tvm.te.placeholder(shape_size1, dtype=dtype, name="A") B = tvm.te.placeholder(shape_size2, dtype=dtype, name="B") C = topi.concatenate([A, B], axis=1) dC = tvm.te.placeholder(C.shape, dtype=dtype, name="dC") dA, dB = tvm.te.mygradient(C, [A, B], dC) s = tvm.te.create_schedule([C.op, dA.op, dB.op]) print(tvm.lower(s, [A, B, dC, dA, dB], simple_mode=True)) func = tvm.build(s, [A, B, dC, dA, dB], target="llvm") A_np = np.random.uniform(-10, 10, shape_size1).astype("float32") B_np = np.random.uniform(-10, 10, shape_size2).astype("float32") dC_np = np.ones([dim0, dim1 + dim2]).astype("float32") dA_np = np.zeros(shape_size1).astype("float32") dB_np = np.zeros(shape_size2).astype("float32") ctx = tvm.context("llvm", 0) A_tvm = tvm.nd.array(A_np, ctx) B_tvm = tvm.nd.array(B_np, ctx) dC_tvm = tvm.nd.array(dC_np, ctx) dA_tvm = tvm.nd.array(dA_np, ctx) dB_tvm = tvm.nd.array(dB_np, ctx)
def search_common( task=None, target="llvm", search_policy="sketch", runner="local", num_measure_trials=100, cost_model=auto_scheduler.RandomModel(), init_search_callbacks=None, ): if task is None: task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(64, 64, 64), target=target) target = task.target print("Test search policy '%s' for '%s'" % (search_policy, target)) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name init_search_callbacks = init_search_callbacks or [] init_search_callbacks.append( auto_scheduler.PreloadMeasuredStates(log_file)) if search_policy == "empty": search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == "sketch": search_policy = auto_scheduler.SketchPolicy( task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks) else: raise ValueError("Invalid policy: " + search_policy) # Tune tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, num_measures_per_round=2, early_stopping=1, runner=runner, measure_callbacks=[ auto_scheduler.RecordToFile(log_file), CustomMeasureCallback() ], ) task.tune(tuning_options=tuning_options, search_policy=search_policy) # Compile with the best schedule sch, args = task.apply_best(log_file) mod = tvm.build(sch, args, target) # Compile with naive schedule for correctness check sch, args = task.compute_dag.apply_steps_from_state( task.compute_dag.init_state) mod_ref = tvm.build(sch, args, "llvm") ctx = tvm.device(str(target), 0) np_arrays = [ np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args ] tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays] mod(*tvm_arrays) actual = [x.numpy() for x in tvm_arrays] tvm_arrays = [tvm.nd.array(x) for x in np_arrays] mod_ref(*tvm_arrays) expected = [x.numpy() for x in tvm_arrays] for x, y in zip(actual, expected): tvm.testing.assert_allclose(x, y, rtol=1e-5)
from tvm import te A = te.placeholder((8, ), dtype="float32", name="A") B = te.compute((8, ), lambda *i: A(*i) + 1.0, name="B") func = te.create_prim_func([A, B]) ir_module_from_te = IRModule({"main": func}) print(ir_module_from_te.script()) ################################################################################################ # Build and Run an IRModule # ------------------------- # We can build the IRModule into a runnable module with specific target backends. # mod = tvm.build(ir_module, target="llvm") # The module for CPU backends. print(type(mod)) ################################################################################################ # Prepare the input array and output array, then run the module. # a = tvm.nd.array(np.arange(8).astype("float32")) b = tvm.nd.array(np.zeros((8, )).astype("float32")) mod(a, b) print(a) print(b) ################################################################################################ # Transform an IRModule # ---------------------
task = tvm.auto_scheduler.create_task(conv_fwd, (N, CI, H, W, CO, ksize, stride, padding, "float32"), target) ### search measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( num_measure_trials=num_search_trails, runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, ) sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) del measure_ctx ### load history # inp, res = auto_scheduler.load_best(log_file, task.workload_key) # sch, args = task.compute_dag.apply_steps_from_state(inp.state) # build func ctx = tvm.gpu() func = tvm.build(sch, args, target, name=func_name) # save result obj_fname = func_name + ".o" ptx_fname = func_name + ".ptx" func.save(obj_fname) func.imported_modules[0].save(ptx_fname) time_end = time.time() print("IterTime: ", (time_end - time_begin)) exit()
def run_pooling(env, remote, wl, target, check_correctness=True, print_ir=False, samples=10): # Workload assertions assert wl.hpad == wl.wpad pool_type = 'max' # Perform packing only if we are targeting the accelerator if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" #pooling_fcompute = topi.arm_cpu.pooling_nchw_spatial_pack pooling_fcompute = topi.nn.pool #pooling_fschedule = topi.arm_cpu.schedule_pooling_nchw_spatial_pack pooling_fschedule = topi.generic.schedule_pool elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) pooling_fcompute = vta.top.pooling_packed pooling_fschedule = vta.top.schedule_pooling_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) # output shape b_shape = (wl.batch, wl.out_filter, 1, 1) if data_pack: data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter // env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) else: data_shape = a_shape kernel_shape = w_shape bias_shape = b_shape data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule with target: res = topi.nn.pool(data, kernel=[3, 3], stride=[2, 2], padding=padding, pool_type=pool_type, layout="NCHW") # res = topi.right_shift(res, 8) # res = topi.add(res, bias) # res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) # res = topi.cast(res, env.out_dtype) # Derive base schedule s = pooling_fschedule([res], layout) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) # get output shape _, oc, oh, ow = get_const_tuple(res.shape) # Derive number of ops fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter # @memoize("vta.tests.test_benchmark_topi.pooling.verify_nchw") def get_ref_data(): # derive min max for act, wgt, and bias types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2) a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) pad_shape = (wl.batch, wl.in_filter, wl.height + wl.hpad * 2, wl.width + wl.wpad * 2) pad_np = np.zeros(shape=pad_shape).astype(data.dtype) no_zero = (range(wl.batch), range(wl.in_filter), (range(wl.hpad, wl.height + wl.hpad)), (range(wl.wpad, wl.width + wl.wpad))) pad_np[np.ix_(*no_zero)] = a_np b_shape = (wl.batch, oc, oh, ow) b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) kw, kh = 3, 3 sw, sh = 2, 2 for i in range(oh): for j in range(ow): b_np[:, :, i, j] = np.max(pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw], axis=(2, 3)) b_np = np.maximum(b_np, 0.0) return a_np, pad_np, b_np # Data in original format data_np, _, res_ref = get_ref_data() # Build if "vta" in target.keys: mod = vta.build(s, [data, res], target=target, target_host=env.target_host, name="pooling") else: mod = tvm.build(s, [data, res], target=target, target_host=env.target_host, name="pooling") temp = util.tempdir() mod.save(temp.relpath("pooling.o")) remote.upload(temp.relpath("pooling.o")) f = remote.load_module("pooling.o") ctx = remote.context(str(target)) res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("pooling", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} cost = None if env.TARGET in ["sim", "tsim"]: # Check if we're in local RPC mode (allows us to rebuild the # runtime on the fly when varying the VTA designs) local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: if env.TARGET == "sim": remote.get_function("vta.simulator.profiler_clear")() else: remote.get_function("vta.tsim.profiler_clear")() cost = time_f(data_arr, res_arr) if env.TARGET == "sim": stats = json.loads( remote.get_function("vta.simulator.profiler_status")()) else: stats = json.loads( remote.get_function("vta.tsim.profiler_status")()) else: simulator.clear_stats() cost = time_f(data_arr, res_arr) stats = simulator.stats() else: cost = time_f(data_arr, res_arr) print(cost) # Check correctness correct = False if check_correctness: res_orig = res_arr.asnumpy() res_orig = np.maximum(res_orig, 0.0) res_ref = res_ref.astype(env.out_dtype) res_orig = res_orig.astype(env.out_dtype) correct = np.allclose(res_orig, res_ref) gops = (num_ops / cost.mean) / float(10**9) status = "PASSED" if correct else "FAILED" if "arm_cpu" in target.keys: device = "CPU" elif "vta" in target.keys: device = "VTA" print("%s POOLING TEST %s: Time cost = %g sec/op" % (device, status, cost.mean)) return correct, cost, stats
######################################################################### # Finally we can inspect the best config from log file, check correctness, # and measure running time. # inspect the best config dispatch_context = autotvm.apply_history_best("conv2d.log") best_config = dispatch_context.query(task.target, task.workload) print("\nBest config:") print(best_config) # apply history best from log file with autotvm.apply_history_best("conv2d.log"): with tvm.target.Target("cuda"): s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding) func = tvm.build(s, arg_bufs) # check correctness a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) c_np = conv2d_nchw_python(a_np, w_np, strides, padding) dev = tvm.cuda() a_tvm = tvm.nd.array(a_np, device=dev) w_tvm = tvm.nd.array(w_np, device=dev) c_tvm = tvm.nd.empty(c_np.shape, device=dev) func(a_tvm, w_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # Measure time cost of kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) print("Stride = (%d, %d)" % (stride_h, stride_w)) print("padding = %s\n" % padding) print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape))) print("average time cost of 1000 runs (depthwise_conv2d) = %g sec" % tcost_1) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g sec" % tcost_2) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g sec" % tcost_3) # correctness depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape)) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[ c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print("success")
# The most straight-forward way to call target specific function is via # extern function call construct in tvm. # In th following example, we use :any:`tvm.call_pure_extern` to call # :code:`__expf` function, which is only available under CUDA. # n = tvm.var("n") A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda i: tvm.call_pure_extern("float32", "__expf", A[i]), name="B") s = tvm.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(bx, tvm.thread_axis("blockIdx.x")) s[B].bind(tx, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "cuda", name="myexp") print(f.imported_modules[0].get_source()) ###################################################################### # Unified Intrinsic Call # ---------------------- # The above code verifies that direct external call can be used to # call into device specific functions. # However, the above way only works for CUDA target with float type. # Ideally, we want to write same code for any device and any data type. # # TVM intrinsic provides the user a mechanism to achieve this, and this # is the recommended way to solve the problem. # The following code use tvm.exp instead, which create an intrinsic call # :any:`tvm.exp` to do the exponential. #
""" def floormod(a,b): return a - floor(a / b) * b DIM = 1000 HDIM = 500 shape = (DIM,DIM) c_tvm = tvm.nd.array(np.zeros(shape=shape,dtype='int32')) c_np = np.zeros(shape) c = te.compute(shape,lambda i,j: tir.floormod(HDIM - i,j + 1) ) d = te.compute(shape,lambda i,j: tir.floormod(HDIM - i,-(j + 1))) s = te.create_schedule([c.op]) s2 = te.create_schedule([d.op]) f = tvm.build(s,[c]) f2 = tvm.build(s2,[d]) f(c_tvm) out = c_tvm.asnumpy() for i in range(DIM): for j in range(DIM): res = out[i][j] res2 = floormod(HDIM - i, j + 1) if res != res2: print(i,j,res,res2) assert False print("Done half") c_tvm = tvm.nd.array(np.zeros(shape=shape,dtype='int32')) f2(c_tvm)
def _build(funcs, target): return tvm.build(funcs, target=target)
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a')) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b')) s[Conv].tensorize(nnc, intrin_wmma_store_matrix()) s[ConvF].tensorize(nnf, intrin_wmma_gemm()) print(tvm.lower(s, [A, W, Conv], simple_mode=True)) ############################################################################### # Generate CUDA Kernel # -------------------- # Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution. # Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not # be able to run on our build server ctx = tvm.gpu(0) if nvcc.have_tensorcore(ctx.compute_version): with tvm.build_config(auto_unroll_max_step=16): func = tvm.build(s, [A, W, Conv], 'cuda') a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) evaluator = func.time_evaluator(func.entry_name, ctx, number=10) print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3)) ############################################################################### # Summary # This tutorial demonstrates how TVM scheduling primitives can be used to # call TensorCores on specific GPUs.
def test_gemm_gpu(N, times, bn, num_block, num_thread): assert bn <= N assert num_thread * num_thread * 16 <= N assert num_block * num_block * 2 <= N A = te.placeholder((N, N), name="A") B = te.placeholder((N, N), name="Btmp") k = te.reduce_axis((0, N), name="k") packedB = te.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name="B") C = te.compute( (N, N), lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), name="C" ) s = te.create_schedule(C.op) CC = s.cache_write(C, "local") block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_xz = te.thread_axis((0, 2), "vthread", name="vx") thread_yz = te.thread_axis((0, 2), "vthread", name="vy") pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread) pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread) s[packedB].bind(pby, thread_y) s[packedB].bind(pbx, thread_x) pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8) s[packedB].vectorize(pbk) by, yi = s[C].split(C.op.axis[0], nparts=num_block) bx, xi = s[C].split(C.op.axis[1], nparts=num_thread) s[C].bind(by, block_y) s[C].bind(bx, thread_y) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_block) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, block_x) s[C].bind(tx, thread_x) xyi, xxi = s[C].split(xi, factor=8) s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi) s[C].vectorize(xxi) s[CC].compute_at(s[C], yi) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) xo, xi = s[CC].split(xo, factor=8) s[CC].vectorize(xi) ko, ki = s[CC].split(k, factor=2) s[CC].unroll(ki) print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], tvm.target.Target("opencl", host=target), name="gemm_gpu") temp = utils.tempdir() path_dso = temp.relpath("gemm_gpu.so") f.export_library(path_dso, ndk.create_shared) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) dev = remote.cl(0) remote.upload(path_dso) f = remote.load_module("gemm_gpu.so") evaluate(f, dev, N, times)
def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None): def tvm_val_2_py_val(val): val = tvm.tir.stmt_functor.substitute(val, var_dict) val = tvm.arith.Analyzer().simplify(val) assert isinstance(val, (tvm.tir.IntImm, )) return val.value ctx = tvm.context(target, 0) op = None if sch is None: outs = func(*tuple( tvm.runtime.convert(i) if isinstance(i, list) else i for i in args)) op = outs[0].op if isinstance(outs, list) else outs.op sch = te.create_schedule(op) else: assert outs is not None assert isinstance(outs, list) op = outs[0].op emu_args = [] nd_args = [] for i in args: if isinstance(i, te.tensor.Tensor): shape = [tvm_val_2_py_val(j) for j in i.shape] emu_args.append(numpy.random.randn(*shape).astype(i.dtype)) nd_args.append(tvm.nd.array(emu_args[-1], ctx)) elif isinstance(i, tvm.tir.Var): emu_args.append(tvm_val_2_py_val(i)) nd_args.append(emu_args[-1]) else: assert isinstance(i, list) emu_args.append(numpy.array(i)) compile_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] + \ (outs if isinstance(outs, list) else [outs]) module = tvm.build(sch, compile_args, target=target) assert module out_tensors = [] for i in range(op.num_outputs): output = op.output(i) shape = [tvm_val_2_py_val(j) for j in output.shape] nd_args.append( tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx)) out_tensors.append(nd_args[-1]) ref_data = func(*emu_args) if isinstance(ref_data, numpy.ndarray): ref_data = [ref_data] module(*nd_args) for nd, np in zip(out_tensors, ref_data): tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5) module_args = [ i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var)) ] module_outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs h_module = te.hybrid.build(sch, module_args, module_outs) return h_module, module_args, module_outs
def search_common( workload=matmul_auto_scheduler_test, target="llvm", search_policy="empty", seed=random.randint(1, 1 << 30), runner="local", cost_model=auto_scheduler.RandomModel(), num_measure_trials=2, init_search_callbacks=None, ): print("Test %s schedule search with the default search policy" % (target)) random.seed(seed) N = 128 workload_key = auto_scheduler.make_workload_key(workload, (N, N, N)) dag = auto_scheduler.ComputeDAG(workload_key) target = tvm.target.Target(target) task = auto_scheduler.SearchTask(dag, workload_key, target) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name init_search_callbacks = init_search_callbacks or [] init_search_callbacks.append( auto_scheduler.PreloadMeasuredStates(log_file)) if search_policy == "empty": search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == "sketch": search_policy = auto_scheduler.SketchPolicy( task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks) tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, runner=runner, verbose=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options) print("*" * 80) print(target) print("*" * 80) inp, res = auto_scheduler.load_best(log_file, workload_key, target) print("==== Python Code ====") print(dag.print_python_code_from_state(inp.state)) try: print("==== Lowered Stmt ====") print(tvm.lower(sch, args, simple_mode=True)) mod = tvm.build(sch, args, target) ctx = tvm.context(str(target), 0) dtype = dag.tensors[0].dtype a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) mod(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) print("==== Verification passed ====") except Exception: raise Exception("Error encountered with seed: %d" % (seed)) print()
def test_correctness_layout_rewrite_rewrite_for_preTransformed(): N = 128 target = tvm.target.Target("llvm") task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target) dag = task.compute_dag with tempfile.NamedTemporaryFile() as fp: log_file = fp.name search_policy = auto_scheduler.SketchPolicy(task) measure_ctx = auto_scheduler.LocalRPCMeasureContext() tuning_options = auto_scheduler.TuningOptions( num_measure_trials=2, runner=measure_ctx.runner, verbose=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) auto_scheduler.auto_schedule(task, search_policy, tuning_options) inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target) s, bufs = dag.apply_steps_from_state( inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG. RewriteForPreTransformed) s_ref, bufs_ref = dag.apply_steps_from_state(inp.state) np_args = [ np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs ] np_args_ref = [np.array(x) for x in np_args] weight = np_args_ref[1] # infer shape for the rewritten layout if len(weight.shape) >= 6: # For cpu tile structure SSRSRS base = len(weight.shape) - 6 red_dim = weight.shape[2 + base] * weight.shape[4 + base] out_dim = weight.shape[3 + base] * weight.shape[5 + base] for i in range(base + 2): out_dim *= weight.shape[i] new_order = ([ 2 + base, 4 + base, ] + list(range(base + 2)) + [ 3 + base, 5 + base, ]) np_args_ref[1] = np_args_ref[1].transpose(new_order) np_args_ref[1] = np_args_ref[1].reshape((red_dim, out_dim)) func = tvm.build(s, bufs, target=target) func_ref = tvm.build(s_ref, bufs_ref, target=target) ctx = tvm.context(str(target)) ctx_ref = tvm.cpu() args = [tvm.nd.array(x, ctx=ctx) for x in np_args] args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref] ctx.sync() func(*args) func_ref(*args_ref) ctx.sync() tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), rtol=1e-4) tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), rtol=1e-4) del measure_ctx
# Schedule for W's shared memory load yi, xi, ci, fi = s[WW].op.axis ty, ci = s[WW].split(ci, nparts=num_thread) tx, fi = s[WW].split(fi, nparts=num_thread) _, fi = s[WW].split(fi, factor=4) s[WW].reorder(ty, tx, yi, xi, ci, fi) s[WW].bind(ty, thread_y) s[WW].bind(tx, thread_x) s[WW].vectorize(fi) # vectorize memory load ############################################################################### # Generate CUDA Kernel # -------------------- # # Finally we use TVM to generate and compile the CUDA kernel, and evaluate the # latency of convolution. # func = tvm.build(s, [A, W, B], target='cuda', target_host='llvm') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype) w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx) func(a, w, b) evaluator = func.time_evaluator(func.entry_name, ctx, number=1) print('Convolution: %f ms' % (evaluator(a, w, b).mean * 1e3))
yi, xi, ci, fi = s[WW].op.axis ty, ci = s[WW].split(ci, nparts=num_thread) tx, fi = s[WW].split(fi, nparts=num_thread) _, fi = s[WW].split(fi, factor=4) s[WW].reorder(ty, tx, yi, xi, ci, fi) s[WW].bind(ty, thread_y) s[WW].bind(tx, thread_x) s[WW].vectorize(fi) # vectorize memory load ############################################################################### # Generate CUDA Kernel # -------------------- # # Finally we use TVM to generate and compile the CUDA kernel, and evaluate the # latency of convolution. # func = tvm.build(s, [A, W, B], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype) w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array( np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx) func(a, w, b) evaluator = func.time_evaluator(func.entry_name, ctx, number=1) print('Convolution: %f ms' % (evaluator(a, w, b).mean * 1e3))