def test_min_repeat_ms():
    tmp = tempdir()
    filename = tmp.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """one call lasts for 100 ms and writes one character to a file"""
        time.sleep(0.1)
        with open(filename, "a") as fout:
            fout.write("c")

    X = tvm.compute((), lambda: tvm.call_packed("my_debug", filename))
    s = tvm.create_schedule(X.op)
    func = tvm.build(s, [X])

    x = tvm.nd.empty((), dtype="int32")
    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1)
    ftimer(x)

    with open(filename, "r") as fin:
        ct = len(fin.readline())
    assert ct == 2

    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
                                 number=1, repeat=1, min_repeat_ms=1000)
    ftimer(x)

    # make sure we get more than 10 calls
    with open(filename, "r") as fin:
        ct = len(fin.readline())
    assert ct > 10 + 2
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape={'data': data_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': data_shape},
                params=params, dtype=dtype)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)

    # verify graph is there
    GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
    assert len(os.listdir(directory)) == 1

    # verify the file name is proper
    assert os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME))

    mod.run()
    # verify the tensors are dumped
    assert len(os.listdir(directory)) > 1

    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)

    mod.exit()
    # verify dump root deleted after cleanup
    assert not os.path.exists(directory)
def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar):
    input_dtype = 'uint32'
    out_dtype = 'int16'

    with tvm.target.create('llvm'):
        A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A')
        B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B')
        C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits,
                                    out_dtype=out_dtype, unipolar=unipolar)
        s = topi.generic.schedule_bitserial_dense([C])

    a_shape = get_const_tuple(A.shape)
    b_shape = get_const_tuple(B.shape)

    @memoize("topi.tests.test_topi_bitserial_dense")
    def get_ref_data():
        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
        b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype)
        if unipolar:
            # Map the weights to +1/-1 for the unipolar reference result.
            b_ = np.copy(b_np).astype(out_dtype)
            for x in np.nditer(b_, op_flags=['readwrite']):
                x[...] = 1 if x == 1 else -1
            c_np = np.dot(a_np, b_.T)
        else:
            c_np = np.dot(a_np, b_np.T)
        return a_np, b_np, c_np

    a_np, b_np, c_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func = tvm.build(s, [A, B, C], "llvm")
    func(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def check_c():
    if not tvm.module.enabled("llvm"):
        return
    # Specifically allow offset to test codepath when offset is available
    Ab = tvm.decl_buffer(
        A.shape, A.dtype,
        elem_offset=tvm.var('Aoffset'),
        offset_factor=8,
        name='A')
    binds = {A: Ab}
    # BUILD and invoke the kernel.
    f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
    mhost = tvm.codegen.build_module(fsplits[0], "c")
    temp = util.tempdir()
    path_dso = temp.relpath("temp.so")
    mhost.export_library(path_dso)
    m = tvm.module.load(path_dso)
    fadd = m["fadd_pipeline"]
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
    in_height = in_width = in_size

    with tvm.target.rasp():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
        B = topi.nn.conv2d(A, W, stride, padding)
        s = topi.generic.schedule_conv2d_nchw([B])

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    func = tvm.build(s, [A, W, B], "llvm")
    func(a, w, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True): if not tvm.module.enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True): print("skip because extern function is not available") return if not nnpack.is_available(): return ctx = tvm.cpu(0) transformed_kernel = nnpack.convolution_inference_weight_transform( kernel, algorithm=algorithm) output = nnpack.convolution_inference_without_weight_transform( data, transformed_kernel, bias if with_bias else None, [PAD, PAD, PAD, PAD], [STRIDE, STRIDE], algorithm=algorithm) s = tvm.create_schedule(output.op) f = tvm.build(s, [data, kernel, bias, output], target) na = np.random.uniform(size=dshape).astype(data.dtype) nb = np.random.uniform(size=kshape).astype(kernel.dtype) nc = np.random.uniform(size=bshape).astype(bias.dtype) if with_bias else np.zeros(bshape, dtype=bias.dtype) ta = tvm.nd.array(na, ctx) tb = tvm.nd.array(nb, ctx) tc = tvm.nd.array(nc, ctx) td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1) tvm.testing.assert_allclose( td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
def test_log_pow_llvm():
    # graph
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.power(tvm.log(A(*i)), 2.0), name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    bx, tx = s[B].split(B.op.axis[0], factor=32)
    # one line to build the function.
    if not tvm.module.enabled("llvm"):
        return

    flog = tvm.build(s, [A, B], "llvm", name="mylog")
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = 1028
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
    repeat = 10
    ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat)
    res = ftimer(a, b)
    assert len(res.results) == repeat
    np.testing.assert_allclose(
        b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5)
def test_nms():
    dshape = (1, 5, 6)
    data = sym.Variable("data")
    valid_count = sym.Variable("valid_count", dtype="int32")
    nms_threshold = 0.7
    force_suppress = True
    nms_topk = 2
    out = sym.nms(data=data, valid_count=valid_count, nms_threshold=nms_threshold,
                  force_suppress=force_suppress, nms_topk=nms_topk)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
                           [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79],
                           [-1, -1, -1, -1, -1, -1]]])

    target = "llvm"
    ctx = tvm.cpu()
    graph, lib, _ = nnvm.compiler.build(out, target,
                                        {"data": dshape,
                                         "valid_count": (dshape[0],)},
                                        dtype={"data": "float32",
                                               "valid_count": "int32"})
    m = graph_runtime.create(graph, lib, ctx)
    m.set_input(**{"data": np_data, "valid_count": np_valid_count})
    m.run()
    out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
    tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
def test_multibox_transform_loc():
    batch_size = 1
    num_anchors = 3
    num_classes = 3
    cls_prob = sym.Variable("cls_prob")
    loc_preds = sym.Variable("loc_preds")
    anchors = sym.Variable("anchors")
    transform_loc_data, valid_count = sym.multibox_transform_loc(
        cls_prob=cls_prob, loc_pred=loc_preds, anchor=anchors)
    out = sym.nms(data=transform_loc_data, valid_count=valid_count)

    # Manually create test case
    np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]])
    np_loc_preds = np.array([[0.1, -0.2, 0.3, 0.2, 0.2, 0.4,
                              0.5, -0.3, 0.7, -0.2, -0.4, -0.8]])
    np_anchors = np.array([[[-0.1, -0.1, 0.1, 0.1],
                            [-0.2, -0.2, 0.2, 0.2],
                            [1.2, 1.2, 1.5, 1.5]]])

    expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
                                 [0, 0.44999999, 1, 1, 1, 1],
                                 [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])

    target = "llvm"
    dtype = "float32"
    ctx = tvm.cpu()
    graph, lib, _ = nnvm.compiler.build(out, target,
                                        {"cls_prob": (batch_size, num_anchors, num_classes),
                                         "loc_preds": (batch_size, num_anchors * 4),
                                         "anchors": (1, num_anchors, 4)})
    m = graph_runtime.create(graph, lib, ctx)
    m.set_input(**{"cls_prob": np_cls_prob.astype(dtype),
                   "loc_preds": np_loc_preds.astype(dtype),
                   "anchors": np_anchors.astype(dtype)})
    m.run()
    out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype))
    tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1

    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel),
                             name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(
                           data[n, ic, oh * HSTR + kh, ow * WSTR + kw] *
                           kernel[kh, kw, ic, oc],
                           axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print(lowered_func.body)

    ctx = tvm.cpu(0)
    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
        size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
        size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width),
                            tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)
def test_dilate():
    target = 'llvm'
    ctx = tvm.cpu(0)

    def _test_dilate(input_size, strides):
        Input = tvm.placeholder((input_size))
        Output = topi.nn.dilate(Input, strides)
        schedule = tvm.create_schedule(Output.op)
        input_np = np.random.uniform(size=input_size).astype(Input.dtype)
        output_np = topi.testing.dilate_python(input_np, strides)
        input_tvm = tvm.nd.array(input_np, ctx=ctx)
        output_size = topi.util.get_const_tuple(Output.shape)
        output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
        f = tvm.build(schedule, [Input, Output], target)
        f(input_tvm, output_tvm)
        tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)

    _test_dilate((32,), (2,))
    _test_dilate((32, 32), (2, 2))
    _test_dilate((1, 3, 32, 32), (1, 1, 1, 1))
    _test_dilate((1, 3, 32, 32), (2, 2, 2, 2))
    _test_dilate((1, 32, 32, 3, 3), (1, 1, 1, 1, 1))
    _test_dilate((1, 32, 32, 3, 3), (2, 2, 2, 2, 2))
    _test_dilate((1, 32, 32, 32, 3, 3), (1, 1, 1, 2, 2, 2))
    _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1))
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(),
                                np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def ctx_list():
    """Get context list for testcases"""
    device_list = os.environ.get("NNVM_TEST_TARGETS", "")
    device_list = (device_list.split(",") if device_list
                   else ["llvm", "cuda"])
    device_list = set(device_list)
    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
    return [x for x in res if x[1].exist and x[0] in device_list]
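A minimal usage sketch for the helper above; the `run_on_all_targets` wrapper and its `check` callable are hypothetical, shown only to illustrate how a test consumes the (target, context) pairs:

def run_on_all_targets(check):
    # `check` is a hypothetical callable taking (target, ctx).
    for target, ctx in ctx_list():
        print("Running on target %s" % target)
        check(target, ctx)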
def build_and_run(sym, params, data, out_shape):
    ctx = tvm.cpu(0)
    graph, lib, params = nnvm.compiler.build(sym, "llvm", shape={"data": data.shape},
                                             params=params)
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape))
    return out.asnumpy()
def test_ctx():
    def test_ctx_func(ctx):
        assert tvm.gpu(7) == ctx
        return tvm.cpu(0)

    x = test_ctx_func(tvm.gpu(7))
    assert x == tvm.cpu(0)
    x = tvm.opencl(10)
    x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
    assert x == tvm.opencl(10)
def verify(graph, lib):
    m = graph_runtime.create(graph, lib, tvm.cpu(0))
    # set inputs and run
    na = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    nb = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    m.run(x=na, y=nb)
    # get outputs
    out = m.get_output(0, tvm.nd.empty(shape, dtype))
    tvm.testing.assert_allclose(
        out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
def check_llvm():
    if not tvm.module.enabled("llvm"):
        return
    f = tvm.build(s, [A, B], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
    f(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=a)
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    f = tvm.build(s, [A, B], device, name="clip")
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)

    # verify graph is there
    GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
    assert len(os.listdir(directory)) == 1

    # verify the file name is proper
    assert os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME))

    mod.run()
    # verify the tensors are dumped
    assert len(os.listdir(directory)) > 1

    CHROME_TRACE_FILE_NAME = '_tvmdbg_execution_trace.json'
    assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))

    with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
        trace = json.load(f)
    assert trace["displayTimeUnit"] == "ns"
    events = trace["traceEvents"]
    assert len(events) == 4
    assert all(event["ph"] in ('B', 'E') for event in events)
    assert all(event["pid"] == 1 for event in events)
    assert all(event["tid"] == 1 for event in events)
    assert all(event["name"] == 'x' for event in events[:2])
    assert all(event["name"] == 'add' for event in events[2:])
    assert events[0]["ts"] == 0
    assert events[0]["ph"] == 'B'

    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)

    # test individual run
    mod.run_individual(20, 2, 1)

    mod.exit()
    # verify dump root deleted after cleanup
    assert not os.path.exists(directory)
def test_num_outputs():
    x = sym.Variable('x')
    z = sym.split(x, indices_or_sections=5, axis=1)
    shape = (10, 10)
    dtype = tvm.float32
    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    params = {"x": nx}
    graph, lib, params = nnvm.compiler.build(
        z, "llvm", shape={"x": nx.shape}, params=params)
    m = graph_runtime.create(graph, lib, tvm.cpu(0))
    assert m.get_num_outputs() == 5
def enabled_ctx_list():
    ctx_list = [('cpu', tvm.cpu(0)),
                ('gpu', tvm.gpu(0)),
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
                ('vulkan', tvm.vulkan(0)),
                ('vpi', tvm.vpi(0))]
    for k, v in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
    return ctx_list
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    target = topi.cpp.TEST_create_target(device)
    s = topi.cpp.generic.default_schedule(target, [B], False)
    ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    f = tvm.build(s, [A, B], device, name="clip")
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify_leaky_relu(m, alpha):
    A = tvm.placeholder((m,), name='A')
    B = topi.nn.leaky_relu(A, alpha)
    s = tvm.create_schedule([B.op])

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify(target):
    if not tvm.module.enabled(target):
        print("Target %s is not enabled" % target)
        return
    f = tvm.codegen.build_module(fapi, target)
    # verify
    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)
def test_nhwc():
    data_shape = (1, 3, 224, 224)
    out_channel = 8
    nchw_sym = get_sym("NCHW", "OIHW", out_channel)
    nhwc_sym = get_sym("NHWC", "HWIO", out_channel)
    conv_weight = np.random.uniform(-1, 1, (out_channel, 3, 3, 3)).astype(np.float32)
    conv_bias = np.random.uniform(-1, 1, (out_channel)).astype(np.float32)
    nchw_params = {
        "conv2d0_weight": tvm.nd.array(conv_weight, ctx=tvm.cpu(0)),
        "conv2d0_bias": tvm.nd.array(conv_bias, ctx=tvm.cpu(0))
    }
    nhwc_params = {
        "conv2d1_weight": tvm.nd.array(conv_weight.transpose(2, 3, 1, 0), ctx=tvm.cpu(0)),
        "conv2d1_bias": tvm.nd.array(conv_bias, ctx=tvm.cpu(0))
    }

    data = np.random.uniform(-1, 1, data_shape).astype(np.float32)
    oshape = (1, out_channel, 224, 224)
    oshape_nhwc = (1, 224, 224, out_channel)
    nchw_output = build_and_run(nchw_sym, nchw_params, data, oshape)
    nhwc_output = build_and_run(nhwc_sym, nhwc_params,
                                data.transpose(0, 2, 3, 1), oshape_nhwc)
    tvm.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2),
                                rtol=1e-5, atol=1e-5)
def check_llvm():
    if not tvm.module.enabled("llvm"):
        return
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    f(a, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), a.asnumpy() + 1 + 1)
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
from tvm.relay import Function, transform
from tvm.relay.testing import inception_v3
import pytest

cpu_scope = tvm.target.make_se_scope(tvm.cpu(), tvm.target.Target("llvm"))
metatable = {"SEScope": [cpu_scope]}
core = tvm.IRModule()
core.import_from_std("core.rly")


def optimize_and_check(before_program, after_program, passes):
    if isinstance(before_program, str):
        before_program = tvm.parser.parse(before_program)
    if isinstance(after_program, str):
        after_program = tvm.parser.parse(after_program)
    if not isinstance(passes, list):
        passes = [passes]
    optimize = tvm.transform.Sequential(passes)
    optimized_program = optimize(before_program)
    print("Actual:")
def test_extern_dnnl():
    if not tvm.get_global_func("relay.ext.dnnl", True):
        print("skip because DNNL codegen is not available")
        return

    dtype = 'float32'
    ishape = (1, 32, 14, 14)
    w1shape = (32, 1, 3, 3)

    def expected():
        data0 = relay.var("data", shape=(ishape), dtype=dtype)
        input0 = relay.var("input0", shape=(w1shape), dtype=dtype)
        input1 = relay.var("input1", shape=(w1shape), dtype=dtype)
        depthwise_conv2d_1 = relay.nn.conv2d(data0, input0,
                                             kernel_size=(3, 3),
                                             padding=(1, 1),
                                             groups=32)
        depthwise_conv2d_2 = relay.nn.conv2d(depthwise_conv2d_1, input1,
                                             kernel_size=(3, 3),
                                             padding=(1, 1),
                                             groups=32)
        out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)

        func = relay.Function([data0, input0, input1], out)
        func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
        func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
        func = func.with_attr("Compiler", "dnnl")
        func = func.with_attr("global_symbol", "dnnl_0")
        glb_var = relay.GlobalVar("dnnl_0")
        mod = tvm.IRModule()
        mod[glb_var] = func

        data = relay.var("data", shape=(ishape), dtype=dtype)
        weight = relay.var("input", shape=(w1shape), dtype=dtype)
        main_f = relay.Function([data, weight], glb_var(data, weight, weight))
        mod["main"] = main_f

        return mod

    def get_func():
        data = relay.var("data", shape=(ishape), dtype=dtype)
        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
        depthwise_conv2d_1 = relay.nn.conv2d(data, weight1,
                                             kernel_size=(3, 3),
                                             padding=(1, 1),
                                             groups=32)
        depthwise_conv2d_2 = relay.nn.conv2d(depthwise_conv2d_1, weight1,
                                             kernel_size=(3, 3),
                                             padding=(1, 1),
                                             groups=32)
        out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)

        return relay.Function([data, weight1], out)

    mod = tvm.IRModule()
    mod["main"] = WholeGraphAnnotator("dnnl").visit(get_func())
    mod = transform.PartitionGraph()(mod)

    assert tvm.ir.structural_equal(mod, expected(), map_free_vars=True)

    ref_mod = tvm.IRModule()
    ref_mod["main"] = get_func()

    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
    w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)

    ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu())
    ref_res = ref_ex.evaluate()(i_data, w1_data)

    check_result(mod, {"data": i_data, "weight1": w1_data},
                 (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5)
shape = (oc, ic, h, w)
oc_bn, ic_bn = 3, 2
# (OC, IC, h, w, ic, oc)
out_shape = (oc // oc_bn, ic // ic_bn, h, w, ic_bn, oc_bn)

x = sym.Variable("x")
# y = sym.Variable("y")
# z = sym.elemwise_add(x, sym.sqrt(y))
# z = sym.reshape(x, shape=out_shape)
z = sym.reorder(x, oc_bn=3, ic_bn=2)

compute_graph = nnvm.graph.create(z)
print("-------compute graph-------")
print(compute_graph.ir())

deploy_graph, lib, params = nnvm.compiler.build(
    compute_graph, target="llvm", shape={"x": shape}, dtype="float32")

module = graph_runtime.create(deploy_graph, lib, tvm.cpu(0))
x_np = np.random.uniform(0, 255, size=shape).astype("float32")
print(x_np)
# y_np = np.array([[4, 4], [4, 4], [4, 4]]).astype("float32")

# set input to the graph module
module.set_input(x=x_np)  # , y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(out_shape))
print(out.asnumpy())
)

# Build the module for the x86 CPU target
target = "llvm"
with transform.PassContext(opt_level=3):
    lib = relay.build(mod, target, params=params)

######################################################################
# Execute on TVM
# --------------
import tvm
from tvm import te
from tvm.contrib import graph_runtime as runtime

# Create a runtime executor module
module = runtime.GraphModule(lib["default"](tvm.cpu()))

# Feed input data
module.set_input(input_tensor, tvm.nd.array(image_data))

# Run
module.run()

# Get output
tvm_output = module.get_output(0).asnumpy()

######################################################################
# Display results
# ---------------

# Load label file
graph_json_path = "../tvm_output_lib/mobilenet.json"
# with open(graph_json_path, 'w') as fo:
#     fo.write(graph)
param_path = "../tvm_output_lib/mobilenet.params"
# with open(param_path, 'wb') as fo:
#     fo.write(relay.save_param_dict(params))

# load the module back
loaded_json = open(graph_json_path).read()
loaded_lib = tvm.module.load(libpath)
loaded_params = bytearray(open(param_path, "rb").read())

ctx = tvm.cpu()
module = graph_runtime.create(loaded_json, loaded_lib, ctx)
module.load_params(loaded_params)
module.set_input("0", x)
module.run()
out_deploy = module.get_output(0).asnumpy()
print(out_deploy)
import tvm
import numpy as np

# The length of the vector being reduced over
N = 1024
nstep = 1 << 16

# The default tensor type in tvm
dtype = "float32"

target = "llvm -mcpu=skylake-avx512"
# target = "llvm"

# Randomly generated tensors for testing
a = tvm.nd.array(np.random.rand(N,).astype(dtype), tvm.cpu(0))
c = tvm.nd.array(np.zeros((N,), dtype=dtype), tvm.cpu(0))
# The expected answer: summing x[i] nstep times equals x[i] * nstep
answer = a.asnumpy() * nstep

x = tvm.placeholder((N,), name="x")
k = tvm.reduce_axis((0, nstep))
y = tvm.compute((N,), lambda i: tvm.sum(x[i], axis=k), name="y")

s = tvm.create_schedule(y.op)
i, = s[y].op.axis
io, ii = s[y].split(i, factor=32)
s[y].vectorize(ii)
print(tvm.lower(s, [x, y], simple_mode=True))

func = tvm.build(s, [x, y], target=target, name='adddd')
assert func
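The snippet above builds `func` and precomputes `answer` but never executes the kernel. A natural follow-up, sketched here under the same old-style TVM API and reusing the arrays already allocated, would run it and verify the result:

# Run the built function on the preallocated buffers (a is the input, c the
# output) and check against the expected answer computed above.
func(a, c)
np.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)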
def build_and_run(
    mod, inputs, outputs, params, ctx=tvm.cpu(), npu=True,
    expected_host_ops=0, npu_partitions=1
):
    lib = build(mod, params, npu, expected_host_ops, npu_partitions)
    return run(lib, inputs, outputs, npu)
# -------------------

###############################################################################
# Print the top-5 labels for MXNet and TVM inference.
# We check the labels because the requantize implementation differs between
# TFLite and Relay, which causes the final output numbers to mismatch; so we
# test accuracy via labels instead.
print("TVM Top-5 labels:", tvm_pred)
print("TFLite Top-5 labels:", tflite_pred)

##########################################################################
# Measure performance
# -------------------
# Here we give an example of how to measure performance of TVM compiled models.
n_repeat = 100  # should be bigger to make the measurement more accurate
dev = tvm.cpu(0)
ftimer = rt_mod.module.time_evaluator("run", dev, number=1, repeat=n_repeat)
prof_res = np.array(ftimer().results) * 1e3
print("Elapsed average ms:", np.mean(prof_res))

######################################################################
# .. note::
#
#   Unless the hardware has special support for fast 8 bit instructions, quantized models are
#   not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does
#   quantized convolution in 16 bit, even if the model itself is 8 bit.
#
#   For x86, the best performance can be achieved on CPUs with the AVX512 instruction set.
#   In this case, TVM utilizes the fastest available 8 bit instructions for the given target.
#   This includes support for the VNNI 8 bit dot product instruction (Cascade Lake or newer).
#   For an EC2 C5.12xlarge instance, TVM latency for this tutorial is ~2 ms.
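######################################################################
# As a sketch only (not part of the measured run above), pinning the LLVM
# target to a VNNI-capable CPU is how one would opt into the fast 8 bit dot
# product path on x86; the ``-mcpu`` value below is an assumption about the
# host machine, not something this tutorial prescribes.

# Hypothetical rebuild for a Cascade Lake host; `mod` and `params` are the
# quantized module and parameters produced earlier in this tutorial.
with tvm.transform.PassContext(opt_level=3):
    vnni_lib = relay.build(mod, target="llvm -mcpu=cascadelake", params=params)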
# Compilation
# print(tvm.lower(s, [x, y, x_expand, w, dot, gradient, new_w],
#                 simple_mode=True))
func = tvm.build(s, [x, y, w, new_w])
assert func
# print("------func code------")
# print(func.imported_modules[0].get_source())

# Generate x
np_x = np.random.uniform(size=(in_n, in_d), low=0, high=100)
golden_w = np.random.uniform(size=in_d, low=-1, high=1)
noise = np.random.uniform(size=in_n, low=-5, high=5)
np_y = np.array([golden_w.dot(np_x[i]) + noise[i] for i in range(in_n)])
in_y = tvm.nd.array(np_y.astype(y.dtype), tvm.cpu(0))
in_x = tvm.nd.array(np_x.astype(x.dtype), tvm.cpu(0))
in_w = tvm.nd.array(np.zeros(in_d + 1, dtype=w.dtype), tvm.cpu(0))

# Evaluation
display_animation = display_on
if display_on and in_d != 1:
    display_animation = False
    print("WARNING: Will only display the MSE trend "
          "due to high dimensional data points.")
if display_on:
    import matplotlib.pyplot as plt
    from matplotlib import animation
    fig = plt.figure()
    ax = plt.axes()
def build_and_run(s, Tensor, control_f, shape, time_count,
                  timeout_build, timeout_cal, count=20,
                  device_id=0, tar="llvm"):
    """Build the schedule and record the running time.

    Args:
    -----------------------------
    s             : schedule.Schedule obtained from the student's auto_schedule
    Tensor        : (list) the input tensors and the output tensor
    control_f     : the torch reference function
    shape         : arg for control_f
    time_count    : queue used to record the running time
    timeout_build : time limit for building
    timeout_cal   : time limit for calculation
    count         : the number of rounds to repeat the test
    device_id     : the id of the CPU
    -----------------------------
    Returns:
    -----------------------------
    [tvm_time, torch_time]: [float, float]
        the total time of running the scheduled tvm calculation and
        the total time of running the torch calculation
    -----------------------------
    """
    # Create ctx.
    try:
        ctx = tvm.cpu(device_id)
    except:
        print("Cannot find device !!!")
        time_count.put([-1, -1])
        return -1

    # Build the function from s and Tensor.
    try:
        timelimit = ceil(timeout_build)
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(timelimit)
        begin = time.time()
        f = tvm.build(s, Tensor, name="my_op")
        timepass = time.time() - begin
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
        if timepass > timeout_build:
            print("Timeout in building!")
            return -1
    except:
        traceback.print_exc()
        print("Cannot build successfully !!!")
        time_count.put([-1, -1])
        return -1

    # Separate the output tensor from the inputs.
    try:
        Output_tensor = Tensor[-1]
        del Tensor[-1]
    except:
        print("The input is not correct !!!")
        time_count.put([-1, -1])
        return -1

    # Craft input data.
    try:
        Input_tvm_batch = []
        Input_torch_batch = []
        for it in range(0, count):
            Input_tvm_data = []
            Input_torch_data = []
            for i in Tensor:
                data = np.random.random(
                    [int(j) for j in i.shape]).astype(np.float32) * 100
                tvm_data = tvm.nd.array(data, ctx)
                torch_data = torch.tensor(data)
                Input_tvm_data.append(tvm_data)
                Input_torch_data.append(torch_data)
            Output_holder = tvm.nd.array(
                np.zeros([int(j) for j in Output_tensor.shape],
                         dtype=Output_tensor.dtype), ctx)
            Input_tvm_batch.append(Input_tvm_data + [Output_holder])
            Input_torch_batch.append(Input_torch_data)
    except:
        traceback.print_exc()
        print("Cannot create input data !!!")
        time_count.put([-1, -1])
        return -1

    # Time the scheduled tvm calculation.
    try:
        f(*Input_tvm_batch[0])  # warm up
        timelimit = ceil(timeout_cal)
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(timelimit)
        begin = time.time()
        for i in range(0, count):
            f(*Input_tvm_batch[i])
        tvm_time = time.time() - begin
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
        if tvm_time > timeout_cal:
            print("Results of shape", shape, "Timeout!")
            tvm_time = -1
        else:
            tvm_time /= count
    except TimeoutError:
        tvm_time = -1
        print("Results of shape", shape, "Timeout!")
    except:
        tvm_time = -1
        print("Results of shape", shape, "\n| The calculation is not correct !!!")

    # Time the torch reference calculation.
    try:
        control_f(*(Input_torch_batch[0] + [shape]))  # warm up
        begin = time.time()
        for i in range(0, count):
            control_f(*(Input_torch_batch[i] + [shape]))
        torch_time = time.time() - begin
        torch_time /= count
    except TimeoutError:
        torch_time = -1
        print("Results of shape", shape, "Timeout!")
    except:
        torch_time = -1
        print("Results of shape", shape, "\n| The calculation is not correct !!!")

    print("Results of shape", shape,
          "\n| your time:", tvm_time, "s | pytorch time:", torch_time, "s\n")
    time_count.put([tvm_time, torch_time])
def veval(vm, *args, ctx=tvm.cpu()):
    assert isinstance(vm, _vm.VirtualMachine), "expected VirtualMachine"
    vm.init(ctx)
    ret = vm.run(*args)
    return ret
def run_timing(device, platform, model, remote=None, autotvm_log=None,
               batch=1, runs=3, reps=5, log=None):
    """
    Run a time trial on TVM

    :param device: The device to run this on
    :param platform: The platform to get the machine learning model from
    :param model: The machine learning model to use
    :param remote: Details about the remote device
    :param autotvm_log: The path to the AutoTVM file
    :param batch: The number of pictures to run in one go
    :param runs: The number of runs to run the picture through
    :param reps: The number of times the measurement should be repeated
    :param log: The output file
    """
    # Output details of run
    from cpuinfo import get_cpu_info
    from datetime import datetime

    print("\n──────────────────────────── TVMUI ────────────────────────────\n")
    log.write("TVM Time Trial\n")
    log_print(log, "Started on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))
    if remote is None:
        log_print(log, 'Hardware: ' + device)
        if device == 'x86':
            log_print(log, 'CPU Type: ' + get_cpu_info().get('brand_raw'))
    else:
        log_print(log, 'Remote Name: ' + remote["name"])
        log_print(log, 'Remote Device: ' + remote["type"])
        log_print(log, 'Remote Hardware: ' + remote["hardware"])
    log_print(log, 'Backend: ' + platform)
    log_print(log, 'Model: ' + model)
    log_print(log, str(batch) + " picture(s) per run")
    log_print(log, str(runs) + " run average, repeated " + str(reps) + " times.")
    if autotvm_log is None:
        log_print(log, 'AutoTVM: No\n')
    else:
        log_print(log, 'AutoTVM: Yes\n')

    # Get the model and image data
    import numpy as np
    from PIL import Image
    from tvm import relay
    import tvm
    from tvm.contrib.download import download_testdata

    print("Loading models and images...")
    pictures = get_pics(batch)
    dataset = []

    if platform == "MXNet":
        from mxnet.gluon.model_zoo.vision import get_model

        block = get_model(model, pretrained=True)
        synset_url = "".join(
            [
                "https://gist.githubusercontent.com/zhreshold/",
                "4d0b62f3d01426887599d4f7ede23ee5/raw/",
                "596b27d23537e5a1b5751d2b0481ef172f58b539/",
                "imagenet1000_clsid_to_human.txt",
            ]
        )
        synset_name = "imagenet1000_clsid_to_human.txt"
        synset_path = download_testdata(synset_url, synset_name, module="data")
        with open(synset_path) as f:
            synset = eval(f.read())

        def transform_image(image):
            image = np.array(image) - np.array([123.0, 117.0, 104.0])
            image /= np.array([58.395, 57.12, 57.375])
            image = image.transpose((2, 0, 1))
            image = image[np.newaxis, :]
            return image

        if model == 'resnet18_v1' or model == 'mobilenetv2_1.0':
            for img in pictures:
                dataset.append(transform_image(Image.open(img).resize((224, 224))))
            input_shape = [batch, 3, 224, 224]
        elif model == 'inceptionv3':
            for img in pictures:
                dataset.append(transform_image(Image.open(img).resize((299, 299))))
            input_shape = [batch, 3, 299, 299]
        else:
            raise Exception("Invalid Model")

        shape_dict = {"data": input_shape}
        mod, params = relay.frontend.from_mxnet(block, shape_dict)
        func = mod["main"]
        func = relay.Function(func.params, relay.nn.softmax(func.body),
                              None, func.type_params, func.attrs)

    elif platform == "PyTorch":
        import torch
        import torchvision

        model = getattr(torchvision.models, model)(pretrained=True)
        model = model.eval()

        # We grab the TorchScripted model via tracing
        input_shape = [batch, 3, 224, 224]
        input_data = torch.randn(input_shape)
        scripted_model = torch.jit.trace(model, input_data).eval()

        synset_url = "".join(
            [
                "https://raw.githubusercontent.com/Cadene/",
                "pretrained-models.pytorch/master/data/",
                "imagenet_synsets.txt",
            ]
        )
        synset_name = "imagenet_synsets.txt"
        synset_path = download_testdata(synset_url, synset_name, module="data")
        with open(synset_path) as f:
            synsets = f.readlines()
        synsets = [x.strip() for x in synsets]
        splits = [line.split(" ") for line in synsets]
        key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits}

        class_url = "".join(
            [
                "https://raw.githubusercontent.com/Cadene/",
                "pretrained-models.pytorch/master/data/",
                "imagenet_classes.txt",
            ]
        )
        class_name = "imagenet_classes.txt"
        class_path = download_testdata(class_url, class_name, module="data")
        with open(class_path) as f:
            class_id_to_key = f.readlines()
        class_id_to_key = [x.strip() for x in class_id_to_key]

        def transform_image(image):
            from torchvision import transforms

            my_preprocess = transforms.Compose(
                [
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225]),
                ]
            )
            img = my_preprocess(image)
            return np.expand_dims(img, 0)

        for img in pictures:
            dataset.append(transform_image(Image.open(img).resize((224, 224))))
        input_name = "data"
        shape_list = [(input_name, input_shape)]
        func, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    elif platform == "TensorFlow":
        import tensorflow as tf
        import os

        try:
            tf_compat_v1 = tf.compat.v1
        except ImportError:
            tf_compat_v1 = tf
        import tvm.relay.testing.tf as tf_testing

        # Base location for model related files.
        repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/"
        model_name = "classify_image_graph_def-with_shapes.pb"
        model_url = os.path.join(repo_base, model_name)

        # Image label map
        map_proto = "imagenet_2012_challenge_label_map_proto.pbtxt"
        map_proto_url = os.path.join(repo_base, map_proto)

        # Human readable text for labels
        label_map = "imagenet_synset_to_human_label_map.txt"
        label_map_url = os.path.join(repo_base, label_map)

        model_path = download_testdata(model_url, model_name, module=["tf", "InceptionV1"])
        map_proto_path = download_testdata(map_proto_url, map_proto, module="data")
        label_path = download_testdata(label_map_url, label_map, module="data")

        with tf_compat_v1.gfile.GFile(model_path, "rb") as f:
            graph_def = tf_compat_v1.GraphDef()
            graph_def.ParseFromString(f.read())
            graph = tf.import_graph_def(graph_def, name="")
            # Call the utility to import the graph definition into default graph.
            graph_def = tf_testing.ProcessGraphDefParam(graph_def)
            # Add shapes to the graph.
            with tf_compat_v1.Session() as sess:
                graph_def = tf_testing.AddShapesToGraphDef(sess, "softmax")

        for img in pictures:
            dataset.append(np.array(Image.open(img).resize((299, 299))))
        shape_dict = {"data": [batch, 3, 299, 299]}
        dtype_dict = {"DecodeJpeg/contents": "uint8"}
        mod, params = relay.frontend.from_tensorflow(graph_def, layout=None, shape=shape_dict)
    else:
        raise Exception('Not Supported!')

    # Build the graph
    if device == 'x86':
        target = "llvm"
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + target)
    elif device == 'Metal':
        target = "metal"
        ctx = tvm.metal(0)
        log_print(log, 'Target: ' + target)
    elif device == 'arm_cpu':
        target = tvm.target.arm_cpu(remote["type"])
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + remote["type"])
    else:
        target = device
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + device)
    log_print(log, 'Actual Model: ' + model + '\n')

    print('Making the graph...')
    if autotvm_log is not None:
        from tvm import autotvm

        log_print(log, 'Using AutoTVM file ' + autotvm_log)
        with autotvm.apply_graph_best(autotvm_log):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(func, target, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(func, target, params=params)

    print("\nSetting up TVM...")
    from tvm.contrib import graph_runtime

    # Remote upload
    if remote is not None:
        from tvm import rpc
        from tvm.contrib import utils, graph_runtime as runtime

        print("Exporting graph...")
        tmp = utils.tempdir()
        lib_fname = tmp.relpath("net.tar")
        lib.export_library(lib_fname)
        print("Connecting to device...")
        remote = rpc.connect(str(remote["ip"]), int(remote["port"]))
        print("Uploading to device...")
        remote.upload(lib_fname)
        lib = remote.load_module("net.tar")
        if device == 'x86':
            ctx = remote.cpu(0)
        elif device == 'Metal':
            ctx = remote.metal(0)
        elif device == 'arm_cpu':
            ctx = remote.cpu(0)
        else:
            ctx = remote.cpu(0)

    dtype = "float32"
    m = graph_runtime.GraphModule(lib["default"](ctx))

    def run_tvm(pics, number, repeat):
        """
        Runs a single inference and gives back the time

        :param pics: The image(s) to run
        :param number: The number of times to run the inference
        :param repeat: The number of times to repeat the measurement
        :return: An array with the time and the result
        """
        # combine pictures
        arr = np.ndarray(shape=input_shape, dtype=dtype)
        p = 0
        for ip in pics:
            arr[p] = ip.astype(dtype)
            p = p + 1
        m.set_input("data", tvm.nd.array(arr))

        # Actually run inference
        time = m.module.time_evaluator("run", ctx, number=number, repeat=repeat)()

        # Get output
        res = []
        if platform == 'MXNet':
            for i in range(len(pics)):
                res.append(synset[np.argmax(m.get_output(0).asnumpy()[i])])
        if platform == 'PyTorch':
            # Get top-1 result for TVM
            for i in range(len(pics)):
                top1_tvm = np.argmax(m.get_output(0).asnumpy()[i])
                tvm_class_key = class_id_to_key[top1_tvm]
                res.append(key_to_classname[tvm_class_key])
        if platform == 'TensorFlow':
            pre = np.squeeze(m.get_output(0, tvm.nd.empty(((1, 1008)), "float32")).asnumpy())
            node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path,
                                                uid_lookup_path=label_path)
            top_k = pre.argsort()[-5:][::-1]
            res = node_lookup.id_to_string(top_k[0])
        return [time, res]

    # Run the inferences
    output = []
    total = 0
    print("\nRunning inferences...")
    for i in range(int(len(dataset) / batch)):
        log_print(log, "\nSet " + str(i + 1) + ":")
        inp = []
        # Create the next batch
        for j in range(batch):
            inp.append(dataset[batch * i + j])
        # Run inference here
        output = run_tvm(inp, runs, reps)
        # Output results
        e = 0
        for rl in output[1]:
            log_print(log, "Image " + str(e + 1) + " Path: " + pictures[batch * i + e])
            log_print(log, "Image " + str(e + 1) + " ID: " + rl)
            e = e + 1
        log_print(log, "Time taken: " + str('%.2f' % (1000 * output[0].mean)) + " ms")
        total = total + output[0].mean

    ave = total / int(len(dataset) / batch)
    log_print(log, '\nAVERAGE TIME: ' + str(ave * 1000) + " ms")
    log_print(log, "Finished on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))
    log.close()
    return
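A hypothetical invocation of run_timing, for orientation only; the log argument is a plain file handle that the function itself closes, and the parameter values here are illustrative rather than prescribed by the tool:

# Time resnet18_v1 from the MXNet model zoo on the local x86 CPU.
logf = open("tvmui_run.txt", "w")
run_timing(device="x86", platform="MXNet", model="resnet18_v1",
           batch=1, runs=3, reps=5, log=logf)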
img = Image.open('cat.png').resize((224, 224))
img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
img_y, img_cb, img_cr = img_ycbcr.split()
x = np.array(img_y)[np.newaxis, np.newaxis, :, :]

######################################################################
# Compile the model with relay
# ---------------------------------------------
target = 'llvm'

input_name = '1'
shape_dict = {input_name: x.shape}
sym, params = relay.frontend.from_onnx(onnx_model, shape_dict)

with relay.build_config(opt_level=1):
    intrp = relay.build_module.create_executor('graph', sym, tvm.cpu(0), target)

######################################################################
# Execute on TVM
# ---------------------------------------------
dtype = 'float32'
tvm_output = intrp.evaluate(sym)(tvm.nd.array(x.astype(dtype)), **params).asnumpy()

######################################################################
# Display results
# ---------------------------------------------
# We put input and output image neck to neck
from matplotlib import pyplot as plt
out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode='L')
                          [ 0.,  1.,  0., -2., -1.,  1., -1.],
                          [-1.,  2.,  1.,  0.,  1.,  2.,  1.],
                          [-1., -1.,  2.,  0.,  1.,  1., -1.],
                          [ 1., -1.,  0.,  0.,  1.,  0., -1.],
                          [-1.,  2.,  0.,  1.,  2., -2.,  0.],
                          [ 2., -1.,  1., -1.,  1.,  0.,  1.]],
                         [[ 1., -1.,  2.,  2.,  0.,  1., -1.],
                          [-2.,  1.,  0., -2., -1.,  1.,  0.],
                          [ 1.,  2., -1.,  0.,  0.,  2., -1.],
                          [-1., -1.,  2.,  0.,  1.,  1., -1.],
                          [ 0.,  1., -2.,  0.,  1.,  0., -1.],
                          [-1.,  2.,  0.,  1.,  2., -2.,  1.],
                          [ 0., -1.,  0., -1., -2., -1.,  1.]]]]).astype("float32")
params['p0'] = tvm.nd.array(temp_param, ctx=tvm.cpu(0))

module = runtime.create(graph, lib, ctx)
module.set_input("data", data)
module.set_input(**params)
print("%%%%%%params%%%%%%%")
print(params)
module.run()
print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%test8%%%%%%%%%%%%%%%%%%%%%%%")
out_shape = (2, 2, 3, 3)
out = (module.get_output(5712, tvm.nd.empty(out_shape))).asnumpy()
print("----------TEST9----------")
print(out)
batch, in_channel, in_height, in_width = data.shape
print(batch, in_channel, in_height, in_width)
def test_pipe_runtime_error_check():
    # This function is used to trigger runtime errors by applying wrong logic.
    if pipeline_executor.pipeline_executor_enabled():
        # Get three pipeline modules here.
        (mod1, mod2, mod3), dshape = get_mannual_mod()

        # The input or output name is illegal and expects a runtime error.
        pipe_error = pipeline_executor.PipelineConfig()
        with pytest.raises(RuntimeError):
            pipe_error[mod1]["output"][9]

        with pytest.raises(RuntimeError):
            pipe_error[mod1]["input"]["data_9"]

        # The module connection will cause a cycle in DAG and expects runtime error.
        with pytest.raises(RuntimeError):
            pipe_error[mod1]["output"][0].connect(pipe_error[mod2]["input"]["data_0"])
            pipe_error[mod2]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])

        # The module connection is illegal and expects runtime error.
        with pytest.raises(RuntimeError):
            pipe_error[mod1]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])

        with pytest.raises(RuntimeError):
            pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod1]["input"]["data_0"])

        with pytest.raises(RuntimeError):
            pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod2]["input"]["data_0"])

        with pytest.raises(RuntimeError):
            pipe_error[mod1]["output"][0].connect(pipe_error["input"]["data_0"])

        with pytest.raises(RuntimeError):
            pipe_error["input"]["data_0"].connect(pipe_error[mod1]["output"][0])

        with pytest.raises(RuntimeError):
            pipe_error["output"]["0"].connect(pipe_error[mod1]["output"][0])

        # Create a pipeline executor to check the executor runtime errors.
        pipe_config = pipeline_executor.PipelineConfig()
        pipe_config[mod1].target = "llvm"
        pipe_config[mod1].dev = tvm.cpu(0)
        pipe_config["param_group"]["param_0"].connect(pipe_config[mod1]["param"])
        pipe_config[mod1]["output"][0].connect(pipe_config["output"]["0"])
        # Build and create a pipeline module.
        with tvm.transform.PassContext(opt_level=3):
            pipeline_mod_factory = pipeline_executor.build(pipe_config)
        pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory)
        customized_parameters, _ = recreate_parameters(mod1)

        # Checking the pipeline executor runtime errors.
        with pytest.raises(RuntimeError):
            pipeline_module.set_params("param_0", None)

        with pytest.raises(RuntimeError):
            pipeline_module.set_params("param_1", customized_parameters)
def test_pipeline():
    if pipeline_executor.pipeline_executor_enabled():
        target_list = tvm.testing.enabled_targets()
        for target in target_list:
            # Get the three pipeline modules here.
            (mod1, mod2, mod3), dshape = get_mannual_mod()

            # Prepare batch data for pipeline computation.
            datas = []
            for i in range(5):
                datas.append(np.full(dshape, 3 + i).astype("float32"))

            pipe_config = pipeline_executor.PipelineConfig()

            customized_parameters, customized_parameters_mod = recreate_parameters(mod1)
            assert customized_parameters_mod == mod1
            # The global parameters group named "param_0" will be connected to "mod1"
            # as parameters.
            pipe_config["param_group"]["param_0"].connect(pipe_config[mod1]["param"])
            # The pipeline input named "data_a" will be connected to an input named
            # "data_0" of mod1.
            pipe_config["input"]["data_a"].connect(pipe_config[mod1]["input"]["data_0"])

            # The pipeline input named "data_b" will be connected to an input named
            # "data_1" of mod2.
            pipe_config["input"]["data_b"].connect(pipe_config[mod2]["input"]["data_1"])

            # The mod1 output[0] will be connected to an input named "data_0" of mod2.
            pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_0"])

            # The mod1 output[1] will be connected to an input named "data_0" of mod3.
            pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_0"])

            # The mod2 output[0] will be connected to an input named "data_1" of mod3.
            pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_1"])

            # The mod1 output[2] will be connected to pipeline output[0].
            pipe_config[mod1]["output"][2].connect(pipe_config["output"]["0"])

            # The mod3 output[0] will be connected to pipeline output[1].
            pipe_config[mod3]["output"][0].connect(pipe_config["output"]["1"])
            # Printing the configuration (print(pipe_config)) yields something like
            # the following.
            #
            # Inputs
            #   |data_a: mod1:data_0
            #   |data_b: mod2:data_1
            #
            # output
            #   |output(1) : mod1.output(2)
            #   |output(2) : mod3.output(0)
            #
            # connections
            #   |mod1.output(0)-> mod2.data_0
            #   |mod1.output(1)-> mod3.data_0
            #   |mod2.output(0)-> mod3.data_1

            # Set other parameters.
            pipe_config[mod1].target = target[0]
            pipe_config[mod1].dev = target[1]

            pipe_config[mod2].target = "llvm"
            pipe_config[mod2].dev = tvm.cpu(0)

            pipe_config[mod3].target = "llvm"
            pipe_config[mod3].dev = tvm.cpu(0)

            # Here is to check the correctness of the configuration generated by API.
            mconfig = pipe_config.get_config()
            assert mconfig["module_connection"] == get_manual_conf([mod1, mod2, mod3], target)

            # Build and create a pipeline module.
            with tvm.transform.PassContext(opt_level=3):
                pipeline_mod_factory = pipeline_executor.build(pipe_config)

            # Export the parameter configuration to a file.
            directory_path = tvm.contrib.utils.tempdir().temp_dir
            # If the directory does not exist, create it.
            if not os.path.exists(directory_path):
                os.makedirs(directory_path)
            config_file_name = pipeline_mod_factory.export_library(directory_path)

            # Use the output of build to create and initialize PipelineModule.
            pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory)
            assert pipeline_module

            # Use the import function to create and initialize PipelineModule.
            pipeline_module_test = pipeline_executor.PipelineModule.load_library(
                config_file_name)
            assert pipeline_module_test.num_outputs == 2

            input_map = pipeline_module_test.get_input_pipeline_map("data_b")
            assert input_map[0] == "1" and input_map[1] == "data_1"
            input_map = pipeline_module_test.get_input_pipeline_map("data_a")
            assert input_map[0] == "0" and input_map[1] == "data_0"
            module_index = pipeline_module_test.get_params_group_pipeline_map("param_0")
            assert module_index == 0
            # Using the parameters group name to set parameters.
            pipeline_module_test.set_params("param_0", customized_parameters)

            for data in datas:
                # Getting the result without setting customized parameters.
                wrong_output = run_modules(
                    mconfig["module_connection"],
                    tvm.cpu(),
                    "llvm",
                    "data_0",
                    data,
                    mod2,
                    "data_1",
                    data,
                )
                # Getting the result with setting customized parameters.
                normal_output = run_modules(
                    mconfig["module_connection"],
                    tvm.cpu(),
                    "llvm",
                    "data_0",
                    data,
                    mod2,
                    "data_1",
                    data,
                    customized_parameters_mod,
                    customized_parameters,
                )
                pipeline_module_test.set_input("data_a", data)
                pipeline_module_test.set_input("data_b", data)
                input_data = pipeline_module_test.get_input("data_a")
                tvm.testing.assert_allclose(data, input_data.numpy())
                # Running the pipeline executor in sequential mode.
                pipeline_module_test.run(True)
                outputs = pipeline_module_test.get_output()
                for i in range(len(outputs)):
                    tvm.testing.assert_allclose(normal_output[i], outputs[i].numpy())
                    assert not (normal_output[i] == wrong_output[i]).all()

                pipeline_module_test.stop()
def test_cpu(func, cpu_args):
    evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=5)
    ms = evaluator(*cpu_args).mean
    print('CPU Convolution: %.2f ms' % (ms * 1000))
def get_manual_conf(mods, target):
    # This function is used to generate manual pipeline configuration.
    mod_config = {}
    # The third output is the final output; the second output goes to mod3, and
    # the first output goes to the mod2 input.
    pipe_config1 = {
        "mod_idx": 0,
        "output": [
            {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_0"}]},
            {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_0"}]},
            {"output_idx": 2, "dependencies": [{"global_output_index": 0}]},
        ],
    }
    mod_config[mods[0]] = {
        "pipeline": pipe_config1,
        "target_host": None,
        "mod_name": "default",
        "build": None,
        "params": None,
        "target": target[0],
        "dev": target[1],
    }

    pipe_config2 = {
        "mod_idx": 1,
        "output": [
            {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_1"}]},
        ],
    }
    mod_config[mods[1]] = {
        "pipeline": pipe_config2,
        "target_host": None,
        "mod_name": "default",
        "build": None,
        "params": None,
        "target": "llvm",
        "dev": tvm.cpu(0),
    }

    pipe_config3 = {
        "mod_idx": 2,
        "output": [
            {"output_idx": 0, "dependencies": [{"global_output_index": 1}]},
        ],
    }
    mod_config[mods[2]] = {
        "pipeline": pipe_config3,
        "target_host": None,
        "mod_name": "default",
        "build": None,
        "params": None,
        "target": "llvm",
        "dev": tvm.cpu(0),
    }
    return mod_config
def run_unpropagatable_graph(dev, tgt):
    R""" The network is as follows:
            a     b   c     d
             \   /     \   /
              add       mul
                \       /
                subtract
    """

    a = relay.var("a", shape=(10, 10))
    b = relay.var("b", shape=(10, 10))
    c = relay.var("c", shape=(10, 10))
    d = relay.var("d", shape=(10, 10))
    a_data = np.random.rand(10, 10).astype('float32')
    b_data = np.random.rand(10, 10).astype('float32')
    c_data = np.random.rand(10, 10).astype('float32')
    d_data = np.random.rand(10, 10).astype('float32')
    tmp_add = a_data + b_data
    tmp_mul = np.multiply(c_data, d_data)
    ref_res = np.subtract(tmp_add, tmp_mul)

    fallback_device = tvm.context("cpu")
    target = {"cpu": "llvm", dev: tgt}
    cpu_ctx = fallback_device
    dev_ctx = tvm.context(dev)

    def annotated():
        add = relay.add(a, b)
        _add = relay.annotation.on_device(add, dev_ctx)
        mul = relay.multiply(c, d)
        _mul = relay.annotation.on_device(mul, cpu_ctx)
        sub = relay.subtract(add, mul)
        _sub = relay.annotation.on_device(sub, dev_ctx)
        func = relay.Function([a, b, c, d],
                              relay.Tuple(tvm.convert([_add, _mul, _sub, sub])))
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type)
        func = relay.ir_pass.infer_type(func)
        return relay.Function(relay.ir_pass.free_vars(func.body[3]), func.body[3])

    def expected():
        add = relay.add(a, b)
        mul = relay.multiply(c, d)
        copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx)
        sub = relay.subtract(add, copy_mul_sub)
        func = relay.Function([a, b, c, d], sub)
        return func

    annotated_func = annotated()
    expected_func = expected()
    expected_index = [2, 2, 2, 1, 1, 1, 2, 2]
    check_annotated_graph(annotated_func, expected_func)

    params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data}
    config = {"opt_level": 0}
    config["fallback_device"] = fallback_device
    with relay.build_config(**config):
        graph, lib, params = relay.build(annotated_func, target, params=params)
        contexts = [tvm.cpu(0), tvm.context(dev)]
        graph_json = json.loads(graph)
        if "device_index" in graph_json["attrs"]:
            device_index = graph_json["attrs"]["device_index"][1]
            assert device_index == expected_index
        mod = graph_runtime.create(graph, lib, contexts)
        mod.set_input(**params)
        mod.run()
        res = mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
def test_saturation():
    # Same params
    data_dtype = 'uint8'
    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
    y = relay.var("y", shape=(1, 4), dtype=data_dtype)
    z = relay.qnn.op.add(lhs=x, rhs=y,
                         lhs_scale=relay.const(0.125, 'float32'),
                         lhs_zero_point=relay.const(0, 'int32'),
                         rhs_scale=relay.const(0.125, 'float32'),
                         rhs_zero_point=relay.const(0, 'int32'),
                         output_scale=relay.const(0.125, 'float32'),
                         output_zero_point=relay.const(0, 'int32'))
    func = relay.Function([x, y], z)
    mod = tvm.IRModule.from_expr(func)
    mod = relay.qnn.transform.CanonicalizeOps()(mod)
    func = mod["main"]

    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
    y_data = np.array((255, 255, 128, 0)).reshape((1, 4))
    golden_output = np.array((255, 255, 129, 0)).reshape((1, 4))

    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
    op_res = intrp.evaluate(func)(x_data, y_data)
    np.testing.assert_equal(op_res.asnumpy(), golden_output)

    # Same input params, different output scale
    z = relay.qnn.op.add(lhs=x, rhs=y,
                         lhs_scale=relay.const(0.125, 'float32'),
                         lhs_zero_point=relay.const(0, 'int32'),
                         rhs_scale=relay.const(0.125, 'float32'),
                         rhs_zero_point=relay.const(0, 'int32'),
                         output_scale=relay.const(0.25, 'float32'),
                         output_zero_point=relay.const(0, 'int32'))
    func = relay.Function([x, y], z)
    mod = tvm.IRModule.from_expr(func)
    mod = relay.qnn.transform.CanonicalizeOps()(mod)
    func = mod["main"]

    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
    golden_output = np.array((255, 129, 65, 0)).reshape((1, 4))

    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
    op_res = intrp.evaluate(func)(x_data, y_data)
    np.testing.assert_equal(op_res.asnumpy(), golden_output)

    # All params different
    z = relay.qnn.op.add(lhs=x, rhs=y,
                         lhs_scale=relay.const(0.5, 'float32'),
                         lhs_zero_point=relay.const(0, 'int32'),
                         rhs_scale=relay.const(0.25, 'float32'),
                         rhs_zero_point=relay.const(0, 'int32'),
                         output_scale=relay.const(0.125, 'float32'),
                         output_zero_point=relay.const(0, 'int32'))
    func = relay.Function([x, y], z)
    mod = tvm.IRModule.from_expr(func)
    mod = relay.qnn.transform.CanonicalizeOps()(mod)
    func = mod["main"]

    x_data = np.array((255, 0, 1, 0)).reshape((1, 4))
    y_data = np.array((0, 128, 64, 0)).reshape((1, 4))
    golden_output = np.array((255, 255, 132, 0)).reshape((1, 4))

    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
    op_res = intrp.evaluate(func)(x_data, y_data)
    np.testing.assert_equal(op_res.asnumpy(), golden_output)
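# The golden outputs above can be reproduced by hand: qnn.add requantizes each
# operand into the output scale, adds in the quantized domain, and saturates to
# uint8. A minimal sketch (hypothetical helper; assumes zero points of 0 as in
# every case above, and TVM's round-half-upward requantize rounding):
def qnn_add_ref(x, y, lhs_scale, rhs_scale, output_scale):
    qx = np.floor(x * (lhs_scale / output_scale) + 0.5)  # requantize lhs
    qy = np.floor(y * (rhs_scale / output_scale) + 0.5)  # requantize rhs
    return np.clip(qx + qy, 0, 255).astype('uint8')      # saturating add

# e.g. for the first case (all scales equal): (255 + 255) -> 510 saturates to
# 255, (1 + 255) -> 256 saturates to 255, (1 + 128) -> 129, (0 + 0) -> 0.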
def setup_gmod():
    loaded_lib = tvm.runtime.load_module(path_lib)
    dev = tvm.cpu(0)
    return loaded_lib["default"](dev)
def _run_unlinked(lib_mod):
    graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0)))
    graph_rt.set_input("rand_input", rand_input, **params)
    graph_rt.run()
    return graph_rt.get_output(0)
# Typically ONNX models mix model input values with parameter values, with
# the input having the name `1`. This is model dependent, and you should check
# with the documentation for your model to determine the full input and
# parameter name space.
#
# Passing the shape dictionary to the `relay.frontend.from_onnx` method
# tells relay which ONNX graph values are inputs and which are parameters, and
# provides a static definition of the input size.
target = "llvm"

input_name = "1"
shape_dict = {input_name: x.shape}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

with tvm.transform.PassContext(opt_level=1):
    intrp = relay.build_module.create_executor("graph", mod, tvm.cpu(0), target)

######################################################################
# Execute on TVM
# ---------------------------------------------
dtype = "float32"
tvm_output = intrp.evaluate()(tvm.nd.array(x.astype(dtype)), **params).asnumpy()

######################################################################
# Display results
# ---------------------------------------------
# We put the input and output images side by side. The luminance channel, `Y`,
# is the output from the model. The chroma channels `Cb` and `Cr` are resized
# to match with a simple bicubic algorithm. The image is then recombined and
# converted back to `RGB`.
from matplotlib import pyplot as plt
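# A minimal sketch of that recombination (`img_cb` and `img_cr` are assumed to
# hold the chroma channels split off during the earlier preprocessing step,
# and `np` is numpy):
from PIL import Image

out_y = Image.fromarray(np.uint8(tvm_output[0, 0].clip(0, 255)), mode="L")
out_cb = img_cb.resize(out_y.size, Image.BICUBIC)  # bicubic chroma upscaling
out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB")
plt.imshow(result)
plt.show()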
def test_correctness_layout_rewrite_rewrite_for_preTransformed():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(
            inp.state,
            layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.RewriteForPreTransformed)
        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype)
                   for x in bufs]
        np_args_ref = [np.array(x) for x in np_args]

        weight = np_args_ref[1]
        # Infer the shape for the rewritten layout.
        if len(weight.shape) >= 6:
            # For the CPU tile structure SSRSRS, the last six axes of the
            # rewritten weight are the tiled spatial (S) and reduction (R)
            # axes; fold them back into a 2-D (reduction, output) matrix
            # for the reference kernel.
            base = len(weight.shape) - 6
            red_dim = weight.shape[2 + base] * weight.shape[4 + base]
            out_dim = weight.shape[3 + base] * weight.shape[5 + base]
            for i in range(base + 2):
                out_dim *= weight.shape[i]
            new_order = ([2 + base, 4 + base] + list(range(base + 2)) +
                         [3 + base, 5 + base])
            np_args_ref[1] = np_args_ref[1].transpose(new_order)
            np_args_ref[1] = np_args_ref[1].reshape((red_dim, out_dim))

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(),
                                    atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(),
                                    atol=1e-3, rtol=1e-3)
        del measure_ctx
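# A standalone sketch (hypothetical shapes) of what the transpose/reshape above
# does: the two reduction-tile axes move to the front and all remaining axes
# fold into the output dimension, recovering a 2-D weight matrix.
import numpy as np

w = np.arange(2 * 3 * 4 * 5 * 6 * 7).reshape((2, 3, 4, 5, 6, 7))
red_dim = w.shape[2] * w.shape[4]                            # 4 * 6 = 24
out_dim = w.shape[3] * w.shape[5] * w.shape[0] * w.shape[1]  # 5 * 7 * 2 * 3 = 210
w2 = w.transpose([2, 4, 0, 1, 3, 5]).reshape((red_dim, out_dim))
print(w2.shape)  # (24, 210)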
def convert(self, lst, *, target='cpu', dev_id=0):
    """Converts the list of nodes to a runnable form.

    All the nodes in the list must represent linear flow (no calls,
    branches, ...)

    Returns:
        (fn, inputs, outputs):

        - fn: A callable function
        - inputs: the list of input nodes whose values should be
          provided to the function
        - outputs: the list of output nodes corresponding to the
          outputs of the function

    Notes:
        This implementation converts the nodes to NNVM and compiles it.
    """
    self.c = count()
    self.eqv = {}
    self.inputs = []
    self.input_names = []
    self.constants = {}
    self.constant_vars = {}
    self.shapes = {}
    self.types = {}

    for n in lst:
        assert n.is_apply()
        assert n.inputs[0].is_constant(Primitive)
        fn = n.inputs[0].value
        conv = self.mapping.get(fn, None)
        if conv is not None:
            self.eqv[n] = conv(self, *n.inputs[1:])
        else:
            raise NotImplementedError(fn)

    outputs = get_outputs(lst, lst[0].graph.manager.uses, set(self.eqv.keys()))

    inmap = dict((self.eqv[i], i) for i in self.inputs)

    # Check for empty functions
    if all(self.eqv[o] in inmap for o in outputs):
        return None, [inmap[self.eqv[o]] for o in outputs], outputs

    if target == 'cpu':
        target = 'llvm'

    g = nnvm.graph.create(sym.Group(list(self.eqv[o] for o in outputs)))
    dg, lib, params = nnvm.compiler.build(
        g, target=target, shape=self.shapes, dtype=self.types,
        params=self.constants)

    shape = dg.json_attr('shape')
    types = dg.json_attr('dtype')
    index = dg.index

    def spec(entry_id):
        return (shape[entry_id], graph_attr.TCODE_TO_DTYPE[types[entry_id]])

    output_specs = [spec(index.entry_id(x)) for x in index.output_entries]
    assert len(output_specs) == len(outputs)

    if target == 'llvm':
        context = tvm.cpu(dev_id)
    elif target == 'cuda':  # pragma: no cover
        context = tvm.gpu(dev_id)
    else:  # pragma: no cover
        raise Exception(f"Unsupported target: {target}")

    module = graph_runtime.create(dg, lib, context)

    for n, p in params.items():
        module.set_input(n, p)

    input_types = [self.types[i] for i in self.input_names]
    return (NNVMRunner(module, self.input_names, input_types,
                       output_specs, context),
            self.inputs, outputs)
def test_meta_schedule_tune_relay(model_name: str, batch_size: int, target: str):
    if model_name == "inception_v3" and batch_size == 1:
        pytest.skip("inception_v3 does not handle batch_size of 1")

    input_shape: Tuple[int, ...]
    input_name = "input0"
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if MODEL_TYPES[model_name] == MODEL_TYPE.TEXT_CLASSIFICATION:
        seq_length = 128
        input_name = "input_ids"
        input_shape = (batch_size, seq_length)
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        if MODEL_TYPES[model_name] == MODEL_TYPE.IMAGE_CLASSIFICATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.SEGMENTATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.OBJECT_DETECTION:
            input_shape = (1, 3, 300, 300)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.VIDEO_CLASSIFICATION:
            input_shape = (batch_size, 3, 3, 299, 299)
        else:
            raise ValueError("Unsupported model: " + model_name)
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    output_shape: Tuple[int, int] = (batch_size, 1000)

    mod, params = get_torch_model(
        model_name=model_name,
        input_shape=input_shape,
        output_shape=output_shape,
        dtype="float32",
    )

    with tempfile.TemporaryDirectory() as work_dir:
        target = Target(target)
        database = DummyDatabase()
        rt_mod: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
            database=database,
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
print(model_jit.graph)

print("run torchscript...")
for i in range(20):
    t = time.time()
    model_jit(x)
    print(time.time() - t)  # latency in seconds

option = {
    "input_infos": [
        ("x", (1, 3, 224, 224)),
    ],
    "default_dtype": "float16",
    "export_dir": "pytorch_compiled",
    "num_outputs": 1,
    "tuning_n_trials": 1,  # set zero to skip tuning
    "tuning_log_file": "tuning.log",
    "target": "llvm",
    "device": tvm.cpu(),
}
pytorch_tvm_module = compile(model_jit, option)
torch.jit.script(pytorch_tvm_module).save("model_tvm.pt")

print("run pytorch_tvm_module...")
for i in range(20):
    t = time.time()
    outputs = pytorch_tvm_module.forward([x.cpu()])
    print(1000 * (time.time() - t))  # latency in milliseconds

print(outputs[0].shape)
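# The saved TorchScript module can later be reloaded and run without the
# compilation step (a minimal sketch; assumes the same input `x` as above):
reloaded = torch.jit.load("model_tvm.pt")
outputs = reloaded.forward([x.cpu()])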
## To be able to accelerate inference with Vitis-AI DPU accelerators, models
## usually need to be quantized upfront. In the ONNXRuntime Vitis-AI
## execution provider we make use of On-The-Fly (OTF) Quantization
## to remove this additional preprocessing step. In this flow,
## one doesn't need to quantize the model upfront; instead, one can
## make use of the typical inference execution calls
## (InferenceSession.run) to quantize the model on-the-fly
## using the first N inputs. This will set up and calibrate
## the Vitis-AI DPU, and from that point onwards inference
## will be accelerated for all subsequent inputs.
## Set the number of inputs used for quantization to e.g. 8
## using the PX_QUANT_SIZE environment variable if you want
## to quantize on fewer inputs. The default is 128.
############################################################

print("Create InferenceSession for OTF Quantization")
InferenceSession = graph_runtime.GraphModule(lib["default"](tvm.cpu()))

px_quant_size = int(os.environ['PX_QUANT_SIZE']) \
    if 'PX_QUANT_SIZE' in os.environ else 128

print("Start OTF Quantization on first {} images".format(px_quant_size))

quant_files = [
    os.path.join(QUANT_DIR, f) for f in os.listdir(QUANT_DIR)
    if f.endswith(('JPEG', 'jpg', 'png'))
][:px_quant_size]
quant_images = inputs_func(quant_files)
print('Loaded {} inputs successfully.'.format(len(quant_images)))

for i in range(px_quant_size):
    InferenceSession.set_input(input_name, quant_images[i])
    # Running the module on each calibration input is what drives the
    # on-the-fly quantization.
    InferenceSession.run()
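## As a usage sketch (hypothetical value): quantize on only 8 calibration
## images by setting PX_QUANT_SIZE before the lookup above executes,
## e.g. at the top of the script:
##
##     os.environ["PX_QUANT_SIZE"] = "8"
##
## or exported in the shell before launching the script.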
map_proto_url = os.path.join(repo_base, map_proto)

# Human readable text for labels
label_map = "imagenet_synset_to_human_label_map.txt"
label_map_url = os.path.join(repo_base, label_map)

# Target settings
# Use these commented settings to build for cuda.
# target = 'cuda'
# target_host = 'llvm'
# layout = "NCHW"
# ctx = tvm.gpu(0)
target = "llvm"
target_host = "llvm"
layout = None
ctx = tvm.cpu(0)

######################################################################
# Download required files
# -----------------------
# Download files listed above.
from tvm.contrib.download import download_testdata

img_path = download_testdata(image_url, img_name, module="data")
model_path = download_testdata(model_url, model_name, module=["tf", "InceptionV1"])
map_proto_path = download_testdata(map_proto_url, map_proto, module="data")
label_path = download_testdata(label_map_url, label_map, module="data")

######################################################################
# target x86 CPU
target = "llvm"
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod[mod.entry_func], target, params=params)

######################################################################
# Execute on TVM
# ---------------------------------------------
import tvm
from tvm.contrib import graph_runtime as runtime

# create a runtime executor module
module = runtime.create(graph, lib, tvm.cpu())

# feed input data
module.set_input(input_tensor, tvm.nd.array(image_data))

# feed related params
module.set_input(**params)

# run
module.run()

# get output
tvm_output = module.get_output(0).asnumpy()

######################################################################
# Display results
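# As a minimal sketch of inspecting the result (assumes a 1000-class
# ImageNet-style classifier and numpy imported as `np`, both assumptions here):
top1 = int(np.argmax(tvm_output[0]))
print("Top-1 class index:", top1)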