def test_gru_like():
    def unit(rnn_dim):
        X = relay.var("X", shape=(1, rnn_dim))
        W = relay.var("y", shape=(3 * rnn_dim, rnn_dim))
        matmul = relay.nn.dense(X, W)
        splitted = relay.split(matmul, indices_or_sections=3, axis=1)
        out = relay.sigmoid(splitted[0]) + relay.tanh(splitted[1]) * relay.exp(splitted[2])
        return relay.Function([X, W], out)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def unit_numpy(X, W):
        prod = np.dot(X, W.transpose())
        splits = np.split(prod, indices_or_sections=3, axis=1)
        return sigmoid(splits[0]) + np.tanh(splits[1]) * np.exp(splits[2])

    dtype = "float32"
    rnn_dim = 1000
    x = np.random.rand(1, rnn_dim).astype(dtype)
    y = np.random.rand(3 * rnn_dim, rnn_dim).astype(dtype) * 0.01 - 0.005
    out_shape = (1, rnn_dim)
    z = unit(rnn_dim)

    for target, ctx in ctx_list():
        with relay.build_config(opt_level=2):
            graph, lib, params = relay.build(z, target)
            m = graph_runtime.create(graph, lib, ctx)
            m.set_input("X", tvm.nd.array(x.astype(dtype)))
            m.set_input("y", tvm.nd.array(y.astype(dtype)))
            m.set_input(**params)
            m.run()
            out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
            ref = unit_numpy(x, y)
            tvm.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_program(net, target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
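# Usage sketch: tune_and_evaluate() above assumes several module-level names
# (model_name, batch_size, target, dtype, log_file, get_network, tune_kernels).
# A minimal, illustrative setup modeled on the TVM AutoTVM x86 tutorial is
# shown below; the exact keys of `tuning_option` must match the keyword
# parameters of your tune_kernels():
import numpy as np
import tvm
from tvm import autotvm, relay
from tvm.contrib import graph_runtime as runtime

model_name = "resnet-18"
batch_size = 1
dtype = "float32"
target = "llvm"
log_file = "%s.log" % model_name

tuning_option = {
    "log_filename": log_file,
    "tuner": "random",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
    ),
}

# tune_and_evaluate(tuning_option)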
def test_alter_layout_conv2d():
    """Additional layout transformations should occur on the graph."""
    def convnet():
        """Alternating layout of simple convnet (from image super-resolution)."""
        bias1 = relay.var('bias1', shape=(64,))
        bias2 = relay.var('bias2', shape=(64,))
        bias3 = relay.var('bias3', shape=(64,))
        bias4 = relay.var('bias4', shape=(64,))
        weight1 = relay.var('weight1', shape=(64, 1, 5, 5))
        weight2 = relay.var('weight2', shape=(64, 64, 3, 3))
        weight3 = relay.var('weight3', shape=(64, 64, 3, 3))
        weight4 = relay.var('weight4', shape=(64, 64, 3, 3))
        data = relay.var("x", shape=(1, 1, 224, 224))
        n00 = relay.nn.conv2d(data, weight1, padding=[2, 2], kernel_size=[5, 5])
        n01 = relay.expand_dims(bias1, axis=1, num_newaxis=2)
        n02 = relay.add(n00, n01)
        n03 = relay.nn.relu(n02)
        n04 = relay.nn.conv2d(n03, weight2, padding=[1, 1], kernel_size=[3, 3])
        n05 = relay.expand_dims(bias2, axis=1, num_newaxis=2)
        n06 = relay.add(n04, n05)
        n07 = relay.nn.relu(n06)
        n08 = relay.nn.conv2d(n07, weight3, padding=[1, 1], kernel_size=[3, 3])
        n09 = relay.expand_dims(bias3, axis=1, num_newaxis=2)
        n10 = relay.add(n08, n09)
        n11 = relay.nn.relu(n10)
        n12 = relay.nn.conv2d(n11, weight4, padding=[1, 1], kernel_size=[3, 3])
        n13 = relay.expand_dims(bias4, axis=1, num_newaxis=2)
        n14 = relay.add(n12, n13)
        n15 = relay.reshape(n14, newshape=[1, 1, 3, 3, 224, 224])
        n16 = relay.transpose(n15, axes=[0, 1, 4, 2, 5, 3])
        net = relay.reshape(n16, newshape=[1, 1, 672, 672])
        args = relay.ir_pass.free_vars(net)
        return relay.Function(args, net)

    # orig net
    N = convnet()
    N = infer_type(N)

    # trigger a test for each known alter_conv2d
    targets = ['cuda',
               'opencl -device=mali',
               'opencl -device=intel_graphics',
               'llvm -device=arm_cpu',
               'llvm -device=core-avx-ii']

    for tgt in targets:
        with tvm.target.create(tgt) as target:
            # the pass name was garbled in the source; 'AlterOpLayout' is the
            # pass this test exercises
            with relay.build_config(opt_level=-1, add_pass='AlterOpLayout'):
                with autotvm.tophub.context(target):
                    O = relay.optimize(N, target, params=None)
                    O = relay.ir_pass.infer_type(O)

                    # graph should differ
                    assert not relay.ir_pass.alpha_equal(N, O)
def test_compile_placeholder_bypass():
    engine = relay.backend.compile_engine.get()
    x = relay.var("x", shape=(2, 3))
    y = relay.var("y", shape=(2, 3))
    z = relay.var("z", shape=(2, 3))
    result = relay.Tuple([x, relay.op.concatenate([y, z], axis=0)])
    func = relay.Function(relay.ir_pass.free_vars(result), result)
    with relay.build_config(opt_level=0):
        graph, lib, params = relay.build(func, 'llvm')
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    net, params, input_shape, _ = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(net, target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def run_tvm_graph(tflite_model_buf, input_data, input_node, num_output=1,
                  target='llvm', out_names=None):
    """ Generic function to compile on relay and execute on tvm """
    try:
        import tflite.Model
    except ImportError:
        raise ImportError("The tflite package must be installed")

    # get TFLite model from buffer
    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)

    input_data = convert_to_list(input_data)
    input_node = convert_to_list(input_node)

    shape_dict = {}
    dtype_dict = {}
    for i, e in enumerate(input_node):
        shape_dict[e] = input_data[i].shape
        dtype_dict[e] = input_data[i].dtype.name

    func, params = relay.frontend.from_tflite(tflite_model,
                                              shape_dict=shape_dict,
                                              dtype_dict=dtype_dict)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)

    ctx = tvm.context(target, 0)
    from tvm.contrib import graph_runtime
    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    for i, e in enumerate(input_node):
        m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
    m.set_input(**params)
    # execute
    m.run()
    # get outputs
    assert out_names is None or num_output == len(out_names), \
        "out_names: {} num_output: {}".format(out_names, num_output)
    tvm_output_list = []
    for i in range(0, num_output):
        tvm_output = m.get_output(i)
        tvm_output_list.append(tvm_output.asnumpy())
    return tvm_output_list
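# Usage sketch: in TVM's TFLite frontend tests this helper is driven with a
# serialized .tflite buffer plus matching input arrays. The file name below is
# illustrative, and convert_to_list() is assumed to be defined alongside the
# helper:
with open("mobilenet_v1_1.0_224.tflite", "rb") as f:
    tflite_model_buf = f.read()
data = np.random.uniform(size=(1, 224, 224, 3)).astype("float32")
tvm_output = run_tvm_graph(tflite_model_buf, data, "input")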
def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
    # `gluon_impl` and `out_shape` are defined in the enclosing test scope
    shape_dict = {"data": x.shape}
    if gluon_impl:
        new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict)
    else:
        new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict,
                                                    arg_params=args, aux_params=auxs)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(new_sym, target, params=params)

    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    m.set_input("data", tvm.nd.array(x.astype(dtype)))
    m.set_input(**params)
    m.run()
    # get outputs
    out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
    return out.asnumpy()
def get_tvm_output(graph_def, input_data, target, ctx,
                   output_shape=None, output_dtype='float32'):
    """ Generic function to execute and get tvm output"""
    # note: the `target` and `ctx` arguments are overridden below;
    # these ONNX tests always execute on the CPU
    target = 'llvm'
    if isinstance(input_data, list):
        input_names = {}
        shape_dict = {}
        dtype_dict = {}
        for i, _ in enumerate(input_data):
            input_names[i] = graph_def.graph.input[i].name
            shape_dict[input_names[i]] = input_data[i].shape
            dtype_dict[input_names[i]] = input_data[i].dtype
    else:
        input_names = graph_def.graph.input[0].name
        shape_dict = {input_names: input_data.shape}
        dtype_dict = {input_names: input_data.dtype}

    sym, params = relay.frontend.from_onnx(graph_def, shape_dict)
    with relay.build_config(opt_level=1):
        graph, lib, params = relay.build(sym, target, params=params)

    ctx = tvm.cpu(0)
    from tvm.contrib import graph_runtime
    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    if isinstance(input_data, list):
        for i, e in enumerate(input_names):
            m.set_input(input_names[i],
                        tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
    else:
        m.set_input(input_names,
                    tvm.nd.array(input_data.astype(input_data.dtype)))
    m.set_input(**params)
    # execute
    m.run()
    # get outputs
    if isinstance(output_shape, list) and isinstance(output_dtype, list):
        tvm_output_list = []
        for i, _ in enumerate(output_shape):
            tvm_output = m.get_output(i)
            tvm_output_list.append(tvm_output.asnumpy())
        return tvm_output_list
    else:
        tvm_output = m.get_output(0)
        return tvm_output.asnumpy()
def get_tvm_output(net, data, params, target, ctx, dtype='float32'):
    # `out_shape` and `measure` come from the enclosing script
    with relay.build_config(opt_level=1):
        graph, lib, params = relay.build(net, target, params=params)

    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    m.set_input("data", data)
    m.set_input(**params)
    m.run()
    out = m.get_output(0, tvm.nd.empty(out_shape, dtype))

    if measure:
        print("Evaluate graph runtime inference time cost...")
        ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=20)
        # Measure in millisecond.
        prof_res = np.array(ftimer().results) * 1000
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

    return out.asnumpy()
def test_runtime(target, device, func, fallback_device=None, expected_index=None):
    params = {"x": x_data, "y": y_data}
    config = {"opt_level": 1}
    if fallback_device:
        config["fallback_device"] = fallback_device
    with relay.build_config(**config):
        graph, lib, params = relay.build(func, target, params=params)
    contexts = [tvm.cpu(0), tvm.context(device)]
    graph_json = json.loads(graph)
    if "device_index" in graph_json["attrs"]:
        device_index = graph_json["attrs"]["device_index"][1]
        assert device_index == expected_index
    mod = graph_runtime.create(graph, lib, contexts)
    mod.set_input(**params)
    mod.run()
    res = mod.get_output(0).asnumpy()
    tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
def build_module(opts):
    dshape = (1, 3, 224, 224)
    from mxnet.gluon.model_zoo.vision import get_model
    block = get_model('mobilenet0.25', pretrained=True)
    shape_dict = {'data': dshape}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    func = mod["main"]
    func = relay.Function(func.params, relay.nn.softmax(func.body),
                          None, func.type_params, func.attrs)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, 'llvm --system-lib', params=params)

    build_dir = os.path.abspath(opts.out_dir)
    if not os.path.isdir(build_dir):
        os.makedirs(build_dir)

    lib.save(os.path.join(build_dir, 'model.o'))
    with open(os.path.join(build_dir, 'graph.json'), 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open(os.path.join(build_dir, 'params.bin'), 'wb') as f_params:
        f_params.write(relay.save_param_dict(params))
def main():
    dshape = (1, 28, 28)
    net, params = relay.testing.mlp.get_workload(batch_size=dshape[0], dtype='float32')

    # note: the ResNet-18 workload below replaces the MLP workload above
    dshape = (1, 3, 224, 224)
    net, params = relay.testing.resnet.get_workload(
        layers=18, batch_size=dshape[0], image_shape=dshape[1:])

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, 'llvm --system-lib', params=params)

    build_dir = osp.abspath(sys.argv[1])
    if not osp.isdir(build_dir):
        os.makedirs(build_dir, exist_ok=True)

    lib.save(osp.join(build_dir, 'model.o'))
    with open(osp.join(build_dir, 'graph.json'), 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open(osp.join(build_dir, 'params.bin'), 'wb') as f_params:
        f_params.write(relay.save_param_dict(params))
def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape,
                         padding=(1, 1), groups=1, dilation=(1, 1), **attrs):
    x = relay.var("x", shape=dshape, dtype=dtype)
    w = relay.var("w", shape=kshape, dtype=dtype)
    y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation,
                        groups=groups, **attrs)
    func = relay.Function([x, w], y)
    mod = relay.Module()
    mod['main'] = func
    mod = relay.transform.InferType()(mod)

    data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
    kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
    ref_res = topi.testing.conv2d_nchw_python(
        data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, groups=groups)

    with WinogradFallback(), relay.build_config(opt_level=3):
        for target, ctx in ctx_list():
            if target != 'cuda':
                continue
            params = {'w': tvm.nd.array(kernel)}
            graph, lib, params = relay.build_module.build(mod, target=target, params=params)
            module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
            module.set_input('x', tvm.nd.array(data))
            module.set_input(**params)
            module.run()
            op_res1 = module.get_output(0)
            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-3, atol=1e-3)
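# run_test_conv2d_cuda() relies on a WinogradFallback context from its
# enclosing test module. A plausible definition is sketched below, modeled on
# TVM's winograd conv2d tests; the exact fields may differ by TVM version:
from tvm import autotvm
from tvm.autotvm.task.space import FallbackConfigEntity

class WinogradFallback(autotvm.FallbackContext):
    """Force conv2d to pick the winograd template instead of the default."""
    def _query_inside(self, target, workload):
        key = (target, workload)
        if key in self.memory:
            return self.memory[key]
        cfg = FallbackConfigEntity()
        cfg.template_key = 'winograd'  # assumption: template key name
        self.memory[key] = cfg
        return cfg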
def test_cpu():
    mod, params = relay.testing.synthetic.get_workload()
    with relay.build_config(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod, "llvm", params=params)
    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")

    # raw api
    dev = tvm.cpu()
    gmod = compiled_graph_lib["default"](dev)
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    set_input("data", tvm.nd.array(data))
    run()
    out = get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # graph executor wrapper
    gmod = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
    gmod.set_input("data", data)
    gmod.run()
    out = gmod.get_output(0).numpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
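# test_cpu() leans on two helpers from its enclosing module. Plausible
# definitions are sketched below, following TVM's module-based interface
# tests (not necessarily verbatim):
def input_shape(mod):
    # shape of the first argument of main()
    return [int(x) for x in mod["main"].checked_type.arg_types[0].shape]

def verify(data):
    # reference result: build the same workload and run it once
    mod, params = relay.testing.synthetic.get_workload()
    with relay.build_config(opt_level=3):
        lib = relay.build_module.build(mod, "llvm", params=params)
    module = graph_executor.GraphModule(lib["default"](tvm.cpu()))
    module.set_input("data", data)
    module.run()
    return module.get_output(0).numpy()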
def test_model():
    """Test a program which uses the graph runtime."""
    if not tvm.runtime.enabled("micro_dev"):
        print("micro_dev runtime is not enabled, skipping")
        return

    model_path = "super_resolution.onnx"  # assumed to already be on disk
    onnx_model = onnx.load(model_path)

    # preprocess the input image into the Y channel the model expects
    # (same steps as the from_onnx super-resolution tutorial)
    from PIL import Image
    img_path = "cat.png"
    img_y, _, _ = Image.open(img_path).resize((224, 224)).convert("YCbCr").split()
    x = np.array(img_y)[np.newaxis, np.newaxis, :, :].astype("float32")

    input_name = "conv2d_input"
    shape_dict = {input_name: x.shape}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

    with micro.Session(DEV_RISCV):
        ctx = tvm.micro_dev(0)
        disable_vectorize = tvm.target.build_config(disable_vectorize=True)
        # prepared but not entered in the original snippet
        disable_fusion = relay.build_config(disabled_pass={'FuseOps'})
        with disable_vectorize:
            graph, c_mod, params = relay.build(mod, target=TARGET, params=params)

        micro_mod = micro.create_micro_mod(c_mod, DEV_RISCV)
        mod = graph_runtime.create(graph, micro_mod, ctx)
        mod.set_input(**params)
        mod.set_input(input_name, tvm.nd.array(x))
        mod.run()  # missing in the original snippet; required before get_output
        tvm_output = mod.get_output(0).asnumpy()
        print("result is: " + str(tvm_output))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--out-dir', default='.')
    opts = parser.parse_args()

    dshape = (1, 3, 224, 224)
    net, params = relay.testing.resnet.get_workload(
        layers=18, batch_size=dshape[0], image_shape=dshape[1:])

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, 'llvm --system-lib', params=params)

    build_dir = osp.abspath(opts.out_dir)
    if not osp.isdir(build_dir):
        os.makedirs(build_dir, exist_ok=True)

    lib.save(osp.join(build_dir, 'model.bc'))
    with open(osp.join(build_dir, 'graph.json'), 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open(osp.join(build_dir, 'params.bin'), 'wb') as f_params:
        f_params.write(relay.save_param_dict(params))
def tune_and_evaluate(tuning_opt, number, tune=True):
    op, params, data_shape = get_workload(batch_size, image_shape, out_channel,
                                          kernel_size, strides, padding)
    tasks = autotvm.task.extract_from_program(op, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))
    log_file = tuning_opt["log_filename"]
    if tune:
        print("Tuning...")
        tune_kernels(tasks, **tuning_opt)

    if not os.path.exists(log_file):
        raise RuntimeError("the log file {} doesn't exist".format(log_file))

    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(op, target=target, params=params)

        ctx = tvm.device(str(target), 0)
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input("data", data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=number)
        prof_res = np.array(ftimer().results) * 1e3
        print("Time cost is: ", np.mean(prof_res))
def run_inf(mod, img_path, b_size=1):
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod, target=tvm.target.cuda())
        ctx = tvm.context(str(tvm.target.cuda()), 0)
        module = runtime.create(graph, lib, ctx)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(img_path, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=b_size, shuffle=False, num_workers=1, pin_memory=True)

    total = 0
    top1 = 0
    start = time.time()
    for i, (batch, target) in enumerate(val_loader):
        data = batch.cpu().numpy()
        total = i
        module.set_input('input0', data)
        module.set_input(**params)
        module.run()
        prediction = module.get_output(0)
        if np.argmax(prediction.asnumpy()[0]) == target.cpu().numpy()[0]:
            top1 = top1 + 1
        print(top1)
        # if i > 9:  # only run inference on a few samples in this tutorial
        #     break
    end = time.time()

    ftimer = module.module.time_evaluator('run', ctx, number=1, repeat=1000)
    prof_res = np.array(ftimer().results) * 1000
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
    # total holds the last loop index, so the image count is total + 1
    print('total time for {} images: {} (sec)'.format(total + 1, end - start))
    print('total: {} top1: {} accuracy: {}'.format(total + 1, top1,
                                                   top1 / float(total + 1)))
def tune_and_evaluate(tuning_opt):
    print("Extract tasks...")
    global net, params, input_shape
    tasks = autotvm.task.extract_from_program(
        net, target=target, params=params,
        ops=(relay.op.nn.conv2d, relay.op.nn.dense,
             relay.op.nn.bitserial_conv2d, relay.op.nn.bitserial_dense))

    # Run tuning tasks.
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # Compile kernels with history best records.
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=2):
            graph, lib, params = relay.build_module.build(net, target=target, params=params)

        # Upload parameters to device.
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
        module = runtime.create(graph, lib, ctx)
        module.set_input('input_1', data_tvm)
        module.set_input(**params)

        # Evaluate.
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=1)
        prof_res = np.array(ftimer().results) * 1000  # Convert to milliseconds.
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def test_tflite_output_multiplier_greater_than_one():
    # uint8 input
    data_shape = (2, 1, 2, 4)
    data_dtype = 'uint8'
    kernel_shape = (3, 1, 2, 2)
    kernel_dtype = 'uint8'
    ref_func, qnn_func = get_funcs(data_shape=data_shape,
                                   data_dtype=data_dtype,
                                   kernel_shape=kernel_shape,
                                   kernel_dtype=kernel_dtype,
                                   input_zero_point=128,
                                   kernel_zero_point=128,
                                   kernel_size=(2, 2),
                                   padding=(0, 0),
                                   strides=(2, 2),
                                   dilation=(1, 1),
                                   data_layout="NCHW",
                                   kernel_layout="OIHW",
                                   out_dtype="int32")
    golden_data = 128 + np.array((1, 1, 1, 1,
                                  2, 2, 2, 2,
                                  1, 2, 3, 4,
                                  1, 2, 3, 4)).reshape(data_shape).astype('uint8')
    golden_weight = 128 + np.array((1, 2, 3, 4,
                                    -1, 1, -1, 1,
                                    -1, -1, 1, 1)).reshape(kernel_shape)
    golden_weight = golden_weight.astype('uint8')

    with relay.build_config(opt_level=2):
        params = {'kernel': golden_weight}
        graph, lib, params = relay.build(qnn_func, "llvm", params=params)
        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod.set_input("data", golden_data)
        mod.set_input(**params)
        mod.run()
        qnn_output = mod.get_output(0).asnumpy()
    golden_output = np.array((17, 17, 0, 0,
                              2, 2, 16, 36,
                              2, 2, 0, 0)).reshape(2, 3, 1, 2)
    np.testing.assert_equal(qnn_output, golden_output)
def benchmark(network, target):
    net, params, input_shape, output_shape = get_network(network, batch_size=1)

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, target=target, params=params)

    # create runtime
    ctx = tvm.context(str(target), 0)
    module = runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input('data', data_tvm)
    module.set_input(**params)

    # evaluate
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
    prof_res = np.array(ftimer().results) * 1000  # multiply by 1000 to convert to millisecond
    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res),
                                "%.2f ms" % np.std(prof_res)))
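# Illustrative driver for benchmark(), in the style of TVM's GPU benchmark
# scripts; the network list is an assumption, and benchmark() itself expects
# module-level `dtype` and parsed `args` (with a `repeat` field):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--repeat", type=int, default=600)
args = parser.parse_args()
dtype = "float32"

for network in ["resnet-18", "mobilenet", "squeezenet_v1.1"]:
    benchmark(network, tvm.target.cuda())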
def verify_multi_c_mod_export():
    from shutil import which
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        return  # missing in the original snippet; skip must actually return

    for device in ["llvm"]:
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled..." % device)
            return

    resnet18_mod, resnet18_params = relay.testing.resnet.get_workload(num_layers=18)
    with relay.build_config(opt_level=3):
        _, resnet18_cpu_lib, _ = relay.build_module.build(resnet18_mod, "llvm",
                                                          params=resnet18_params)

    A = tvm.placeholder((1024,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    f = tvm.build(s, [A, B], "c", name="myadd")
    engine_module = generate_engine_module()

    from tvm.contrib import util
    temp = util.tempdir()
    file_name = "deploy_lib.so"
    path_lib = temp.relpath(file_name)
    resnet18_cpu_lib.import_module(f)
    resnet18_cpu_lib.import_module(engine_module)
    kwargs = {"options": ["-O2", "-std=c++11",
                          "-I" + header_file_dir_path.relpath("")]}
    resnet18_cpu_lib.export_library(path_lib, fcompile=False, **kwargs)
    loaded_lib = tvm.module.load(path_lib)
    assert loaded_lib.type_key == "library"
    assert loaded_lib.imported_modules[0].type_key == "library"
    assert loaded_lib.imported_modules[1].type_key == "library"
def partition():
    data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
    weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
    bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
    bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
    bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
    bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))

    conv = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3),
                           channels=16, padding=(1, 1))
    bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar)

    func = relay.Function([data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar],
                          bn_output.astuple())
    mod = tvm.IRModule()
    mod["main"] = func
    op_list = ["nn.batch_norm", "nn.conv2d"]
    mod = WhiteListAnnotator(op_list, "test_compiler")(mod)

    opt_pass = tvm.transform.Sequential([
        transform.InferType(),
        transform.PartitionGraph(),
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.AlterOpLayout(),
    ])

    with relay.build_config(opt_level=3):
        mod = opt_pass(mod)

    return mod
def test_tflite_anistropic_strides():
    # uint8 input
    data_shape = (1, 1, 3, 6)
    data_dtype = 'uint8'
    kernel_shape = (1, 1, 2, 2)
    kernel_dtype = 'uint8'
    ref_func, qnn_func = get_funcs(data_shape=data_shape,
                                   data_dtype=data_dtype,
                                   kernel_shape=kernel_shape,
                                   kernel_dtype=kernel_dtype,
                                   input_zero_point=127,
                                   kernel_zero_point=127,
                                   kernel_size=(2, 2),
                                   padding=(0, 0),
                                   strides=(1, 3),
                                   dilation=(1, 1),
                                   data_layout="NCHW",
                                   kernel_layout="OIHW",
                                   out_dtype="int32")
    golden_data = np.array((133, 131, 129, 125, 123, 121,
                            135, 133, 131, 123, 121, 119,
                            137, 135, 133, 121, 119, 117)).reshape(data_shape)
    golden_data = golden_data.astype('uint8')
    golden_weight = np.array((129, 131, 133, 135)).reshape(kernel_shape)
    golden_weight = golden_weight.astype('uint8')

    with relay.build_config(opt_level=2):
        params = {'kernel': golden_weight}
        graph, lib, params = relay.build(qnn_func, "llvm", params=params)
        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod.set_input("data", golden_data)
        mod.set_input(**params)
        mod.run()
        qnn_output = mod.get_output(0).asnumpy()
    golden_output = np.array((124, -92, 164, -132)).reshape(1, 1, 2, 2)
    np.testing.assert_equal(qnn_output, golden_output)
def test_dynamic_dequantize():
    x = relay.var("x", shape=(1, 2, 3, 4), dtype="int8")
    scale_var = relay.var("scale", shape=(), dtype="float32")
    zp_var = relay.var("zp", shape=(), dtype="int32")

    deq_x = relay.qnn.op.dequantize(x, scale_var * scale_var, zp_var + zp_var)
    tt = run_infer_type(deq_x)
    assert tt.checked_type == relay.TensorType((1, 2, 3, 4), "float32")

    func = relay.Function([x, scale_var, zp_var], deq_x)
    data = np.random.uniform(size=(1, 2, 3, 4)).astype("int8")
    scale = np.array(1).astype("float32")
    zp = np.array(0).astype("int32")

    mod = tvm.ir.IRModule.from_expr(func)

    for target, dev in tvm.testing.enabled_targets():
        # TODO: (electriclilies) enable AlterOpLayout when it is fixed
        with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
            lib = relay.build(mod, target=target)

        module = graph_runtime.GraphModule(lib["default"](dev))
        module.set_input(**{"x": data, "scale": scale, "zp": zp})
        module.run()
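# The test above only checks that the module builds and runs. A possible
# numeric check (a sketch): qnn.dequantize computes (x - zero_point) * scale,
# so with the doubled operands in the graph above the NumPy reference is
# (data - 2*zp) * scale**2.
def dequantize_ref(data, scale, zp):
    return (data.astype("float32") - 2 * zp) * (scale ** 2)

# Inside the target loop one could then assert:
#     np.testing.assert_allclose(module.get_output(0).asnumpy(),
#                                dequantize_ref(data, scale, zp), rtol=1e-5)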
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # compile kernels with graph-level best records
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod, target=target, params=params)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input(input_name, data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(net, target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.device(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def test_broadcast_layout():
    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
        # Test broadcast support for NHWC layout.
        data_shape = (1, 229, 229, 3)  # NHWC
        data_dtype = 'uint8'
        kernel_shape = (7, 7, 3, 64)  # HWIO
        kernel_dtype = 'int8'
        _, qnn_func = get_funcs(data_shape=data_shape,
                                data_dtype=data_dtype,
                                kernel_shape=kernel_shape,
                                kernel_dtype=kernel_dtype,
                                input_zero_point=8,
                                kernel_zero_point=3,
                                input_scale=1.0,
                                kernel_scale=1.0,
                                kernel_size=(7, 7),
                                padding=(1, 1),
                                strides=(1, 1),
                                dilation=(1, 1),
                                data_layout="NHWC",
                                kernel_layout="HWIO",
                                out_dtype="int32")
        func = qnn_func['main'].body
        bias = relay.var("bias", shape=(64,), dtype="int32")
        bias2 = relay.var("bias2", shape=(1, 225, 225, 1), dtype="int32")

        # Check broadcast support on both lhs and rhs
        func = relay.add(func, bias2)
        func = relay.add(bias2, func)
        func = relay.add(bias, func)
        func = relay.add(func, bias)
        func = relay.Function(relay.analysis.free_vars(func), func)
        mod = tvm.IRModule.from_expr(func)
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512")
def test_tflite_large_irregular():
    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
        # uint8 input
        data_shape = (1, 1024, 1, 1)
        data_dtype = 'uint8'
        kernel_shape = (1001, 1024, 1, 1)
        kernel_dtype = 'uint8'
        ref_func, qnn_func = get_funcs(data_shape=data_shape,
                                       data_dtype=data_dtype,
                                       kernel_shape=kernel_shape,
                                       kernel_dtype=kernel_dtype,
                                       input_zero_point=127,
                                       kernel_zero_point=127,
                                       input_scale=1.0,
                                       kernel_scale=1.0,
                                       kernel_size=(1, 1),
                                       padding=(0, 0),
                                       strides=(1, 1),
                                       dilation=(1, 1),
                                       data_layout="NCHW",
                                       kernel_layout="OIHW",
                                       out_dtype="int32")
        golden_data = np.full(data_shape, 127).astype('uint8')
        golden_weight = np.full(kernel_shape, 127).astype('uint8')

        with relay.build_config(opt_level=2):
            params = {'kernel': golden_weight}
            graph, lib, params = relay.build(qnn_func, "llvm", params=params)
            mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
            mod.set_input("data", golden_data)
            mod.set_input(**params)
            mod.run()
            qnn_output = mod.get_output(0).asnumpy()
        golden_output = np.full((1, 1001, 1, 1), 0).astype('uint8')
        np.testing.assert_equal(qnn_output, golden_output)
def quantize_test_driver(in_dtype, quant_args, out_dtype, in_data, verify_output_data):
    shape = in_data.shape
    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
    output_zero_point = quant_args['out_zero_point']
    output_scale = quant_args['out_scale']
    quantized_output = relay.qnn.op.quantize(input_data,
                                             output_scale=output_scale,
                                             output_zero_point=output_zero_point,
                                             out_dtype=out_dtype)
    mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output)
    mod = relay.Module.from_expr(mod)
    mod = relay.qnn.transform.CanonicalizeOps()(mod)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod, "llvm", params=None)
        rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        rt_mod.set_input(input_data=in_data)
        rt_mod.set_input(**params)
        rt_mod.run()
        res = rt_mod.get_output(0).asnumpy()
        np.testing.assert_equal(res, verify_output_data)
        assert res.dtype == out_dtype
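# Example invocation (a sketch mirroring the qnn quantize tests): with scale
# 0.5 and zero point 127, quantize maps x to round(x / 0.5) + 127, clamped to
# the uint8 range, so -63.5 -> 0 and 64 -> 255.
def test_float32_to_uint8():
    data = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]) \
        .astype('float32').reshape((2, 5))
    output = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]) \
        .astype('uint8').reshape((2, 5))
    quant_args = {"out_zero_point": 127, "out_scale": 0.5}
    quantize_test_driver(in_dtype='float32', quant_args=quant_args,
                         out_dtype='uint8', in_data=data,
                         verify_output_data=output)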
def tune_and_evaluate(tuning_opt):
    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET, tracker_host,
                                                tracker_port, timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d,),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter,
            hkernel, wkernel, hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
tuning_opt = {  # the snippet begins mid-dict; tune_tasks() below names it tuning_opt
    'n_trial': 1e9,
    'early_stopping': None,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func),
        runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port,
                                 number=4, min_repeat_ms=150,
                                 repeat=opt.measurements, timeout=60,
                                 check_correctness=True))
}

tune_tasks(tasks, **tuning_opt)

# Compile kernels with history best records
with autotvm.tophub.context(target, extra_files=[opt.log_filename]):
    # Compile network
    print("Compiling network with best tuning parameters...")
    with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
        if target.device_name != "vta":
            graph, lib, params = relay.build(relay_prog,
                                             target=target,
                                             params=params,
                                             target_host=env.target_host)
        else:
            with vta.build_config():
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)

    # Export library
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")
def compile(target):
    net, params = relay.frontend.from_mxnet(block, {"data": dshape})
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, target, params=params)
    return graph, lib, params
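# Minimal surrounding setup this helper assumes (illustrative; in the TVM
# MXNet tutorials `block` is a pretrained Gluon model and `dshape` the NCHW
# input shape). Note the helper shadows the built-in compile():
from mxnet.gluon.model_zoo.vision import get_model
block = get_model('resnet18_v1', pretrained=True)
dshape = (1, 3, 224, 224)
graph, lib, params = compile('llvm')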
def compile(info):
    if info['model_path'].endswith('.onnx'):
        is_onnx = True
    elif info['model_path'].endswith('.pb'):
        is_onnx = False
    else:
        raise Exception('Model file format not supported')

    # Load model
    if is_onnx:
        onnx_model = onnx.load(info['model_path'])
        mod, params = relay.frontend.from_onnx(onnx_model, info['input_dict'])
        optimization_level = 3
    else:
        with tf.compat.v1.Session() as sess:
            with tf.io.gfile.GFile(info['model_path'], 'rb') as f:
                graph_def = tf.compat.v1.GraphDef()
                graph_def.ParseFromString(f.read())
                tf.import_graph_def(graph_def, name='')
                graph_def = sess.graph.as_graph_def()
                graph_def = tf_testing.ProcessGraphDefParam(graph_def)

        input_shape_dict = {'DecodeJpeg/contents': info['input_list']}
        mod, params = relay.frontend.from_tensorflow(graph_def,
                                                     shape=input_shape_dict,
                                                     outputs=info['output_names'])
        optimization_level = 2

    # Set compilation params
    target = 'llvm'
    if info['cross_compile']:
        target += ' -target=aarch64-linux-gnu'

    # Compile model
    # Note opt_level cannot be higher than 2 because of a bug:
    # https://discuss.tvm.ai/t/tvm-0-6-1-compile-yolo-v2-tiny-fail-worked-in-v0-5-2/7244
    with relay.build_config(opt_level=optimization_level):
        graph, lib, params = relay.build(mod, target=target, params=params)

    # Write the compiled model to files
    output_model_path = path.join(info['output_path'], OUTPUT_NETWORK_MODULE_FILENAME)
    output_graph_path = path.join(info['output_path'], OUTPUT_NETWORK_GRAPH_FILENAME)
    output_param_path = path.join(info['output_path'], OUTPUT_NETWORK_PARAM_FILENAME)

    print('Writing library to', output_model_path)
    if info['cross_compile']:
        lib.export_library(
            output_model_path,
            cc.build_create_shared_func(options=['--target=aarch64-linux-gnu',
                                                 '-march=armv8-a',
                                                 '-mfpu=NEON'],
                                        compile_cmd='/usr/bin/clang'))
    else:
        lib.export_library(output_model_path)

    print('Writing graph to', output_graph_path)
    with open(output_graph_path, 'w') as graph_file:
        graph_file.write(graph)

    print('Writing weights to', output_param_path)
    with open(output_param_path, 'wb') as param_file:
        param_file.write(relay.save_param_dict(params))
def export_tvm(path, block, data_shape, epoch=0, preprocess=True,
               layout='HWC', ctx=mx.cpu(), target='llvm', opt_level=3,
               use_autotvm=False):
    """Helper function to export a HybridBlock to TVM executable.
    Note that the tvm package needs to be installed (https://tvm.ai/).

    Parameters
    ----------
    path : str
        Path to save model.
        Three files path_deploy_lib.tar, path_deploy_graph.json and path_deploy_xxxx.params
        will be created, where xxxx is the 4 digits epoch number.
    block : mxnet.gluon.HybridBlock
        The hybridizable block. Note that normal gluon.Block is not supported.
    data_shape : tuple of int, required
        Unlike `export_block`, `data_shape` is required here for the purpose of optimization.
        If dynamic shape is required, you can use the shape that most fits the inference tasks,
        but the optimization won't accommodate all situations.
    epoch : int
        Epoch number of saved model.
    preprocess : mxnet.gluon.HybridBlock, default is True.
        Preprocess block prior to the network.
        By default (True), it will subtract mean [123.675, 116.28, 103.53],
        divide std [58.395, 57.12, 57.375], and convert original image
        (B, H, W, C and range [0, 255]) to tensor (B, C, H, W) as network input.
        This is the default preprocess behavior of all GluonCV pre-trained models.
        You can use a custom pre-process hybrid block or disable it by setting
        ``preprocess=None``.
    layout : str, default is 'HWC'
        The layout for raw input data. By default is HWC. Supports 'HWC' and 'CHW'.
        Note that image channel order is always RGB.
    ctx : mx.Context, default mx.cpu()
        Network context.
    target : str, default is 'llvm'
        Runtime type for code generation, can be ('llvm', 'cuda', 'opencl', 'metal'...)
    opt_level : int, default is 3
        TVM optimization level. If supported, higher `opt_level` may generate a more
        efficient runtime library; however, some operators may not support high-level
        optimization, which will fall back to a lower `opt_level`.
    use_autotvm : bool, default is False
        Use autotvm for performance tuning. Note that this can take a very long time,
        since it's a search- and model-based tuning process.

    Returns
    -------
    None
    """
    try:
        import tvm
        from tvm import autotvm
        from tvm import relay
        from tvm.relay import testing
        # GATuner and GridSearchTuner were used below but missing from the
        # original import
        from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
        import tvm.contrib.graph_runtime as runtime
    except ImportError:
        print("TVM package required, please refer https://tvm.ai/ for installation guide.")
        raise

    # add preprocess block if necessary
    if preprocess:
        # add preprocess block
        if preprocess is True:
            preprocess = _DefaultPreprocess()
        else:
            if not isinstance(preprocess, HybridBlock):
                raise TypeError("preprocess must be HybridBlock, given {}".format(
                    type(preprocess)))
        wrapper_block = nn.HybridSequential()
        preprocess.initialize(ctx=ctx)
        wrapper_block.add(preprocess)
        wrapper_block.add(block)
    else:
        wrapper_block = block
    wrapper_block.collect_params().reset_ctx(ctx)

    # convert to relay graph
    sym, params = relay.frontend.from_mxnet(wrapper_block, shape={"data": data_shape})

    if use_autotvm:
        def tune_kernels(tasks,
                         measure_option,
                         tuner='gridsearch',
                         early_stopping=None,
                         log_filename='tuning.log'):
            for i, tsk in enumerate(tasks):
                prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

                # converting conv2d tasks to conv2d_NCHWc tasks
                op_name = tsk.workload[0]
                if op_name == 'conv2d':
                    func_create = 'topi_x86_conv2d_NCHWc'
                elif op_name == 'depthwise_conv2d_nchw':
                    func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
                else:
                    raise ValueError("Tuning {} is not supported on x86".format(op_name))

                task = autotvm.task.create(func_create, args=tsk.args,
                                           target=target, template_key='direct')
                task.workload = tsk.workload

                # create tuner
                if tuner in ('xgb', 'xgb-rank'):
                    tuner_obj = XGBTuner(task, loss_type='rank')
                elif tuner == 'ga':
                    tuner_obj = GATuner(task, pop_size=50)
                elif tuner == 'random':
                    tuner_obj = RandomTuner(task)
                elif tuner == 'gridsearch':
                    tuner_obj = GridSearchTuner(task)
                else:
                    raise ValueError("Invalid tuner: " + tuner)

                # do tuning
                n_trial = len(task.config_space)
                tuner_obj.tune(n_trial=n_trial,
                               early_stopping=early_stopping,
                               measure_option=measure_option,
                               callbacks=[
                                   autotvm.callback.progress_bar(n_trial, prefix=prefix),
                                   autotvm.callback.log_to_file(log_filename)])

        tasks = autotvm.task.extract_from_program(sym, target=target,
                                                  params=params,
                                                  ops=(relay.op.nn.conv2d,))
        logging.warning('Start tuning, this can be slow...')
        tuning_option = {
            'log_filename': 'tune.log',
            'tuner': 'random',
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
            ),
        }
        tune_kernels(tasks, **tuning_option)

        # the original referenced an undefined `log_file`; use the tuning log
        with autotvm.apply_history_best(tuning_option['log_filename']):
            with relay.build_config(opt_level=opt_level):
                graph, lib, params = relay.build_module.build(sym, target=target,
                                                              params=params)
    else:
        with relay.build_config(opt_level=opt_level):
            graph, lib, params = relay.build_module.build(sym, target, params=params)

    # export library, json graph and parameters
    lib.export_library(path + '_deploy_lib.so')
    with open(path + '_deploy_graph.json', 'w') as fo:
        fo.write(graph)
    with open(path + '_deploy_{:04n}.params'.format(epoch), 'wb') as fo:
        try:
            fo.write(relay.compiler.save_param_dict(params))
        except AttributeError:
            fo.write(relay.save_param_dict(params))
def test_alter_layout_strided_slice():
    """Test rewriting strided_slice during alter_op_layout"""
    def before():
        x = relay.var("x", shape=(1, 32, 28, 28))
        weight = relay.var('weight', shape=(32, 32, 3, 3))
        y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
        y = relay.strided_slice(y,
                                begin=relay.const([0, 16], "int32"),
                                end=relay.const([1, 33], "int32"),
                                strides=relay.const([1, 1], "int32"))
        y = relay.Function(analysis.free_vars(y), y)
        return y

    def alter_conv2d(attrs, inputs, tinfos, out_type):
        data, weight = inputs
        new_attrs = dict(attrs)
        new_attrs['data_layout'] = 'NCHW4c'
        return relay.nn.conv2d(data, weight, **new_attrs)

    def expected():
        x = relay.var("x", shape=(1, 32, 28, 28))
        weight = relay.var("weight", shape=(32, 32, 3, 3))
        weight = relay.layout_transform(weight, "OIHW", "OIHW4i4o")
        x = relay.layout_transform(x, "NCHW", "NCHW4c")
        y = relay.op.nn.contrib_conv2d_nchwc(x, weight, channels=32,
                                             kernel_size=(3, 3), padding=(1, 1),
                                             data_layout="NCHW4c")
        y = relay.strided_slice(y,
                                begin=relay.const([0, 4], "int32"),
                                end=relay.const([1, 21], "int32"),
                                strides=relay.const([1, 1], "int32"))
        y = relay.layout_transform(y, "NCHW4c", "NCHW")
        y = relay.Function(analysis.free_vars(y), y)
        return y

    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
        a = before()
        b = run_opt_pass(expected(), transform.InferType())

    # Verify inference result
    mod_before = tvm.IRModule()
    mod_new = tvm.IRModule()
    mod_before['main'] = a
    mod_new['main'] = b
    with relay.build_config(opt_level=3):
        for target, ctx in ctx_list():
            for kind in ["graph", "debug", "vm"]:
                ex_before = relay.create_executor(kind, mod=mod_before, ctx=ctx, target=target)
                ex_new = relay.create_executor(kind, mod=mod_new, ctx=ctx, target=target)
                np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32")
                np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32")
                result_before = ex_before.evaluate()(np_data, np_weight)
                result_new = ex_new.evaluate()(np_data, np_weight)
                tvm.testing.assert_allclose(result_before.asnumpy(),
                                            result_new.asnumpy(),
                                            rtol=1e-5, atol=1e-5)
img_path = download_testdata(img_url, 'cat.png', module='data')
img = Image.open(img_path).resize((224, 224))
img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
img_y, img_cb, img_cr = img_ycbcr.split()
x = np.array(img_y)[np.newaxis, np.newaxis, :, :]

######################################################################
# Compile the model with relay
# ---------------------------------------------
target = 'llvm'

input_name = '1'
shape_dict = {input_name: x.shape}
sym, params = relay.frontend.from_onnx(onnx_model, shape_dict)

with relay.build_config(opt_level=1):
    intrp = relay.build_module.create_executor('graph', sym, tvm.cpu(0), target)

######################################################################
# Execute on TVM
# ---------------------------------------------
dtype = 'float32'
tvm_output = intrp.evaluate(sym)(tvm.nd.array(x.astype(dtype)), **params).asnumpy()

######################################################################
# Display results
# ---------------------------------------------
# We put input and output image neck to neck
from matplotlib import pyplot as plt
out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode='L')
out_cb = img_cb.resize(out_y.size, Image.BICUBIC)
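######################################################################
# The snippet above stops mid-display; the from_onnx tutorial it follows
# finishes roughly like this (a sketch of the remaining display code):
out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
result = Image.merge('YCbCr', [out_y, out_cb, out_cr]).convert('RGB')
canvas = np.full((672, 672 * 2, 3), 255)
canvas[0:224, 0:224, :] = np.asarray(img)
canvas[:, 672:, :] = np.asarray(result)
plt.imshow(canvas.astype(np.uint8))
plt.show()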
def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
                   shape=None, dtype=None, in_range=None, values=None,
                   exclude_targets=None, only_targets=None,
                   additional_params=None,
                   numerical_grads=None, numerical_grads_params=None,
                   atol=1e-5, rtol=1e-5, quiet=False):
    """Compute the function and/or its gradients on a random input and raise
    an exception if the result doesn't match the reference implementation.

    Parameters
    ----------
    symbol : nnvm.Symbol
        A symbol representing the output.

    forward : Callable[..., List[numpy.ndarray]], optional
        A reference implementation to compare with.

    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
        A reference implementation of gradients. Should also accept head_grads besides
        normal inputs which is a list of gradients of some scalar wrt the outputs or just a
        single gradient if there are multiple outputs.
        Should return either a dict mapping input variable names to the respective
        gradients or a list of gradients wrt variables from grad_input_vars in
        exactly the same order (in alphabetical order by default).

    grad_input_vars : List[nnvm.Symbol or str], optional
        A list of variables with respect to which the gradients will be computed.
        None (default) means that all input variables will be used in an alphabetical order.

    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
        A dict mapping input variable names to shapes, or just a single shape.
        By default shapes will be inferred from variables' attributes (see the Examples).
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[nnvm.Symbol or str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes (see the Examples).
        If dtypes cannot be inferred for some variables then float32 will be used
        as a fallback. Note that this parameter takes precedence over variables' attributes.

    in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional
        A dict mapping input variable names to ranges or just a single range
        (the same for all variables). Input values will be generated from
        uniform distributions on these ranges. `head_grads` can also be
        assigned a range this way.

    values : Dict[nnvm.Symbol or str, numpy.ndarray], optional
        A dict explicitly providing values for some variables instead of random generation.

    exclude_targets : Set[str], optional
        Skip compiling and running anything for these targets.

    only_targets : Set[str], optional
        Test only for those targets from `ctx_list()` that are also in this set.

    additional_params : dict, optional
        A dict of additional parameters which will be passed to forward and backward.

    numerical_grads : bool or 'if_possible', optional
        Whether to additionally check against numerically computed gradients. If 'if_possible'
        or None is passed (which is the default) then it will try to create a gradient
        computation graph and then check gradients numerically only if this graph can be
        created (i.e. if there are some operations with unimplemented gradients, it will
        just issue a warning). Checking against numerical gradients is done via the
        `check_numerical_grads` function.

    numerical_grads_params : dict, optional
        Additional parameters for `check_numerical_grads`.

    atol : float, optional
        Absolute tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.

    rtol : float, optional
        Relative tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.
    quiet : bool, optional
        Don't dump additional information to stdout on failure.

    Examples
    --------
    .. code-block:: python

        x = sym.Variable("x", shape=(1, 2))
        y = sym.Variable("y", shape=(1, 2))

        # check the function and its gradients both numerically and using a reference function
        check_function(x + 2*y,
                       lambda x, y: x + 2*y,
                       lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads})

        # just check gradients numerically
        check_function(x + 2*y, numerical_grads=True)

        # just check the forward computation
        check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False)

        # specifying dtype
        check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64')

        # dtypes can also be specified during variable creation with dtype codes
        x = sym.Variable("x", dtype=0)
        check_function(x + 1, shape=(2, 2), numerical_grads=True)
    """
    # validate and preprocess the input params
    if numerical_grads is None and forward is None and backward is None:
        raise ValueError("No reference function was passed to check_function. If you only want to "
                         "check gradients numerically, pass numerical_grads=True explicitly.")

    if numerical_grads is None:
        numerical_grads = 'if_possible'

    if numerical_grads not in [False, True, 'if_possible']:
        raise ValueError("numerical_grads must be a bool or 'if_possible', not {}"
                         .format(numerical_grads))

    if additional_params is None:
        additional_params = {}

    input_vars = symbol.list_input_variables()
    input_dict = {x.attr('name'): x for x in input_vars}

    if grad_input_vars is None:
        grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name'))
    else:
        grad_input_vars = [input_dict[x] if isinstance(x, str) else x
                           for x in grad_input_vars]

    in_range = _dict_var_to_dict_str(in_range)
    values = _dict_var_to_dict_str(values)

    out_len = len(symbol.list_output_names())

    # Infer the output shapes and dtypes, and preprocess the shape and dtype params
    forward_graph, shape, dtype, out_shapes, out_dtypes = \
        infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype,
                            fallback_dtype='float32')

    if not all(out_shapes) or not all(out_dtypes):
        if not quiet:
            print(forward_graph.ir(join_node_attrs=['shape', 'dtype']))
        raise ValueError("Could not infer shapes or dtypes for outputs.\n"
                         "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes))

    backward_graph = None

    # If we want gradients, we have to recreate the graph, but now with gradient computations
    # Note that here we need out_shapes for defining the shape of head grads, so we have to
    # create the graph twice
    if backward is not None or numerical_grads:
        try:
            head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i),
                                                       shape=out_shapes[i],
                                                       dtype=DTYPE_TO_TCODE[out_dtypes[i]])
                                  for i in range(out_len)]
            grad_symbols = graph_util.gradients([symbol], grad_input_vars,
                                                grad_ys=head_grads_symbols)
            # Sometimes grads do not depend on head_grads, so head_grads does not appear
            # in the variable list; adding it manually prevents this, making things a bit easier
            backward_graph = \
                nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols +
                                                    head_grads_symbols))
            backward_graph, shape, dtype, out_shapes, out_dtypes = \
                infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype,
                                    fallback_dtype='float32')
        except nnvm._base.NNVMError as err:
            if backward is None and numerical_grads == "if_possible":
                logging.warning("Won't check gradients because: %s",
                                str(err).split('\n', 1)[0])
                numerical_grads = False
                backward_graph = None
            else:
                raise

    main_graph = backward_graph if backward_graph is not None else forward_graph

    # Generate random data for inputs (including head_grads)
    np_inputs = {}
    for x in main_graph.symbol.list_input_variables():
        x_name = x.attr('name')
        x_shape = shape[x_name]
        x_dtype = dtype[x_name]
        if values is not None and x_name in values:
            np_inputs[x_name] = values[x_name].astype(x_dtype)
            continue
        low = -1.0
        high = 1.0
        if in_range is not None:
            if isinstance(in_range, dict):
                if x_name in in_range:
                    low = in_range[x_name][0]
                    high = in_range[x_name][1]
            else:
                low = in_range[0]
                high = in_range[1]
        np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype)

    np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs
                                    if not k.startswith('head_grads_')}

    nothing_was_done = True

    # Compute and compare the results
    for target, ctx in ctx_list():
        if exclude_targets is not None:
            if target in exclude_targets or str(target) in exclude_targets:
                logging.info("Skipping target = %s, ctx = %s", target, ctx)
                continue
        if only_targets is not None:
            if target not in only_targets and str(target) not in only_targets:
                logging.info("Skipping target = %s, ctx = %s", target, ctx)
                continue

        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)

        debug_stage = None

        try:
            nnvm_res = None

            debug_stage = "compiling"
            main_function = graph_to_function(main_graph, target, ctx)

            # nnvm_res contains the output and gradients (if they are needed)
            debug_stage = "running"
            nnvm_res = main_function(**np_inputs)

            try:
                logging.debug("checking to_relay conversion")
                inputs = np_inputs_without_head_grads.copy()
                func, inputs = to_relay(main_graph, shape, dtype, params=inputs)
                with relay.build_config(opt_level=3):
                    graph, lib, params = relay.build(func, target=target)
                    m = graph_runtime.create(graph, lib, ctx)
                    m.set_input(**inputs)
                    m.set_input(**params)
                    m.run()
                    for i in range(out_len):
                        relay_out = m.get_output(i).asnumpy()
                        tvm.testing.assert_allclose(nnvm_res[i], relay_out,
                                                    atol=atol, rtol=rtol)
            except NotImplementedError as err:
                # the NNVM operator is not supported yet
                logging.warning(err)

            if backward_graph is not None:
                grad_var_names = [x.attr('name') for x in grad_input_vars]
                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}

            if forward is not None:
                nothing_was_done = False
                debug_stage = "checking forward computation"
                logging.debug(debug_stage)

                params = {}
                params.update(np_inputs_without_head_grads)
                params.update(additional_params)
                numpy_res = forward(**params)

                if isinstance(numpy_res, tuple):
                    numpy_res = list(numpy_res)

                if not isinstance(numpy_res, list):
                    numpy_res = [numpy_res]

                if len(numpy_res) != out_len:
                    raise ValueError("Forward function returned {} values, but "
                                     "the nnvm graph returns {} values"
                                     .format(len(numpy_res), out_len))

                for i in range(out_len):
                    tvm.testing.assert_allclose(nnvm_res[i], numpy_res[i],
                                                atol=atol, rtol=rtol)

            if backward is not None:
                nothing_was_done = False
                debug_stage = "checking gradients"
                logging.debug(debug_stage)

                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]

                if out_len == 1:
                    np_head_grads = np_head_grads[0]

                params = {'head_grads': np_head_grads}
                params.update(np_inputs_without_head_grads)
                params.update(additional_params)

                numpy_grads = backward(**params)

                if not isinstance(numpy_grads, dict):
                    if isinstance(numpy_grads, tuple):
                        numpy_grads = list(numpy_grads)
                    if not isinstance(numpy_grads, list):
                        numpy_grads = [numpy_grads]
                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
                    if len(numpy_grads) != len(grad_var_names):
                        raise ValueError("The backward function returns a list of gradients which "
                                         "does not contain gradients for these variables: {}"
                                         .format(set(grad_var_names) - set(numpy_grads)))
                                         .format(set(grad_var_names) - set(numpy_grads)))

                for x_name in numpy_grads:
                    tvm.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
                                                atol=atol, rtol=rtol)

            if numerical_grads:
                nothing_was_done = False
                debug_stage = "checking gradients numerically"
                logging.debug(debug_stage)

                forward_function = graph_to_function(forward_graph, target, ctx)

                # Since the result may be non-scalar, we have to put another operation on top,
                # so we just multiply by the randomly generated head_grads and then sum everything.
                # This way we can reuse the gradient values which have already been computed.
                def scalar_function(**kwargs):
                    res = forward_function(**kwargs)
                    return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel())
                                   for i in range(out_len)])

                if numerical_grads_params is None:
                    numerical_grads_params = {}

                check_numerical_grads(
                    scalar_function,
                    input_values=np_inputs_without_head_grads,
                    grad_values=nnvm_grads,
                    **numerical_grads_params)

        except:
            if not quiet:
                print("\ncheck_function failed while {}, here is the main graph"
                      .format(debug_stage))
                print(main_graph.ir(join_node_attrs=['shape', 'dtype']))
                if nnvm_res is not None:
                    print("Generated inputs:")
                    print(np_inputs)
                    print()
            raise

    if nothing_was_done:
        logging.warning("Nothing was done in check_function. Check ctx_list().")
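# The head-grads reduction used in scalar_function above deserves a tiny
# standalone illustration. The sketch below (all names are illustrative, not
# part of check_function) shows that for s(x) = h . f(x), the analytic
# gradient h^T (df/dx) matches a central-difference estimate, which is
# exactly the property check_numerical_grads relies on.
import numpy as np

f = lambda x: np.stack([x[0] * x[1], x[0] + x[1]])  # vector-valued "forward"
h = np.array([0.5, 2.0])                            # plays the role of head_grads
x = np.array([3.0, 4.0])

# scalarized function, as in scalar_function above
s = lambda x: np.dot(h, f(x))

# analytic gradient of s: h[0]*d(x0*x1)/dx + h[1]*d(x0+x1)/dx
analytic = np.array([h[0] * x[1] + h[1], h[0] * x[0] + h[1]])

# central-difference estimate, mirroring the numerical check
eps = 1e-6
numeric = np.array([(s(x + eps * e) - s(x - eps * e)) / (2 * eps)
                    for e in np.eye(2)])
assert np.allclose(analytic, numeric)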
def compile_model(self):
    if device == 'vta':
        self.remote = rpc.connect(self.pynq_addr, 9091)
        vta.reconfig_runtime(self.remote)
        vta.program_fpga(self.remote, bitstream=None)
    else:
        self.remote = rpc.LocalSession()

    self.ctx = self.remote.ext_dev(0) if device == 'vta' else self.remote.cpu(0)

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (env.BATCH, 3, 224, 224)}

        gluon_model = vision.resnet18_v1(
            pretrained=True, ctx=ctx
        ).features if args.nonsplit else splitnet.resnet18_v1_split(self.id + 1)

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target
        if target.device_name == 'vta':
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name != 'vta':
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        self.params = params

        # Measure Relay build time
        build_time = time.time() - build_start
        print(f'inference graph for thread {self.id} built in {build_time:.4f}s!')

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        self.remote.upload(temp.relpath('graphlib.o'))
        lib = self.remote.load_module('graphlib.o')

        # Graph runtime
        self.m = graph_runtime.create(graph, lib, self.ctx)
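# For completeness, a minimal sketch of how the compiled module might be
# driven afterwards. This `run` helper and its `image` argument are
# hypothetical additions, not part of the original class.
def run(self, image):
    # Feed a preprocessed NCHW batch and execute on the remote context
    self.m.set_input('data', image)
    self.m.set_input(**self.params)
    self.m.run()
    # Return the first output as a numpy array
    return self.m.get_output(0).asnumpy()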
                             debug_unit=args.debug_unit)
# print(func.astext())

###############################################################################
# Tuning
# ------
tuning_enable = args.tuning_enable

# log_filename = "./mixed_precision_models/tuning_logs/resnet%d_%s_%s_batch_%d.log" % (num_layers, data_layout, model_type, batch_size)
log_filename = "./mixed_precision_models/tuning_logs/resnet%d_%s_mixed_batch_%d.log" % (
    num_layers, data_layout, batch_size)
tmp_log_file = log_filename + '.temp'

if tuning_enable:
    print("Extracting tasks ...")
    with relay.build_config(opt_level=3):
        tasks = autotvm.task.extract_from_program(func, target=TARGET_NAME, params=params)
    print(tasks)

    measure_option = autotvm.measure_option(
        builder='local',
        runner=autotvm.LocalRunner(number=20, repeat=3, min_repeat_ms=150)
        # runner=autotvm.RPCRunner(
        #     'T4',  # change the device key to your key
        #     '0.0.0.0', 9190,
        #     number=20, repeat=3, min_repeat_ms=150),
    )
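    # The snippet above stops after constructing measure_option. Below is a
    # sketch of the per-task tuning loop that would typically follow, using
    # the standard AutoTVM XGBTuner; the trial count is illustrative, and
    # tmp_log_file / log_filename are the paths defined above.
    for i, task in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
        n_trial = min(1000, len(task.config_space))
        tuner.tune(n_trial=n_trial,
                   early_stopping=None,
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix),
                              autotvm.callback.log_to_file(tmp_log_file)])

    # keep only the best record per workload
    autotvm.record.pick_best(tmp_log_file, log_filename)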
# for this graph on the target hardware, and the parameter blobs of
# the model. During the compilation, Relay does the graph-level
# optimization while TVM does the tensor-level optimization, resulting
# in an optimized runtime module for model serving.
#
# We'll first compile for an Nvidia GPU. Behind the scenes, `relay.build_module.build`
# first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
# then registers the operators (i.e. the nodes of the optimized graphs) to
# TVM implementations to generate a `tvm.module`.
# To generate the module library, TVM will first lower the high-level IR
# into the intrinsic IR of the specified target backend, which is CUDA
# in this example. Machine code is then generated as the module library.

opt_level = 3
target = tvm.target.cuda()
with relay.build_config(opt_level=opt_level):
    graph, lib, params = relay.build_module.build(mod, target, params=params)

#####################################################################
# Run the generated library
# -------------------------
# Now we can create a graph runtime and run the module on the Nvidia GPU.

# create random input
ctx = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.create(graph, lib, ctx)
# set input and parameters
module.set_input("data", data)
module.set_input(**params)
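# The snippet above sets the inputs but stops short of executing. A minimal
# continuation that runs the module and fetches the result; out_shape is
# assumed to be the output shape defined earlier in the tutorial.
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()
# print first ten elements of the output
print(out.flatten()[0:10])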
def run_unpropagatable_graph(dev, tgt):
    R""" The network is as follows:
            a     b  c     d
             \   /    \   /
              add      mul
                \      /
                subtract
    """
    a = relay.var("a", shape=(10, 10))
    b = relay.var("b", shape=(10, 10))
    c = relay.var("c", shape=(10, 10))
    d = relay.var("d", shape=(10, 10))
    a_data = np.random.rand(10, 10).astype('float32')
    b_data = np.random.rand(10, 10).astype('float32')
    c_data = np.random.rand(10, 10).astype('float32')
    d_data = np.random.rand(10, 10).astype('float32')
    tmp_add = a_data + b_data
    tmp_mul = np.multiply(c_data, d_data)
    ref_res = np.subtract(tmp_add, tmp_mul)

    fallback_device = tvm.context("cpu")
    target = {"cpu": "llvm", dev: tgt}
    cpu_ctx = fallback_device
    dev_ctx = tvm.context(dev)

    def annotated():
        add = relay.add(a, b)
        _add = relay.annotation.on_device(add, dev_ctx)
        mul = relay.multiply(c, d)
        _mul = relay.annotation.on_device(mul, cpu_ctx)
        sub = relay.subtract(add, mul)
        _sub = relay.annotation.on_device(sub, dev_ctx)
        func = relay.Function([a, b, c, d],
                              relay.Tuple(tvm.convert([_add, _mul, _sub, sub])))
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type)
        func = relay.ir_pass.infer_type(func)
        return relay.Function(relay.ir_pass.free_vars(func.body[3]), func.body[3])

    def expected():
        add = relay.add(a, b)
        mul = relay.multiply(c, d)
        copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx)
        sub = relay.subtract(add, copy_mul_sub)
        func = relay.Function([a, b, c, d], sub)
        return func

    annotated_func = annotated()
    expected_func = expected()
    expected_index = [2, 2, 2, 1, 1, 1, 2, 2]
    check_annotated_graph(annotated_func, expected_func)

    params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data}
    config = {"opt_level": 0}
    config["fallback_device"] = fallback_device
    with relay.build_config(**config):
        graph, lib, params = relay.build(annotated_func, target, params=params)
    contexts = [tvm.cpu(0), tvm.context(dev)]
    graph_json = json.loads(graph)
    if "device_index" in graph_json["attrs"]:
        device_index = graph_json["attrs"]["device_index"][1]
        assert device_index == expected_index
    mod = graph_runtime.create(graph, lib, contexts)
    mod.set_input(**params)
    mod.run()
    res = mod.get_output(0).asnumpy()
    tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
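# An illustrative invocation of the helper above: run the heterogeneous test
# against CUDA when a GPU is present. The device/target strings are examples,
# and the ctx.exist guard assumes the old TVM context API used in this test.
if tvm.context("cuda").exist:
    run_unpropagatable_graph("cuda", "cuda")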