def check_verify():
    if not tvm.runtime.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)
    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)
    # verify graph is there
    GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
    assert len(os.listdir(directory)) == 1
    # verify the file name is proper
    graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME)
    assert os.path.exists(graph_dump_path)
    # verify the graph contains some expected keys
    with open(graph_dump_path) as graph_f:
        dumped_graph = json.load(graph_f)
    assert isinstance(dumped_graph, dict)
    for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"):
        assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}"
    mod.run()
    # verify the tensors are dumped
    assert len(os.listdir(directory)) > 1
    CHROME_TRACE_FILE_NAME = '_tvmdbg_execution_trace.json'
    assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))
    with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
        trace = json.load(f)
    assert trace["displayTimeUnit"] == "ns"
    events = trace["traceEvents"]
    assert len(events) == 4
    assert all(event["ph"] in ('B', 'E') for event in events)
    assert all(event["pid"] == 1 for event in events)
    assert all(event["tid"] == 1 for event in events)
    assert all(event["name"] == 'x' for event in events[:2])
    assert all(event["name"] == 'add' for event in events[2:])
    assert events[0]["ts"] == 0
    assert events[0]["ph"] == 'B'
    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)
    mod.exit()
    # verify dump root is deleted after cleanup
    assert not os.path.exists(directory)

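# The check_verify/check_remote variants in this file reference free names
# (n, A, B, s, graph) defined at module level in their test script. A minimal
# sketch of that setup, assuming the classic one-node "add one" test graph
# (the hand-written JSON below is illustrative, but it matches the keys the
# test above asserts on):
import json
import os
import numpy as np
import tvm
from tvm import te
from tvm.contrib.debugger import debug_runtime as graph_runtime

n = 4
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
s = te.create_schedule(B.op)

graph = json.dumps({
    "nodes": [
        {"op": "null", "name": "x", "inputs": []},
        {"op": "tvm_op", "name": "add", "inputs": [[0, 0, 0]],
         "attrs": {"func_name": "myadd", "flatten_data": "1",
                   "num_inputs": "1", "num_outputs": "1"}},
    ],
    "arg_nodes": [0],
    "node_row_ptr": [0, 1, 2],
    "heads": [[1, 0, 0]],
    "attrs": {
        "shape": ["list_shape", [(n,), (n,)]],
        "dltype": ["list_str", ["float32", "float32"]],
        "storage_id": ["list_int", [0, 1]],
    },
})
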
def run(args):
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()
    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)
    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)
    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)
    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)
    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        np.testing.assert_allclose(expected, actual, rtol=1e-3, atol=1e-4,
                                   err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        num_iterations = args.iterations - 1
        start = time.time()
        for t in range(num_iterations):
            graph_module.run()
        cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))

def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)
    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)
    # verify graph is there
    GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
    assert len(os.listdir(directory)) == 1
    # verify the file name is proper
    assert os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME))
    mod.run()
    # verify the tensors are dumped
    assert len(os.listdir(directory)) > 1
    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)
    mod.exit()
    # verify dump root is deleted after cleanup
    assert not os.path.exists(directory)

def run():
    passes = [(1, tensorizer.rewrite)]
    config = {'tir.add_lower_pass': passes} if target.startswith('nvptx') else {}
    with tvm.transform.PassContext(opt_level=3, trace=tracer, config=config):
        graph, lib, params = tvm.relay.build(module, target=target)

        #from tvm.contrib import graph_runtime as runtime
        from tvm.contrib.debugger import debug_runtime as runtime

        func = runtime.create(graph, lib, tvm.gpu())
        x_ = (np.random.randn(n, c, h, w) * 128).astype('float32')
        func.set_input('x', x_)
        timer = func.module.time_evaluator('run', ctx=tvm.gpu(),
                                           number=1, repeat=10)
        #timed = []
        #for i in range(10):
        #    func.run()
        #    for node, time in zip(func.debug_datum._nodes_list,
        #                          func.debug_datum._time_list):
        #        if 'conv2d' in node['name']:
        #            timed.append(time[0])
        # Re-measure until the variance across repeats is small enough.
        timed = timer()
        while np.var(timed.results) > 1e-5:
            timed = timer()
        return timed.mean

def profile(symbol_file, num_inference_images):
    debug = False
    import tvm
    from tvm.contrib import graph_runtime
    from tvm.contrib.debugger import debug_runtime

    base = os.getcwd() + '/compiled_models/tvm_' + \
        symbol_file.split('/')[-1].replace('.json', '')
    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    graph = open(path_graph).read()
    lib = tvm.runtime.load_module(path_lib)
    params = bytearray(open(path_params, 'rb').read())

    if debug:
        rt_mod = debug_runtime.create(graph, lib, ctx=tvm.cpu(0))
        rt_mod.load_params(params)
        rt_mod.run()
        return

    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
    rt_mod.load_params(params)

    # Warm up for 50 iterations before timing.
    for _ in range(50):
        rt_mod.run()

    # Time each of the measured runs individually.
    time_tvm = list()
    for _ in range(num_inference_images):
        time0 = time.time()
        rt_mod.run()
        time1 = time.time()
        time_tvm.append(time1 - time0)

    avg = lambda x: round(1000 * sum(x) / len(x), 6)
    std = lambda x: round(statistics.stdev(x), 6)
    total_tvm = avg(time_tvm)   # mean latency in ms
    sec_tvm = total_tvm / 1000  # mean latency in seconds
    std_tvm = std(time_tvm)
    min_tvm = round(min(time_tvm), 6)
    min_tvm_ms = round(min(time_tvm) * 1000, 6)
    deviation_from_min_tvm = round(sec_tvm / min_tvm * 100 - 100, 6)
    deviation_from_std_tvm = round(std_tvm / sec_tvm * 100, 6)
    net_name = symbol_file.split('/')[-1].replace('.json', '')
    print("Perf", "Tvm", net_name, total_tvm, min_tvm_ms, std_tvm, sep='\t')

def profile(num_inference_images, prefix):
    debug = True
    # np.random.seed(0)
    static_net = mx.gluon.SymbolBlock.imports(
        '{}.json'.format(prefix), ['data0', 'data1', 'data2'],
        '{}.params'.format(prefix))
    static_net.hybridize(static_alloc=True, static_shape=True)
    mx_ctx = mx.cpu()

    # Prepare input data
    dtype = "float32"
    batch = 1
    seq_length = 128
    inputs = np.random.randint(0, 2000, size=(batch, seq_length)).astype(dtype)
    token_types = np.random.uniform(size=(batch, seq_length)).astype(dtype)
    valid_length = np.asarray([seq_length] * batch).astype(dtype)

    # Convert to MXNet NDArray and run the MXNet model
    inputs_nd = mx.nd.array(inputs, ctx=mx_ctx)
    token_types_nd = mx.nd.array(token_types, ctx=mx_ctx)
    valid_length_nd = mx.nd.array(valid_length, ctx=mx_ctx)
    mx_out = static_net(inputs_nd, token_types_nd,
                        valid_length_nd.astype('float32'))
    mx_out.wait_to_read()
    print(mx_out)

    import tvm
    if debug:
        from tvm.contrib.debugger import debug_runtime as grt
    else:
        from tvm.contrib import graph_runtime as grt

    base = os.getcwd() + '/compiled/' + prefix.split("/")[-1]
    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    graph = open(path_graph).read()
    lib = tvm.runtime.load_module(path_lib)
    params = bytearray(open(path_params, 'rb').read())

    rt_mod = grt.create(graph, lib, ctx=tvm.cpu(0))
    rt_mod.load_params(params)
    rt_mod.set_input(data0=inputs, data1=token_types, data2=valid_length)
    if debug:
        rt_mod.run()
    else:
        ftimer = rt_mod.module.time_evaluator("run", ctx=tvm.cpu(0),
                                              number=1, repeat=10)
        res = ftimer().results
        print(np.mean(res) * 1000, np.var(res))

def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)
    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)
    # verify graph is there
    GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
    assert len(os.listdir(directory)) == 1
    # verify the file name is proper
    assert os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME))
    mod.run()
    # verify the tensors are dumped
    assert len(os.listdir(directory)) > 1
    CHROME_TRACE_FILE_NAME = '_tvmdbg_execution_trace.json'
    assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))
    with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
        trace = json.load(f)
    assert trace["displayTimeUnit"] == "ns"
    events = trace["traceEvents"]
    assert len(events) == 4
    assert all(event["ph"] in ('B', 'E') for event in events)
    assert all(event["pid"] == 1 for event in events)
    assert all(event["tid"] == 1 for event in events)
    assert all(event["name"] == 'x' for event in events[:2])
    assert all(event["name"] == 'add' for event in events[2:])
    assert events[0]["ts"] == 0
    assert events[0]["ph"] == 'B'
    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)
    # test individual run
    mod.run_individual(20, 2, 1)
    mod.exit()
    # verify dump root is deleted after cleanup
    assert not os.path.exists(directory)

def create_graph_module(args, graph, lib, ctx):
    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)
    return graph_module

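# A minimal usage sketch for create_graph_module; the `args` namespace (with
# a boolean `debug` attribute), the compiled `graph`/`lib` pair and the
# `tvm_inputs` dict are assumed to come from the surrounding script, as in
# run(args) above:
#
# module = create_graph_module(args, graph, lib, tvm.cpu(0))
# module.set_input(**tvm_inputs)
# module.run()
# print(module.get_output(0).asnumpy())
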
def relay_micro_build(func, dev_config, target, params=None,
                      lib_headers=None, lib_include_paths=None):
    """Create a graph runtime module with a micro device context from a Relay function.

    Parameters
    ----------
    func : relay.Function
        function to compile

    dev_config : TODO
        TODO

    target : TODO
        TODO

    params : dict
        input parameters that do not change during inference

    lib_headers : TODO
        TODO

    lib_include_paths : TODO
        TODO

    Returns
    -------
    mod : tvm.module.Module
        graph runtime module for the target device
    """
    with tvm.target.build_config(opt_level=3, disable_vectorize=True):
        graph, c_mod, params = relay.build(func, target=target, params=params)
    micro_mod = micro.create_micro_mod(c_mod, dev_config,
                                       lib_headers=lib_headers,
                                       lib_include_paths=lib_include_paths)
    ctx = tvm.micro_dev(0)
    if DEBUG_MODE:
        dump_root = f'{get_repo_root()}/debug/micro'
        mod = debug_runtime.create(graph, micro_mod, ctx, dump_root=dump_root)
    else:
        mod = graph_runtime.create(graph, micro_mod, ctx)
    mod.set_input(**params)
    return mod

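# A hypothetical usage sketch for relay_micro_build; the session setup and
# dev_config generation are assumptions about the pre-0.7 microTVM API, not
# part of the snippet above:
#
# with micro.Session(dev_config):
#     mod = relay_micro_build(func, dev_config, target)
#     mod.run(data=np.zeros(input_shape, dtype='float32'))
#     out = mod.get_output(0).asnumpy()
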
def tvm_lstm(fuse, opt_level=0, rebuild=True, profile=False):
    name = 'lstm'
    if rebuild:
        graph, lib, params = tvm_compile(name, fuse, opt_level)
    else:
        graph, lib, params = tvm_load(get_tvm_model_name(name, fuse, opt_level))

    ######################################################################
    # Execute the portable graph on TVM
    # ---------------------------------
    # Now, we would like to reproduce the same forward computation using TVM.
    from tvm.contrib import graph_runtime
    from tvm.contrib.debugger import debug_runtime

    if is_gpu:
        ctx = tvm.gpu()  # tvm.cuda()
    else:
        ctx = tvm.cpu(0)
    dtype = 'float32'
    if profile:
        m = debug_runtime.create(graph, lib, ctx)
    else:
        m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    x = mx.nd.ones(shape=data_shape)
    m.set_input('data', tvm.nd.array(x.asnumpy().astype(dtype)))
    if rebuild:
        m.set_input(**params)
    else:
        m.load_params(params)
    # execute
    ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=100)
    prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
    # m.run()
    # # get outputs
    # tvm_output = m.get_output(0)
    # print(tvm_output)
    if profile:
        m.run()
    if fuse:
        name = "tvm lstm"
    else:
        name = "tvm lstm_cell"
    print("%-20s %-19s (%s)" % ("%s opt=%d" % (name, opt_level),
                                "%.2f ms" % np.mean(prof_res),
                                "%.2f ms" % np.std(prof_res)))

def run_model_tvm(graph, lib, params, run_settings, model_name,
                  tuning_records=None):
    """Run TVM model. Apply tuning records if they exist."""
    profile = run_settings['profile']
    device = run_settings['device']
    repeat = run_settings['repeat']

    session = rpc.LocalSession()
    ctx = session.cpu() if device == "cpu" else session.gpu()
    is_tuned = bool(tuning_records)

    lib_name = "mod.so"
    temp = util.tempdir()
    lib_path = temp.relpath(lib_name)
    lib.export_library(lib_path)
    session.upload(lib_path)
    lib = session.load_module(lib_name)

    if profile:
        module = debug_runtime.create(
            graph, lib, ctx,
            dump_root=f"results/prof_{model_name}_tuned={is_tuned}")
    else:
        module = runtime.create(graph, lib, ctx)

    saved_params = relay.save_param_dict(params)
    module.load_params(saved_params)

    shape_dict, dtype_dict = get_input_info(graph, params)
    inputs_dict = make_inputs_dict(shape_dict, dtype_dict)
    module.set_input(**inputs_dict)

    if profile:
        module.run()

    timer = module.module.time_evaluator("run", ctx, 1, repeat=repeat)
    prof_result = timer()
    times = prof_result.results
    header, stats = extract_profile_data(times)
    filename = f'results/stat_table_{model_name}_tuned={is_tuned}'
    with open(filename, 'w') as f:
        print("%s\n%s\n" % (header, stats), filename, file=f)
    print("%s\n%s\n" % (header, stats))

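# A hypothetical call of run_model_tvm; the run_settings keys mirror the
# ones read at the top of the function:
#
# run_settings = {'profile': True, 'device': 'cpu', 'repeat': 10}
# run_model_tvm(graph, lib, params, run_settings, 'resnet18')
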
def check_remote():
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    temp = util.tempdir()
    ctx = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    try:
        mod = graph_runtime.create(graph, mlib, remote.cpu(0))
    except ValueError:
        print("Skip because debug graph_runtime not enabled")
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, ctx))
    out = tvm.nd.empty((n,), ctx=ctx)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.asnumpy(), a + 1)

def check_verify():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    try:
        mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    except ValueError:
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)
    # verify dump root created
    path = mod.ui_obj.curses_obj._dump_root + mod.ui_obj.curses_obj.dump_folder()
    directory = os.path.dirname(path)
    assert os.path.exists(directory)
    # verify graph is there
    assert len(os.listdir(directory)) > 0
    # verify dump root is deleted after cleanup
    mod.ui_obj.curses_obj.exit()
    assert not os.path.exists(directory)

def check_remote():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    temp = util.tempdir()
    ctx = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    try:
        mod = graph_runtime.create(graph, mlib, remote.cpu(0))
    except ValueError:
        print("Skip because debug graph_runtime not enabled")
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, ctx))
    out = tvm.nd.empty((n,), ctx=ctx)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.asnumpy(), a + 1)

def run_single_model(self, model_name, model):
    inputs = self.input_data[model_name]
    print(self.input_names)
    shape_dict = {self.input_names[model_name]: inputs.shape}
    print(self.input_names[model_name], inputs.shape)
    print(shape_dict)
    print(type(model))
    if self.file_types[model_name] == 'onnx':
        syms, params = relay.frontend.from_onnx(model, shape_dict)
    elif self.file_types[model_name] == 'keras':
        syms, params = relay.frontend.from_keras(model, shape_dict)
    elif self.file_types[model_name] == 'pytorch':
        shapes_list = [(k, v) for k, v in shape_dict.items()]
        syms, params = relay.frontend.from_pytorch(model, shapes_list)
    target = self.host.target
    target_host = self.host.target_host
    if self.sparse_cnn:
        print('converting sparsity')
        syms, params = ddo.simplify_fc_transpose.convert(syms["main"], params)
        syms, params = ddo.csr_conv2d.convert(syms, params,
                                              sparsity_threshold=0.0)
    with relay.build_config(
            opt_level=self.opt_level,
            # disabled_pass=[
            #     "FoldConstant",
            # ],
            # required_pass=[
            #     "SimplifyInference",
            #     "OpFusion",
            #     # "FoldConstant",
            #     "FoldScaleAxis",
            #     "AlterOpLayout",
            #     "CanonicalizeOps",
            #     "CanonicalizeCast",
            #     "EliminateCommonSubexpr",
            #     "CombineParallelConv2D",
            #     "CombineParallelDense",
            #     "CombineParallelBatchMatmul",
            #     "FastMath"
            # ]
    ):
        graph, lib, params = relay.build_module.build(
            syms, target, params=params, target_host=target_host)
    # After `relay.build`, you get three return values: graph, library and
    # the new parameters. The build applies optimizations that may change
    # the parameters while keeping the model's results the same.

    # Save the library to a local temporary directory.
    tmp = utils.tempdir()
    tarname = str(model_name) + '_' + str(self.host.name) + '.tar'
    lib_fname = tmp.relpath(tarname)
    lib.export_library(lib_fname)

    # Obtain an RPC session from the remote device.
    remote = self.host.session
    # Upload the library to the remote device and load it.
    remote.upload(lib_fname)
    rlib = remote.load_module(tarname)

    # Create the remote runtime module.
    ctx = self.host.ctx
    if not self.layer_debug:
        module = runtime.create(graph, rlib, ctx)
    else:
        module = debug_runtime.create(graph, rlib, ctx,
                                      dump_root=self.layer_output_dir)
    # Set parameters (upload params to the remote device; this may take a while).
    module.set_input(**params)
    # Set input data.
    module.set_input(key=self.input_names[model_name], value=inputs)
    # Run.
    print('running bruh')
    module.run()
    # Get output.
    out = module.get_output(0)
    self.outputs[model_name] = out.asnumpy()
    # Get median inference time. The sample mean is skewed in this setting
    # by potentially unbounded high outliers, so we report the median.
    print('running avg bruh')
    start = time.time()
    f = module.module.time_evaluator('run', ctx, number=10, repeat=self.runs)
    results = f().results
    median = np.median(results) * 1000
    mean = np.mean(results) * 1000
    self.inf_time[model_name] = median
    self.med_time[model_name] = median
    self.mean_time[model_name] = mean
    self.std_time[model_name] = np.std(results) * 1000
    self.exper_time[model_name] = (time.time() - start) * 1000
    print('ran a model bruh')

def main():
    resnetv1 = onnx.load('models/resnet18v1.onnx')
    input_blob = resnetv1.graph.input[0]
    input_shape = tuple(
        map(lambda x: getattr(x, 'dim_value'),
            input_blob.type.tensor_type.shape.dim))
    shape_dict = {input_blob.name: input_shape}
    mod_resnetv1, params_resnetv1 = relay.frontend.from_onnx(resnetv1,
                                                             shape_dict)

    # resnetv2 = onnx.load('models/resnet18v2.onnx')
    # input_blob = resnetv2.graph.input[0]
    # input_shape = tuple(map(lambda x: getattr(x, 'dim_value'), input_blob.type.tensor_type.shape.dim))
    # shape_dict = {input_blob.name: input_shape}
    # mod_resnetv2, params_resnetv2 = relay.frontend.from_onnx(resnetv2, shape_dict)

    mod_q_resnetv1 = quantize(mod_resnetv1, params_resnetv1)
    # mod_q_resnetv2 = quantize(mod_resnetv2, params_resnetv2)
    # mod_resnetv1['main'] = bind_params(mod_resnetv1['main'], params_resnetv1)
    # f = open('graphs/resnetv1_q.log.new', 'w+')
    # f.write(str(mod_q_resnetv1))
    # f.close()
    # f = open('graphs/resnetv2_q.log', 'w+')
    # f.write(str(mod_q_resnetv2))
    # f.close()
    # run_inference(mod_resnetv1)
    # run_inference(mod_q_resnetv1)
    # run_inference(mod_q_resnetv2)

    with autotvm.apply_history_best(log_file):
        # print("Compile...")
        # with relay.build_config(opt_level=3):
        #     graph, lib, params = relay.build_module.build(
        #         mod_q_resnetv1, target=target, params=params_resnetv1)
        # export library
        # tmp = tempdir()
        # filename = "net.tar"
        # lib.export_library(tmp.relpath(filename))
        # load parameters
        # ctx = tvm.context(str(target), 0)
        # module = runtime.create(graph, lib, ctx)
        # module.set_input(**params)
        # val_data = get_val_data()
        # top1_correct = 0
        # top5_correct = 0
        # total = 0
        # import time
        # start = time.process_time()
        # for i, batch in enumerate(val_data):
        #     data, categories = batch['data'], batch['label']
        #     module.set_input('data', data)
        #     module.run()
        #     prediction = module.get_output(0).asnumpy()
        #     top1_correct += (prediction.argmax(1) == categories).sum()
        #     top5_correct += sum(map(lambda x: x[0] in x[1], zip(categories, prediction.argsort()[:, -5:])))
        #     total += len(data)
        #     print(prediction)
        #     print('Top1 Acc: {}, {}/{}'.format(float(top1_correct) / total, top1_correct, total))
        #     print('Top5 Acc: {}, {}/{}'.format(float(top5_correct) / total, top5_correct, total))
        # end = time.process_time()
        # print('Time: {}'.format(end - start))
        # print('Top1 Acc: {}, {}/{}'.format(float(top1_correct) / total, top1_correct, total))
        # print('Top5 Acc: {}, {}/{}'.format(float(top5_correct) / total, top5_correct, total))
        # evaluate
        # print("Evaluate inference time cost...")
        # ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        # prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        # print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
        #       (np.mean(prof_res), np.std(prof_res)))
        graph, mod, params = relay.build_module.build(
            mod_q_resnetv1['main'], target=target, params=params_resnetv1)
        # ctx is needed by the debug runtime below
        ctx = tvm.context(str(target), 0)
        val_data = get_val_data()
        for i, batch in enumerate(val_data):
            if i > 0:
                break
            data, categories = batch['data'], batch['label']
            m = debug_runtime.create(graph, mod, ctx, dump_root='tvmdbg')
            m.set_input('data', tvm.nd.array(data.astype('float32')))
            m.run()
            tvm_out = m.get_output(
                0, tvm.nd.empty(tuple([1, 1000]), 'float32')).asnumpy()

# --------------------------------------------------------------------
test_image = 'dog.jpg'
print("Loading the test image...")
img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
          test_image + '?raw=true'
download(img_url, test_image)

data = nnvm.testing.darknet.load_image(test_image, net.w, net.h)

######################################################################
# Execute on TVM Runtime
# ----------------------
# The process is no different from other examples.
#from tvm.contrib import graph_runtime
from tvm.contrib.debugger import debug_runtime as graph_runtime

m = graph_runtime.create(graph, lib, ctx)

# set inputs
m.set_input('data', tvm.nd.array(data.astype(dtype)))
m.set_input(**params)
# execute
print("Running the test image...")
m.run()
# get outputs
out_shape = (net.outputs,)
tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()

# do the detection and bring up the bounding boxes
thresh = 0.24
hier_thresh = 0.5

# PassContext trace callback: times each pass between its before/after hooks.
def tracer(module, info, is_before):
    global timing
    if bool(is_before):
        timing = time.time()
    else:
        print('Executes: ', info.name, (time.time() - timing) * 1000)


result = 1e9
target = -1
from tensorizer import tune
tune.cpu_idx = 0
while True:
    with tvm.transform.PassContext(
            opt_level=3, trace=tracer,
            config={'tir.add_lower_pass': [(1, tensorizer.rewrite)]}):
        graph, lib, params = tvm.relay.build(module,
                                             target='llvm -mcpu=cascadelake')

        #from tvm.contrib import graph_runtime as runtime
        from tvm.contrib.debugger import debug_runtime as runtime

        func = runtime.create(graph, lib, tvm.cpu())
        x_ = (np.random.randn(n, c, h, w) * 128).astype('int8')
        func.set_input('x', x_)
        timer = func.module.time_evaluator('run', ctx=tvm.cpu(0),
                                           number=3, repeat=10)
        timed = timer()

        if timed.mean < result:
            result = timed.mean
            target = tune.cpu_idx

    relay.backend.compile_engine.get().clear()
    tune.cpu_idx += 1
    if tune.cpu_idx - target > 8:
        break

deploy_graph_path = os.path.join(saved_dir, "deploy_graph.json")
loaded_graph = open(deploy_graph_path).read()
lib_save_path = os.path.join(saved_dir, "deploy_lib.tar")
loaded_lib = tvm.runtime.load_module(lib_save_path)
deploy_param_file_path = os.path.join(saved_dir, "deploy_param.params")
loaded_params = bytearray(open(deploy_param_file_path, "rb").read())

# target = tvm.target.cuda()
target = params_dict['target']
ctx = tvm.context(str(target), 0)

print("testing and evaluating TVM performance")
dtype = params_dict['dtype']
module = graph_runtime.create(loaded_graph, loaded_lib, ctx)
print(dtype)

# set inputs
inference_input_shapes = params_dict['inference_input_shapes']
inference_input_names = params_dict['inference_input_names']
for i in range(len(inference_input_shapes)):
    input_shape = tuple(inference_input_shapes[i])
    input_name = inference_input_names[i]
    print("input name: " + input_name)
    print("input shape: " + str(input_shape))
    temp_data = np.random.uniform(size=input_shape).astype(dtype)
    data_tvm = tvm.nd.array(temp_data)
    module.set_input(input_name, data_tvm)

mod["main"] = func # print("print non-tuning tvm op: ") scale = 1 data = np.random.uniform(-scale, scale, size=dshape).astype(dtype) weight = np.random.uniform(-scale, scale, size=kshape).astype(dtype) data = tvm.nd.array(data, ctx) weight = tvm.nd.array(weight, ctx) with tvm.transform.PassContext(opt_level=3): print("Compiling...") graph, lib, params = tvm.relay.build(mod, target=target) from tvm.contrib.debugger import debug_runtime as graph_runtime module = graph_runtime.create(graph, lib, ctx) module.set_input("x", data) module.set_input("w", weight) module.set_input(**params) print("testing non-tuning result of the op......") ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) print("\n") print("now testing tuned op......") with autotvm.apply_history_best("batch_matmul_cuda.log"): with tvm.transform.PassContext(opt_level=3): print("Compiling...") graph, lib, params = tvm.relay.build(mod, target=target)
def check_verify():
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")

    def myadd(*args):
        to_return = mlib["myadd"](*args)
        time.sleep(0.25)
        return to_return

    mlib_proxy = tvm.support.FrontendTestModule()
    mlib_proxy["myadd"] = myadd
    try:
        mod = debug_runtime.create(graph, mlib_proxy, tvm.cpu(0))
    except ValueError:
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)

    # verify graph is there
    GRAPH_DUMP_FILE_NAME = "_tvmdbg_graph_dump.json"
    assert len(os.listdir(directory)) == 1

    # verify the file name is proper
    graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME)
    assert os.path.exists(graph_dump_path)

    # verify the graph contains some expected keys
    with open(graph_dump_path) as graph_f:
        dumped_graph = json.load(graph_f)

    assert isinstance(dumped_graph, dict)
    for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"):
        assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}"

    mod.run()
    # Verify the tensors are dumped
    assert len(os.listdir(directory)) > 1

    debug_lines = mod.debug_datum.get_debug_result().split("\n")

    def split_debug_line(i):
        # Columns in the debug table are separated by two or more spaces.
        to_return = re.split(r"  [ ]*", debug_lines[i])
        assert to_return[-1] == ""
        to_return = to_return[:-1]  # strip empty trailing part
        return to_return

    assert split_debug_line(0) == [
        "Node Name",
        "Ops",
        "Time(us)",
        "Time(%)",
        "Shape",
        "Inputs",
        "Outputs",
    ]
    myadd_lines = split_debug_line(2)
    assert myadd_lines[0] == "add"
    assert myadd_lines[1] == "myadd"
    runtime_sec = float(myadd_lines[2]) / 1e6  # printed in us

    # Ensure runtime is at least the sleep time and less than a unit prefix
    # order of magnitude. Here we just care that the prefix is correct.
    assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000

    total_lines = split_debug_line(3)
    assert total_lines[0] == "Total_time"
    assert total_lines[2] == myadd_lines[2]

    CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json"
    assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))

    with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
        trace = json.load(f)
    assert trace["displayTimeUnit"] == "ns"
    events = trace["traceEvents"]
    assert len(events) == 4
    assert all(event["ph"] in ("B", "E") for event in events)
    assert all(event["pid"] == 1 for event in events)
    assert all(event["tid"] == 1 for event in events)
    assert all(event["name"] == "x" for event in events[:2])
    assert all(event["name"] == "add" for event in events[2:])
    assert events[0]["ts"] == 0
    assert events[0]["ph"] == "B"

    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)

    mod.exit()
    # verify dump root is deleted after cleanup
    assert not os.path.exists(directory)

def run_module(
    module_file,
    hostname,
    port=9090,
    rpc_key=None,
    device=None,
    inputs_file=None,
    fill_mode="random",
    repeat=1,
    profile=False,
):
    """Run a compiled graph runtime module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    module_file : str
        The path to the module file (a .tar file).
    hostname : str
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    device : str, optional
        The device (e.g. "cpu" or "gpu") to be targeted by the RPC session
        (local or remote).
    inputs_file : str, optional
        Path to an .npz file containing the inputs.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("extracting module file %s", module_file)
        t = tarfile.open(module_file)
        t.extractall(tmp_dir)
        graph = open(os.path.join(tmp_dir, "mod.json")).read()
        params = bytearray(open(os.path.join(tmp_dir, "mod.params"), "rb").read())

        if hostname:
            # Remote RPC
            if rpc_key:
                logger.debug("running on remote RPC tracker with key %s", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("running on remote RPC with no key")
                session = rpc.connect(hostname, port)
        else:
            # Local
            logger.debug("running a local session")
            session = rpc.LocalSession()

        session.upload(os.path.join(tmp_dir, "mod.so"))
        lib = session.load_module("mod.so")

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("device is %s", device)
        if device == "gpu":
            ctx = session.gpu()
        elif device == "cl":
            ctx = session.cl()
        else:
            assert device == "cpu"
            ctx = session.cpu()

        if profile:
            logger.debug("creating runtime with profiling enabled")
            module = debug_runtime.create(graph, lib, ctx, dump_root="./prof")
        else:
            logger.debug("creating runtime with profiling disabled")
            module = runtime.create(graph, lib, ctx)

        logger.debug("load params into the runtime module")
        module.load_params(params)

        shape_dict, dtype_dict = get_input_info(graph, params)
        inputs_dict = make_inputs_dict(inputs_file, shape_dict, dtype_dict, fill_mode)

        logger.debug("setting inputs to the module")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.debug("running the module with profiling enabled")
            module.run()

        # create the module time evaluator (returns a function)
        timer = module.module.time_evaluator("run", ctx, 1, repeat=repeat)
        # call the evaluator function to invoke the module and save execution times
        prof_result = timer()
        # collect a list of execution times from the profiling results
        times = prof_result.results

        logger.debug("collecting the output tensors")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).asnumpy()

        return outputs, times

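# An illustrative call of run_module (the .tar path is hypothetical); an
# empty hostname falls through to the local-session branch:
#
# outputs, times = run_module("mod.tar", hostname=None, device="cpu",
#                             fill_mode="zeros", repeat=5)
# print(times)
# print(list(outputs.keys()))
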
        params=params, target_host=env.target_host)
else:
    with vta.build_config():
        graph, lib, params = relay.build(
            relay_prog, target=target, params=params,
            target_host=env.target_host)

# Export library
temp = util.tempdir()
lib.save(temp.relpath("graphlib.o"))
remote.upload(temp.relpath("graphlib.o"))
lib = remote.load_module("graphlib.o")

# If detailed runtime info is needed, build with the debug runtime
if opt.debug_profile:
    m = debug_runtime.create(graph, lib, ctx)
else:
    m = graph_runtime.create(graph, lib, ctx)

# Set the network parameters and a synthetic input
image = tvm.nd.array(
    (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
m.set_input(**params)
m.set_input('data', image)

# Perform inference
timer = m.module.time_evaluator("run", ctx, number=4,
                                repeat=opt.measurements)
tcost = timer()
prof_res = np.array(tcost.results) * 1000  # convert to milliseconds
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(prof_res), np.std(prof_res)))

def run_tvm(data, symbol_file, num_inference_images, sym, devs, label_name):
    debug = False
    import tvm
    from tvm.contrib import graph_runtime
    from tvm.contrib.debugger import debug_runtime

    base = './compiled/' + symbol_file.split('/')[-1].replace('.json', '')
    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    graph = open(path_graph).read()
    lib = tvm.runtime.load_module(path_lib)
    params = bytearray(open(path_params, 'rb').read())

    if debug:
        rt_mod = debug_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym, context=devs)
        mod.bind(for_training=False, data_shapes=data.provide_data)
    else:
        rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym, context=devs,
                            label_names=[label_name, ])
        mod.bind(for_training=False, data_shapes=data.provide_data,
                 label_shapes=data.provide_label)

    rt_mod.load_params(params)
    mod.set_params(arg_params, aux_params)

    counter = 0
    top_1_raw = 0
    top_5_raw = 0
    top_1_raw_mxnet = 0
    top_5_raw_mxnet = 0

    if debug:
        data = advance_data_iter(data, 0)

    for batch in data:
        # Get the original label.
        correct_label = int(batch.label[0].asnumpy()[0])

        rt_mod.set_input('data', batch.data[0].asnumpy())
        rt_mod.run()
        if debug:
            np.set_printoptions(suppress=False)
            for i in rt_mod.debug_datum.get_output_tensors().keys():
                print(i, rt_mod.debug_get_output(i))
            return
        tvm_res = rt_mod.get_output(0).asnumpy()

        mod.forward(batch, is_train=False)
        mxnet_res = mod.get_outputs()[0].asnumpy()

        if debug:
            print("######## MxNet ###########")
            print(mxnet_res[0][0])
            print("######## TVM ###########")
            print(tvm_res[0][0])
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("######## MxNet ###########")
            print(mxnet_res)
            print("######## TVM ###########")
            print(tvm_res)
            # print("######## Diff ###########")
            # it = np.nditer(mxnet_res, flags=['multi_index'])
            # while not it.finished:
            #     print("%d <%s>" % (it[0], it.multi_index), end='\n')
            #     it.iternext()
            try:
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=0, verbose=True)
            except AssertionError:
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=1, verbose=True)
        else:
            tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
            mxnet_pred = np.squeeze(mxnet_res).argsort()[-5:][::-1]
            if correct_label == tvm_pred[0]:
                top_1_raw += 1
                top_5_raw += 1
            elif correct_label in tvm_pred:
                top_5_raw += 1
            if correct_label == mxnet_pred[0]:
                top_1_raw_mxnet += 1
                top_5_raw_mxnet += 1
            elif correct_label in mxnet_pred:
                top_5_raw_mxnet += 1

        counter += 1
        if counter == num_inference_images:
            break

    model_name = symbol_file.split('/')[-1].replace('.json', '')
    top_1 = float(top_1_raw_mxnet) / float(counter)
    top_5 = float(top_5_raw_mxnet) / float(counter)
    print("Mxnet", model_name, top_1, top_5, sep='\t')
    top_1 = float(top_1_raw) / float(counter)
    top_5 = float(top_5_raw) / float(counter)
    print("Tvm", model_name, top_1, top_5, sep='\t')