def tune_and_evaluate(tuning_opt):
    """Tune conv2d kernels and the graph, then compile and time the network.

    Args:
        tuning_opt: Keyword options forwarded unchanged to ``tune_kernels``.

    Reads the module-level settings ``model_name``, ``batch_size``,
    ``target``, ``log_file``, ``graph_opt_sch_file``, ``input_name`` and
    ``dtype``.
    """
    # Collect the conv2d workloads found in the Relay program.
    print("Extract tasks...")
    mod, params, data_shape, _out_shape = get_network(model_name, batch_size)
    conv2d_ops = (relay.op.get("nn.conv2d"),)
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=conv2d_ops
    )

    # Kernel tuning followed by graph tuning.
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # Build with the best graph-level records applied.
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            graph, lib, built_params = relay.build_module.build(
                mod, target=target, params=params
            )

        # Create the runtime module on the CPU, then feed a random input
        # and the compiled parameters.
        device = tvm.cpu()
        random_input = np.random.uniform(size=data_shape).astype(dtype)
        input_tvm = tvm.nd.array(random_input)
        module = runtime.create(graph, lib, device)
        module.set_input(input_name, input_tvm)
        module.set_input(**built_params)

        # Time the "run" function and report in milliseconds.
        print("Evaluate inference time cost...")
        timer = module.module.time_evaluator("run", device, number=100, repeat=3)
        times_ms = np.array(timer().results) * 1000
        mean_ms = np.mean(times_ms)
        std_ms = np.std(times_ms)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (mean_ms, std_ms))
def tune_and_evaluate(mod, params, input_shape, dtype, measure_top_n, target,
                      tuning_opt, graph_log_file, best_results_file):
    """Tune a model with the ranking model and evaluate the performance.

    Args:
        mod: Relay module; ``mod["main"]`` is the function that is tuned.
        params: Parameters passed to task extraction and to the build.
        input_shape: Indexable pair; element 0 is used as the input name and
            element 1 as the input shape.
        dtype: dtype the random input data is cast to.
        measure_top_n: Forwarded to ``tune_kernels``.
        target: Compilation target; ``str(target)`` also selects the device.
        tuning_opt: Dict forwarded as keyword arguments to ``tune_kernels``;
            must contain the key ``'log_filename'``.
        graph_log_file: Path of the graph tuning log, or ``None``. Graph
            tuning only runs when a path is given and the file does not exist.
        best_results_file: Path the result of ``tune_kernels`` is dumped to
            as JSON.
    """
    sys.stderr.write("Extract conv2d tasks...\n")
    tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

    # Run tuning tasks.
    if graph_log_file is not None and not os.path.exists(graph_log_file):
        best_results = tune_kernels(tasks, True, measure_top_n, **tuning_opt)
        tune_graph(mod["main"], input_shape[1], target,
                   tuning_opt['log_filename'], graph_log_file)
    else:
        best_results = tune_kernels(tasks, False, measure_top_n, **tuning_opt)

    with open(best_results_file, 'w') as of:
        json.dump(best_results, of)

    # The dispatch context is a process-wide global. Restore it in a
    # ``finally`` so a failing build cannot leave it overridden.
    dispatch_ctx = tvm.autotvm.task.DispatchContext.current
    try:
        if graph_log_file is not None and os.path.exists(graph_log_file):
            sys.stderr.write("Compile model with graph tuning...\n")
            tvm.autotvm.task.DispatchContext.current = autotvm.apply_graph_best(
                graph_log_file)
        elif os.path.exists(tuning_opt['log_filename']):
            sys.stderr.write("Compile model without graph tuning...\n")
            tvm.autotvm.task.DispatchContext.current = autotvm.apply_history_best(
                tuning_opt['log_filename'])
        else:
            sys.stderr.write("Compile model with fallback + tophub...\n")

        compile_engine.get().clear()
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod, target=target, params=params)
    finally:
        tvm.autotvm.task.DispatchContext.current = dispatch_ctx

    # Load parameters.
    ctx = tvm.context(str(target), 0)
    module = runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array(
        (np.random.uniform(size=input_shape[1])).astype(dtype))
    module.set_input(input_shape[0], data_tvm)
    module.set_input(**params)

    # Evaluate performance.
    sys.stderr.write("Evaluate inference time cost...\n")
    ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    sys.stderr.write("Median inference time: %.2f ms\n" % np.median(prof_res))
def tune_and_evaluate(tuning_opt):
    """Tune, build, time and export the FaceNet ONNX model for CPU.

    Loads ``facenet.onnx`` from the working directory, tunes its conv2d
    kernels and graph, builds it with the best graph-level records, times
    inference on random input, and writes the compiled library, graph and
    parameters to disk.

    Args:
        tuning_opt: Keyword options forwarded unchanged to ``tune_kernels``.

    Reads the module-level settings ``target``, ``log_file``,
    ``graph_opt_sch_file`` and ``dtype``.
    """
    # extract workloads from relay program
    onnx_model = onnx.load('facenet.onnx')
    print("Extract tasks...")
    input_name = 'input_input'
    data_shape = (1, 3, 112, 112)
    shape_dict = {input_name: data_shape}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        params=params,
        ops=(relay.op.get("nn.conv2d"), ))

    # run tuning tasks
    print("run tuning tasks")
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # compile kernels with graph-level best records
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod,
                                                          target=target,
                                                          params=params)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input(input_name, data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

    # Export the compiled artifacts. Context managers guarantee the files
    # are closed even if a write fails.
    path_lib = "deploy/lib/facenet_tuneing_cpu.so"
    lib.export_library(path_lib)
    with open("facenet_tuneing_cpu.json", "w") as fo:
        fo.write(graph)
    with open("facenet_cpu.params", "wb") as fo:
        fo.write(relay.save_param_dict(params))
def tune_and_evaluate(tuning_opt):
    """Tune the network, build it, export the artifacts and time inference.

    Args:
        tuning_opt: Keyword options forwarded unchanged to ``tune_kernels``.

    Reads the module-level settings ``model_name``, ``batch_size``,
    ``target``, ``log_file``, ``graph_opt_sch_file``, ``input_name`` and
    ``dtype``. The compiled library, graph and parameters are written
    under ``./lib``.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # compile kernels with graph-level best records
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod,
                                                          target=target,
                                                          params=params)

        # Make sure the output directory exists before exporting, so a
        # finished tuning run is not lost to a missing directory.
        base_path = './lib'
        os.makedirs(base_path, exist_ok=True)
        path_lib = os.path.join(base_path, "deploy_lib.so")
        lib.export_library(path_lib)
        with open(os.path.join(base_path, "deploy_graph.json"), "w") as fo:
            fo.write(graph)
        with open(os.path.join(base_path, "deploy_param.params"), "wb") as fo:
            fo.write(relay.save_param_dict(params))

        # upload parameters to device
        ctx = tvm.cpu()
        # NOTE(review): the input shape is hard-coded here instead of using
        # data_shape from get_network -- confirm this is intended.
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=(1, 380, 380, 1))).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input(input_name, data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def benchmark(network, batch_size, dtype, target, log_prefix, repeat):
    """Build ``network`` with its tuning log applied and time inference.

    Args:
        network: Network name passed to ``get_network``; ``"bert"`` gets
            three inputs (``data0``, ``data1``, ``data2``), every other
            network a single input.
        batch_size: Batch size passed to ``get_network``.
        dtype: dtype passed to ``get_network`` and ``use_graph_tuner``.
        target: Compilation target; ``str(target)`` also selects the device.
        log_prefix: Prefix of the tuning log; ``".graph.log"`` or
            ``".kernel.log"`` is appended depending on ``use_graph_tuner``.
        repeat: Number of measurement repeats for the time evaluator.

    Returns:
        A numpy array with the results reported by the time evaluator.

    Raises:
        AssertionError: If the selected log file does not exist.
    """
    layout = "NCHW"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)

    use_graph = use_graph_tuner(network, batch_size, dtype, target)
    log_file = log_prefix + (".graph.log" if use_graph else ".kernel.log")

    # Check for the log before handing it to autotvm, so a missing file is
    # reported with this message and not by whatever the loader raises.
    assert os.path.exists(
        log_file), "The log file '%s' does not exist." % log_file
    print("Use log file %s" % log_file)

    if use_graph:
        history_best_context = autotvm.apply_graph_best(log_file)
    else:
        history_best_context = autotvm.apply_history_best(log_file)

    # Build module (identical for every network).
    with history_best_context:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
    ctx = tvm.context(str(target), 0)
    module = runtime.GraphModule(lib["default"](ctx))

    # Feed input data
    if network in ["bert"]:
        # For bert, input_shape holds one shape per input; the sequence
        # length is the second dimension of the first input's shape.
        seq_length = input_shape[0][1]
        data = np.random.uniform(size=input_shape[0])
        token_types = np.random.uniform(size=input_shape[1])
        valid_length = np.array([seq_length] * batch_size)
        module.set_input(data0=data, data1=token_types, data2=valid_length)
    else:
        data = np.random.uniform(size=input_shape)
        module.set_input(input_name, data)

    # Evaluate
    ftimer = module.module.time_evaluator("run",
                                          ctx,
                                          min_repeat_ms=500,
                                          repeat=repeat)
    return np.array(ftimer().results)
def benchmark(network, target, log_prefix):
    """Build ``network`` with its tuning log applied and print its latency.

    The model is first converted so that conv2d uses the NCHW layout.
    ``"bert"`` is built with kernel-level records (``<log_prefix>_kernel.log``)
    and fed three random inputs; every other network is built with
    graph-level records (``<log_prefix>_graph.log``) and fed one random input
    under the name ``args.inputname``.

    Reads the module-level names ``dtype`` and ``args``.
    """
    mod, params, input_shape, output_shape = get_network(network)

    # Convert conv2d to NCHW.
    layout_passes = tvm.transform.Sequential([
        relay.transform.RemoveUnusedFunctions(),
        relay.transform.ConvertLayout({'nn.conv2d': ['NCHW', 'default']}),
    ])
    with tvm.transform.PassContext(opt_level=3):
        mod = layout_passes(mod)

    # Pick the tuning records that match the network.
    is_bert = network in ["bert"]
    if is_bert:
        log_file = log_prefix + "_kernel.log"
        best_records = autotvm.apply_history_best(log_file)
    else:
        log_file = log_prefix + "_graph.log"
        best_records = autotvm.apply_graph_best(log_file)

    with best_records:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)

    # Create the device, the random inputs and the runtime module.
    ctx = tvm.context(str(target), 0)
    if is_bert:
        tokens = tvm.nd.array(np.random.uniform(size=input_shape[0]).astype(dtype))
        token_types = tvm.nd.array(np.random.uniform(size=input_shape[1]).astype(dtype))
        valid_length = tvm.nd.array(np.random.uniform(size=input_shape[2]).astype(dtype))
        module = runtime.GraphModule(lib["default"](ctx))
        module.set_input(data0=tokens, data1=token_types, data2=valid_length)
    else:
        sample = tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype))
        module = runtime.GraphModule(lib["default"](ctx))
        module.set_input(args.inputname, sample)

    # Time the "run" function and report mean / std dev in milliseconds.
    print("Evaluate...")
    timer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
    times_ms = np.array(timer().results) * 1000
    mean_text = "%.2f ms" % np.mean(times_ms)
    std_text = "%.2f ms" % np.std(times_ms)
    print("%-20s %-19s (%s)" % (network, mean_text, std_text))
def tune_and_evaluate(tuning_opt):
    """Build the network from existing graph-tuning records and time it.

    Kernel and graph tuning are disabled in this variant, so ``tuning_opt``
    is accepted but not used. The build applies the records in
    ``graph_opt_sch_file`` (presumably produced by an earlier tuning run --
    verify the file exists before calling).

    Reads the module-level names ``batch_size``, ``target``,
    ``graph_opt_sch_file``, ``lib_dir``, ``graph_dir``, ``params_dir``,
    ``input_name`` and ``dtype``.
    """
    print("Extract tasks...")
    mod, params, data_shape, _out_shape = get_network(batch_size)

    # Build with the best graph-level records applied.
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with relay.build_config(opt_level=1):
            graph, lib, built_params = relay.build_module.build(
                mod, target=target, params=params
            )

        # Export library, graph and parameters into a temporary directory.
        workdir = util.tempdir()
        lib.export_library(workdir.relpath(lib_dir))
        with open(workdir.relpath(graph_dir), "w") as graph_file:
            graph_file.write(graph)
        with open(workdir.relpath(params_dir), "wb") as params_file:
            params_file.write(relay.save_param_dict(built_params))

        # Create the runtime module on the CPU and feed random input plus
        # the compiled parameters.
        device = tvm.cpu()
        random_input = np.random.uniform(size=data_shape).astype(dtype)
        input_tvm = tvm.nd.array(random_input)
        module = runtime.create(graph, lib, device)
        module.set_input(input_name, input_tvm)
        module.set_input(**built_params)

        # Time the "run" function and report in milliseconds.
        print("Evaluate inference time cost...")
        timer = module.module.time_evaluator("run", device, number=100, repeat=3)
        times_ms = np.array(timer().results) * 1000
        mean_ms = np.mean(times_ms)
        std_ms = np.std(times_ms)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (mean_ms, std_ms))
def tune_and_evaluate(tuning_opt):
    """Tune the network and compare three builds of it.

    After kernel and graph tuning, the network is compiled and passed to
    ``evaluate_performance`` three times: without tuning records, with the
    kernel-level records from ``log_file``, and with the graph-level records
    from ``graph_opt_sch_file``.

    Args:
        tuning_opt: Keyword options forwarded unchanged to ``tune_kernels``.

    Reads the module-level settings ``model_name``, ``batch_size``,
    ``target``, ``log_file`` and ``graph_opt_sch_file``.
    """
    # Collect the conv2d workloads found in the Relay program.
    print("Extract tasks...")
    mod, params, data_shape, _out_shape = get_network(model_name, batch_size)
    conv2d_ops = (relay.op.get("nn.conv2d"), )
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=conv2d_ops
    )

    # Kernel tuning followed by graph tuning.
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # 1) Default build, no tuning records.
    print("Evaluation of the network compiled in 'default' mode without auto tune:")
    with tvm.transform.PassContext(opt_level=3):
        print("Compile...")
        default_lib = relay.build(mod, target=target, params=params)
        evaluate_performance(default_lib, data_shape)

    # 2) Build with kernel-level records only.
    print("\nEvaluation of the network been tuned on kernel level:")
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            kernel_lib = relay.build(mod, target=target, params=params)
        evaluate_performance(kernel_lib, data_shape)

    # 3) Build with graph-level records.
    print("\nEvaluation of the network been tuned on graph level:")
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            graph_lib = relay.build_module.build(mod, target=target, params=params)
        evaluate_performance(graph_lib, data_shape)
# dynamic to static(maybe useful) mod = relay.transform.DynamicToStatic()(mod) print(mod.astext(show_meta_data=False)) # print("Extract tasks...") # tasks = autotvm.task.extract_from_program( # mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),) # ) # 例程上,target = "llvm" # for i, task in enumerate(tasks): # print(len(task.config_space)) # tune_kernels(tasks, **tuning_option) # tuning # tune_graph(mod["main"], data_shape, 'unet_cpu_2_thread.log', graph_opt_sch_file,exec_num = 1000) # tuning # # #只需要得到这个opt_sch_file就可 with autotvm.apply_graph_best(graph_opt_sch_file): # graph_opt_sch_file print("compile...") with tvm.transform.PassContext(opt_level=3): # set < 3 # lib = relay.build_module.build(mod,target,params = params) lib = relay.build(mod, target, params=params) # m = graph_executor.GraphModule(lib["default"](dev)) with open(graph_opt_sch_file, 'r') as f: graph = f.read() m = graph_executor.create(graph, lib['default'], dev, dump_root="/tmp/tvmdbg") # set input and get_output m.set_input(input_name, tvm.nd.array(x.astype(dtype))) # input_name = 'x' # must set 'x' as input here due to previous channel translating
def run_timing(device, platform, model, remote=None, autotvm_log=None, batch=1, runs=3, reps=5, log=None):
    """
    Run a time trial on TVM

    Loads a pretrained model through the chosen frontend, builds it with TVM
    (optionally applying an AutoTVM graph log), optionally uploads the build
    to a remote RPC device, then runs the pictures returned by
    ``get_pics(batch)`` in batches and logs the predictions and timings
    through ``log_print``.

    :param device: The device to run this on
    :param platform: The platform get the machine learning model on
        ("MXNet", "PyTorch" or "TensorFlow")
    :param model: The machine learning model to use (model name string)
    :param remote: Details about the remote device (mapping with the keys
        "name", "type", "hardware", "ip" and "port"), or None to run locally
    :param autotvm_log: The path to the auto TVM file
    :param batch: The number of pictures to run in one go
    :param runs: The number of runs to run the picture through
    :param reps: The number of times the measurement should be repeated
    :param log: The output file; it is closed before returning
    :raises Exception: If the platform or (for MXNet) the model is not supported
    """
    # Output details of run
    from cpuinfo import get_cpu_info
    from datetime import datetime
    print("\n──────────────────────────── TVMUI ────────────────────────────\n")
    # `log` defaults to None, so only touch the file object directly when one
    # was given. NOTE(review): log_print is defined elsewhere -- confirm it
    # tolerates log=None as well.
    if log is not None:
        log.write("TVM Time Trial\n")
    log_print(log, "Started on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))
    if remote is None:
        log_print(log, 'Hardware: ' + device)
        if device == 'x86':
            log_print(log, 'CPU Type: ' + get_cpu_info().get('brand_raw'))
    else:
        log_print(log, 'Remote Name: ' + remote["name"])
        log_print(log, 'Remote Device: ' + remote["type"])
        log_print(log, 'Remote Hardware: ' + remote["hardware"])
    log_print(log, 'Backend: ' + platform)
    log_print(log, 'Model: ' + model)
    log_print(log, str(batch) + " picture(s) per run")
    log_print(log, str(runs) + " run average, repeated " + str(reps) + " times.")
    if autotvm_log is None:
        log_print(log, 'AutoTVM: No\n')
    else:
        log_print(log, 'AutoTVM: Yes\n')

    # Get the model and image data
    import numpy as np
    from PIL import Image
    from tvm import relay
    import tvm
    from tvm.contrib.download import download_testdata
    print("Loading models and images...")
    pictures = get_pics(batch)
    dataset = []

    if platform == "MXNet":
        from mxnet.gluon.model_zoo.vision import get_model
        block = get_model(model, pretrained=True)
        synset_url = "".join(
            [
                "https://gist.githubusercontent.com/zhreshold/",
                "4d0b62f3d01426887599d4f7ede23ee5/raw/",
                "596b27d23537e5a1b5751d2b0481ef172f58b539/",
                "imagenet1000_clsid_to_human.txt",
            ]
        )
        synset_name = "imagenet1000_clsid_to_human.txt"
        synset_path = download_testdata(synset_url, synset_name, module="data")
        with open(synset_path) as f:
            # SECURITY NOTE(review): eval() executes downloaded text. The
            # file is expected to be a dict literal; consider
            # ast.literal_eval once that is confirmed.
            synset = eval(f.read())

        def transform_image(image):
            image = np.array(image) - np.array([123.0, 117.0, 104.0])
            image /= np.array([58.395, 57.12, 57.375])
            image = image.transpose((2, 0, 1))
            image = image[np.newaxis, :]
            return image

        if model == 'resnet18_v1' or model == 'mobilenetv2_1.0':
            for img in pictures:
                dataset.append(transform_image(Image.open(img).resize((224, 224))))
            input_shape = [batch, 3, 224, 224]
        elif model == 'inceptionv3':
            for img in pictures:
                dataset.append(transform_image(Image.open(img).resize((299, 299))))
            input_shape = [batch, 3, 299, 299]
        else:
            raise Exception("Invalid Model")
        shape_dict = {"data": input_shape}
        mod, params = relay.frontend.from_mxnet(block, shape_dict)
        # Append a softmax to the network output.
        func = mod["main"]
        func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs)

    elif platform == "PyTorch":
        import torch
        import torchvision
        from torchvision import transforms
        # Keep `model` bound to the model *name*: it is concatenated into a
        # log line further down, which fails if rebound to an nn.Module.
        torch_model = getattr(torchvision.models, model)(pretrained=True)
        torch_model = torch_model.eval()

        # We grab the TorchScripted model via tracing
        input_shape = [batch, 3, 224, 224]
        input_data = torch.randn(input_shape)
        scripted_model = torch.jit.trace(torch_model, input_data).eval()

        synset_url = "".join(
            [
                "https://raw.githubusercontent.com/Cadene/",
                "pretrained-models.pytorch/master/data/",
                "imagenet_synsets.txt",
            ]
        )
        synset_name = "imagenet_synsets.txt"
        synset_path = download_testdata(synset_url, synset_name, module="data")
        with open(synset_path) as f:
            synsets = [line.strip() for line in f.readlines()]
        splits = [line.split(" ") for line in synsets]
        key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits}

        class_url = "".join(
            [
                "https://raw.githubusercontent.com/Cadene/",
                "pretrained-models.pytorch/master/data/",
                "imagenet_classes.txt",
            ]
        )
        class_name = "imagenet_classes.txt"
        class_path = download_testdata(class_url, class_name, module="data")
        with open(class_path) as f:
            class_id_to_key = [line.strip() for line in f.readlines()]

        # Build the preprocessing pipeline once instead of once per image.
        my_preprocess = transforms.Compose(
            [
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

        def transform_image(image):
            img = my_preprocess(image)
            return np.expand_dims(img, 0)

        for img in pictures:
            dataset.append(transform_image(Image.open(img).resize((224, 224))))
        input_name = "data"
        shape_list = [(input_name, input_shape)]
        func, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    elif platform == "TensorFlow":
        import tensorflow as tf
        try:
            tf_compat_v1 = tf.compat.v1
        except ImportError:
            tf_compat_v1 = tf
        import tvm.relay.testing.tf as tf_testing

        # Base location for model related files. It ends with "/", so plain
        # concatenation yields the same URLs that path joining produced.
        repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/"
        model_name = "classify_image_graph_def-with_shapes.pb"
        model_url = repo_base + model_name
        # Image label map
        map_proto = "imagenet_2012_challenge_label_map_proto.pbtxt"
        map_proto_url = repo_base + map_proto
        # Human readable text for labels
        label_map = "imagenet_synset_to_human_label_map.txt"
        label_map_url = repo_base + label_map

        model_path = download_testdata(model_url, model_name, module=["tf", "InceptionV1"])
        map_proto_path = download_testdata(map_proto_url, map_proto, module="data")
        label_path = download_testdata(label_map_url, label_map, module="data")

        with tf_compat_v1.gfile.GFile(model_path, "rb") as f:
            graph_def = tf_compat_v1.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def, name="")
            # Call the utility to import the graph definition into default graph.
            graph_def = tf_testing.ProcessGraphDefParam(graph_def)
            # Add shapes to the graph.
            with tf_compat_v1.Session() as sess:
                graph_def = tf_testing.AddShapesToGraphDef(sess, "softmax")

        for img in pictures:
            dataset.append(np.array(Image.open(img).resize((299, 299))))
        # `input_shape` and `func` are read further down; this branch
        # previously left both undefined.
        # NOTE(review): the images are appended as loaded (not transposed to
        # channel-first) while the declared shape is [batch, 3, 299, 299]
        # under the input name "data" -- confirm against the graph's real
        # input before relying on this path.
        input_shape = [batch, 3, 299, 299]
        shape_dict = {"data": input_shape}
        mod, params = relay.frontend.from_tensorflow(graph_def, layout=None, shape=shape_dict)
        func = mod
    else:
        raise Exception('Not Supported!')

    # Build the graph
    if device == 'x86':
        target = "llvm"
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + target)
    elif device == 'Metal':
        target = "metal"
        ctx = tvm.metal(0)
        log_print(log, 'Target: ' + target)
    elif device == 'arm_cpu':
        target = tvm.target.arm_cpu(remote["type"])
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + remote["type"])
    else:
        target = device
        ctx = tvm.cpu(0)
        log_print(log, 'Target: ' + device)
    log_print(log, 'Actual Model: ' + model + '\n')
    print('Making the graph...')
    if autotvm_log is not None:
        from tvm import autotvm
        log_print(log, 'Using AutoTVM file ' + autotvm_log)
        with autotvm.apply_graph_best(autotvm_log):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(func, target, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(func, target, params=params)

    print("\nSetting up TVM...")
    from tvm.contrib import graph_runtime

    # Remote upload
    if remote is not None:
        from tvm import rpc
        from tvm.contrib import utils
        print("Exporting graph...")
        tmp = utils.tempdir()
        lib_fname = tmp.relpath("net.tar")
        lib.export_library(lib_fname)

        print("Connecting to device...")
        remote = rpc.connect(str(remote["ip"]), int(remote["port"]))
        print("Uploading to device...")
        remote.upload(lib_fname)
        lib = remote.load_module("net.tar")
        # Only Metal uses a non-CPU remote device.
        ctx = remote.metal(0) if device == 'Metal' else remote.cpu(0)

    dtype = "float32"
    m = graph_runtime.GraphModule(lib["default"](ctx))

    def run_tvm(pics, number, repeat):
        """
        Runs a single inference and gives back the time
        :param pics: The images(s) to run
        :param number: The number of times to run the inference
        :param repeat: The number of times to repeat the measurement
        :return: A list holding the timing result and the list of labels
        """
        # combine pictures
        arr = np.ndarray(shape=input_shape, dtype=dtype)
        for idx, pic in enumerate(pics):
            arr[idx] = pic.astype(dtype)
        m.set_input("data", tvm.nd.array(arr))

        # Actually run inference
        timing = m.module.time_evaluator("run", ctx, number=number, repeat=repeat)()

        # Get output
        res = []
        if platform == 'MXNet':
            scores = m.get_output(0).asnumpy()
            for i in range(len(pics)):
                res.append(synset[np.argmax(scores[i])])
        if platform == 'PyTorch':
            # Get top-1 result for TVM
            scores = m.get_output(0).asnumpy()
            for i in range(len(pics)):
                tvm_class_key = class_id_to_key[np.argmax(scores[i])]
                res.append(key_to_classname[tvm_class_key])
        if platform == 'TensorFlow':
            pre = np.squeeze(m.get_output(0, tvm.nd.empty(((1, 1008)), "float32")).asnumpy())
            node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, uid_lookup_path=label_path)
            top_k = pre.argsort()[-5:][::-1]
            # Append so `res` stays a list of labels; assigning the string
            # made the caller iterate over its characters.
            res.append(node_lookup.id_to_string(top_k[0]))
        return [timing, res]

    # Run the inferences
    num_sets = len(dataset) // batch
    total = 0
    print("\nRunning inferences...")
    for i in range(num_sets):
        log_print(log, "\nSet " + str(i + 1) + ":")
        # Create the next batch
        inp = dataset[batch * i:batch * (i + 1)]
        # Run inference here
        output = run_tvm(inp, runs, reps)
        # Output results
        for e, rl in enumerate(output[1]):
            log_print(log, "Image " + str(e + 1) + " Path: " + pictures[batch * i + e])
            log_print(log, "Image " + str(e + 1) + " ID: " + rl)
        log_print(log, "Time taken: " + str('%.2f' % (1000 * output[0].mean)) + " ms")
        total = total + output[0].mean
    ave = total / num_sets
    log_print(log, '\nAVERAGE TIME: ' + str(ave * 1000) + " ms")
    log_print(log, "Finished on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))
    if log is not None:
        log.close()