def quantize_relay_module(mod, params, qconfig=None): """ Quantize the relay module with qconfig options. Parameters: ------ mod : tvm.relay.module The original module. qconfig : tvm.relay.quantize.quantize.QConfig The quantization configuration Returns: ------ qfunc : vm.relay.expr.Function The graph after quantization """ # default qconfig if not qconfig: qconfig = qtz.qconfig() with qconfig: logging.debug('current quantize config') logging.debug(qtz.current_qconfig()) mod['main'] = qtz.quantize(mod['main'], params=params) logging.debug('after quantize') logging.debug(mod['main'].astext(show_meta_data=False)) return mod
def test_onnx_quantize_acc(cfg, rec_val, batch_size=1, original=False):
    """Quantize an ONNX model per *cfg*, evaluate it on the validation
    record file *rec_val*, and check the accuracy meets the expectation.

    Returns the measured top accuracy; raises AssertionError when it
    falls below ``cfg.expected_acc``.
    """
    # Assemble the quantization options from the config record up front.
    quant_options = dict(
        skip_conv_layers=[0],
        skip_dense_layer=False,
        nbit_input=cfg.nbit_input,
        nbit_weight=cfg.nbit_input,
        dtype_input=cfg.dtype_input,
        dtype_weight=cfg.dtype_input,
        dtype_activation=cfg.dtype_output,
        debug_enabled_ops=None,
        calibrate_mode="percentile",
        calibrate_chunk_by=8,
    )
    qconfig = qtz.qconfig(**quant_options)

    # Calibration samples drive the percentile-based scale search.
    calib_samples = list(calibrate_dataset(cfg.model, rec_val, batch_size, 64))
    compiled_model, tuning_log = get_onnx_model(
        cfg.model,
        batch_size,
        qconfig,
        tvm.target.cuda(),
        original=original,
        dataset=calib_samples,
    )

    val_data, batch_fn = get_val_data(cfg.model, rec_val=rec_val, batch_size=batch_size)
    # Apply the best autotvm schedules recorded for this model while measuring.
    with tvm.autotvm.apply_history_best(tuning_log):
        acc = eval_acc(compiled_model, val_data, batch_fn, log_interval=1000)
    assert acc > cfg.expected_acc
    return acc
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    """Quantize *cfg.model*, autotune its conv2d workloads for x86,
    compile it, and report the measured inference time.

    Parameters
    ----------
    tuning_opt : dict
        Keyword options forwarded to ``tune_tasks``.
    cfg : config record
        Carries model name, batch size and quantization dtypes/bits.
    target : tvm target for compilation and tuning.
    ctx : TVM context used to run the timing loop.
    log_file : str
        AutoTVM tuning log path (currently unused at compile time; see
        the commented-out ``apply_history_best`` below).

    Raises
    ------
    ValueError
        If a task's operator is not supported by the x86 templates.
    """
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    # extract workloads from relay program
    logging.info("Extract tasks...")
    mod, params, input_shape = get_model(cfg.model, cfg.batch_size, qconfig, target)
    tasks = autotvm.task.extract_from_program(mod, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))

    # Rewrite each extracted task to its NCHWc x86 template equivalent.
    for i, task in enumerate(tasks):
        op_name = task.workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            # BUG FIX: the message previously printed the literal "{}"
            # because .format() was never called.
            print("Tuning {} is not supported on x86".format(op_name))
            raise ValueError("Tuning {} is not supported on x86".format(op_name))

        print("[Create Task %2d/%2d (%s, %s) ] " % (i + 1, len(tasks),
                                                    task.name, task.workload[0]))

        tsk = autotvm.task.create(func_create, args=task.args,
                                  target=task.target, template_key='direct')
        tsk.workload = task.workload
        tasks[i] = tsk

    # run tuning tasks
    logging.info("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    # with autotvm.apply_history_best(log_file):
    logging.info("Compile...")
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build_module.build(
            mod, target=target, params=params)

    # export library
    tmp = tempdir()
    filename = "net.tar"
    lib.export_library(tmp.relpath(filename))

    # load parameters and feed random input data of the model's shape
    module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
    module.set_input('data', data_tvm)
    module.set_input(**params)

    # evaluate
    logging.info("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                 (np.mean(prof_res), np.std(prof_res)))
def test_quantize_pass():
    """Check that the automatic quantize pass produces the same result
    as a hand-built quantized graph (within rtol=1e-3)."""

    def _quantize_weight(arr):
        # Power-of-two scale, then round/clip into signed 8-bit range.
        magnitude = np.amax(np.abs(arr.asnumpy()))
        scale = 2 ** math.ceil(math.log(magnitude, 2))
        quantized = np.clip(
            np.around(arr.asnumpy() / scale * 128).astype('int8'),
            -127, 127)
        return relay.const(quantized, 'int8')

    n, c, h, w = 1, 3, 224, 224

    def _float_graph(data):
        # Plain fp32 conv over the input; weight left as a free var.
        weight = relay.var("conv_weight")
        conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3),
                               padding=(1, 1), channels=c)
        return relay.Function(relay.ir_pass.free_vars(conv), conv)

    def _manual_qgraph(data, weight):
        # Hand-written reference: scale -> round -> clip -> int8 conv
        # -> rescale back to fp32.
        x = relay.round(data * relay.const(32.0))
        x = relay.clip(x, a_min=-127, a_max=127).astype('int8')
        x = relay.nn.conv2d(x, weight, kernel_size=(3, 3), padding=(1, 1),
                            channels=c, out_dtype='int32')
        x = relay.multiply(x.astype('float32'), relay.const(0.00024414062))
        return relay.Function(relay.ir_pass.free_vars(x), x)

    np.random.seed(42)

    data = relay.var("data", relay.TensorType((n, c, h, w), "float32"))
    float_graph = _float_graph(data)
    dataset, params = make_dataset(float_graph, 10)

    # Run the automatic quantize pass under a fixed global scale.
    with qtz.qconfig(skip_k_conv=0, global_scale=4.0,
                     round_for_shift=False, store_lowbit_output=False):
        auto_q = relay.ir_pass.infer_type(qtz.quantize(float_graph, params))

    # Build the equivalent graph by hand with pre-quantized weights.
    manual_q = relay.ir_pass.infer_type(
        _manual_qgraph(data, _quantize_weight(params['conv_weight'])))

    executor = relay.create_executor('graph')
    res_auto = executor.evaluate(auto_q)(dataset[0]['data'])
    res_manual = executor.evaluate(manual_q)(dataset[0]['data'])
    tvm.testing.assert_allclose(res_auto.asnumpy(), res_manual.asnumpy(),
                                rtol=1e-3)
def build_model(args, gluon_model):
    """Build with relay."""
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz

    # InceptionV3 expects 299x299 inputs; everything else here uses 224.
    img_size = 299 if args.model == 'inceptionv3' else 224
    data_shape = (args.batch_size, 3, img_size, img_size)
    net, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
    target = args.target

    if args.original:
        # run original model
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        ctx = tvm.nd.context(target, 0)
        return graph, lib, params, ctx

    # constant folding and scale folding.
    print('original')
    print(net.astext(show_meta_data=False))
    with relay.build_config(opt_level=3):
        qgraph = relay.optimize(net, target, params)
        # qgraph = relay.optimize(qgraph)
    print('after optimize')
    print(qgraph.astext(show_meta_data=False))

    # Walk the quantize pipeline step by step (annotate -> calibrate ->
    # realize), dumping the IR after each stage for inspection.
    with qtz.qconfig(skip_k_conv=0,
                     nbit_input=args.nbit_input,
                     nbit_weight=args.nbit_input,
                     global_scale=args.global_scale,
                     dtype_input=args.dtype_input,
                     dtype_weight=args.dtype_input,
                     dtype_activation=args.dtype_output,
                     store_lowbit_output=False,
                     debug_enabled_ops=None):
        print(qtz.current_qconfig())

        qgraph = qtz.annotate(qgraph)
        print('after annotate')
        print(qgraph.astext(show_meta_data=False))

        qgraph = qtz.calibrate(qgraph)
        print('after calibrate\n')
        print(qgraph.astext(show_meta_data=False))

        if not args.simulated:
            qgraph = qtz.realize(qgraph)
            qgraph = relay.ir_pass.infer_type(qgraph)
            print('after realize\n')
            print(qgraph.astext(show_meta_data=False))

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(qgraph, target)
    ctx = tvm.nd.context(target, 0)
    return graph, lib, params, ctx
def test_quantize_acc(cfg, rec_val, batch_size=32):
    """Quantize *cfg.model*, evaluate on *rec_val*, and assert the
    measured accuracy exceeds ``cfg.expected_acc``.

    Parameters
    ----------
    cfg : config record
        Carries model name, quantization bits/dtypes and expected accuracy.
    rec_val : str
        Path to the validation record file.
    batch_size : int, optional
        Evaluation batch size. Defaults to 32, matching the previous
        hard-coded value; parameterized for consistency with
        ``test_onnx_quantize_acc``.

    Returns
    -------
    acc : float
        The measured accuracy.
    """
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)
    model = get_model(cfg.model, batch_size, qconfig, tvm.target.cuda())
    val_data, batch_fn = get_val_data(cfg.model, rec_val=rec_val, batch_size=batch_size)
    acc = eval_acc(model, val_data, batch_fn)
    assert acc > cfg.expected_acc
    return acc
if __name__ == '__main__':
    # Hard-coded path to a frozen TF InceptionV1 graph def.
    # NOTE(review): machine-specific absolute path — consider making this a CLI arg.
    tf_Inception_v1_path = '/home/terse/code/programming/blog/TVM_quantization/tf/InceptionV1/classify_image_graph_def-with_shapes.pb'
    # Import the TF model into a Relay module.
    mod, params, input_shape = get_tf_model_InceptionV1(tf_Inception_v1_path)
    logging.info(mod['main'].astext(show_meta_data=False))

    # Compile for local CPU with AVX2.
    ctx = tvm.cpu()
    target = 'llvm -mcpu=core-avx2'

    # Configure the quantization behavior: int8 everywhere, skip the
    # first conv layer, fixed global scale of 8.0.
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=8,
                          nbit_weight=8,
                          global_scale=8.0,
                          dtype_input='int8',
                          dtype_weight='int8',
                          dtype_activation='int8',
                          debug_enabled_ops=None)

    # mod['main'] = qtz.prerequisite_optimize(mod['main'],params=params)
    # logging.info(mod['main'].astext(show_meta_data=False))

    # Quantize the module's main function under the config above.
    mod = quantize_relay_module(mod, params, qconfig)

    # autotvm_tune(mod['main'], params, target)
    # graph,lib,params = build_module(mod, params, target,'tuning_inceptv1.log')

    # Build without tuning logs and persist the compiled artifacts.
    graph, lib, params = build_module(mod, params, target)
    save_compiled_module(graph, lib, params, "model_inception")
# NOTE(review): `data`, `sym`, `params` and `target` are defined earlier in
# this script (outside this chunk) — verify before reusing in isolation.
shape = {'data': data.shape}
dtype_dict = {}

# convert nnvm to relay
print("convert nnvm symbols into relay function...")
from nnvm.to_relay import to_relay
func, params = to_relay(sym, shape, 'float32', params=params)

# optimization (constant/scale folding etc. at opt_level=2)
print("optimize relay graph...")
with tvm.relay.build_config(opt_level=2):
    func = tvm.relay.optimize(func, target, params)

# quantize with the default qconfig
print("apply quantization...")
from tvm.relay import quantize
with quantize.qconfig():
    func = quantize.quantize(func, params)

# Relay build
print("Compiling the model...")
print(func.astext(show_meta_data=False))
with tvm.relay.build_config(opt_level=3):
    graph, lib, params = tvm.relay.build(func, target=target, params=params)

# Save the model to a temporary tarball
tmp = util.tempdir()
lib_fname = tmp.relpath('model.tar')
lib.export_library(lib_fname)

# NNVM
# with nnvm.compiler.build_config(opt_level=2):
def quantize_model(args):
    """Build with relay.

    Loads an MXNet checkpoint named by ``args.model``, optionally runs it
    unquantized (``args.original``), otherwise quantizes it via the
    annotate/calibrate/realize pipeline, builds it, round-trips the
    compiled artifacts through disk, and returns (graph, lib, params, ctx).

    Side effects: writes deploy_lib.so, deploy_graph.json and
    deploy_param.params into ``thisdir``.
    """
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz
    img_size = 224
    data_shape = (args.batch_size, 3, img_size, img_size)
    # Load checkpoint epoch 0 and import into Relay.
    mx_sym, mx_args, mx_auxs = mx.model.load_checkpoint(args.model, 0)
    net, params = relay.frontend.from_mxnet(mx_sym, {"data": data_shape},
                                            arg_params=mx_args, aux_params=mx_auxs)
    target = args.target

    if args.original:
        # run original model
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        ctx = tvm.nd.context(target, 0)
        return graph, lib, params, ctx

    # constant folding and scale folding.
    # print('original')
    # print(net.astext(show_meta_data=False))
    with relay.build_config(opt_level=3):
        qgraph = relay.optimize(net, target, params)
    # print('after optimize')
    # print(qgraph.astext(show_meta_data=False))

    # Quantize pipeline: annotate -> calibrate -> (realize unless simulated).
    with qtz.qconfig(skip_k_conv=0,
                     nbit_input=args.nbit_input,
                     nbit_weight=args.nbit_input,
                     global_scale=args.global_scale,
                     dtype_input=args.dtype_input,
                     dtype_weight=args.dtype_input,
                     dtype_activation=args.dtype_output,
                     store_lowbit_output=False,
                     debug_enabled_ops=None):
        print(qtz.current_qconfig())
        qgraph = qtz.annotate(qgraph)
        # print('after annotate')
        # print(qgraph.astext(show_meta_data=False))
        qgraph = qtz.calibrate(qgraph)
        # print('after calibrate\n')
        # print(qgraph.astext(show_meta_data=False))
        if not args.simulated:
            qgraph = qtz.realize(qgraph)
            qgraph = relay.ir_pass.infer_type(qgraph)
            # print('after realize\n')
            # print(qgraph.astext(show_meta_data=False))

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(qgraph, target)

    ### save/load the graph, lib and params into separate files
    # save
    lib.export_library(os.path.join(thisdir, "deploy_lib.so"))
    with open(os.path.join(thisdir, "deploy_graph.json"), "w") as fo:
        fo.write(graph)
    with open(os.path.join(thisdir, "deploy_param.params"), "wb") as fo:
        fo.write(relay.save_param_dict(params))

    # load the just-saved artifacts back, so the returned values are the
    # round-tripped (deployable) forms rather than in-memory objects.
    graph = open(os.path.join(thisdir, "deploy_graph.json")).read()
    lib = tvm.module.load(os.path.join(thisdir, "deploy_lib.so"))
    params = bytearray(
        open(os.path.join(thisdir, "deploy_param.params"), "rb").read())
    ctx = tvm.nd.context(target, 0)
    return graph, lib, params, ctx