def init_embedding_processor():
    global mod2
    global mod3
    if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2'):
        global __t
        global graph_runtime
        import tvm as __t
        from tvm.contrib import graph_runtime
        loaded_lib = None
        if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2.tar.so'):
            loaded_lib = __t.module.load(DATA_RUNTIME_FOLDER + '/net2.tar.so')
        else:
            loaded_lib = __t.module.load(DATA_RUNTIME_FOLDER + '/net2.tar')
        loaded_json = open(DATA_RUNTIME_FOLDER + "/net2").read()
        loaded_params = bytearray(
            open(DATA_RUNTIME_FOLDER + "/net2.params", "rb").read())
        ctx = __t.cl(0)
        mod2 = graph_runtime.create(loaded_json, loaded_lib, ctx)
        mod2.load_params(loaded_params)
        return mod2
    elif os.path.isfile('/root/model-r50-am-lfw/model-0000.params'):
        global mx
        import mxnet as mx
        ctx = mx.cpu(0)
        mod3 = get_model(ctx, [112, 112], '/root/model-r50-am-lfw/model,0', 'fc1')
        print('no compiled TVM model found, using the MXNet backup model')
        return mod3

def check_device(target):
    num_step = n_num_step
    flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    scan_h_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    scan_c_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    Xi2h_np = np.random.normal(
        size=(num_step, batch_size, 4, num_hidden)).astype("float32")
    Wh2h_np = np.random.normal(
        size=(4, num_hidden, num_hidden)).astype("float32")
    scan_h_a = tvm.nd.array(scan_h_np, ctx)
    scan_c_a = tvm.nd.array(scan_c_np, ctx)
    Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
    Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    # measure time cost of second step.
    tstart = time.time()
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    tgap = time.time() - tstart
    print("Time cost=%g" % tgap)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A, B, C], device,
                    name="broadcast_binary" + "_" + typ)
    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)
    else:
        raise NotImplementedError
    lhs_nd = tvm.nd.array(lhs_npy, ctx)
    rhs_nd = tvm.nd.array(rhs_npy, ctx)
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
    for _ in range(1):
        foo(lhs_nd, rhs_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A, B], device, name="sum")
    # Test
    in_npy = np.random.uniform(size=in_shape).astype(np.float32)
    in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
    # ``type`` is the reduce-op name taken from the enclosing scope
    # (note that it shadows the builtin of the same name).
    if type == "sum":
        out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
    elif type == "argmax":
        out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
    elif type == "argmin":
        out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    data_tvm = tvm.nd.array(in_npy, ctx=ctx)
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
    for _ in range(1):
        foo(data_tvm, out_tvm)
    np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

class ModelAgent:
    ctx = tvm.cl(0)
    dtype = 'float32'

    def __init__(self):
        self.graph = open('shufflenet.json').read()
        self.lib = tvm.module.load("shufflenet.tar")
        self.params = bytearray(open("shufflenet.params", "rb").read())
        # Compute with GPU
        self.mod = graph_runtime.create(self.graph, self.lib, self.ctx)
        self.mod.load_params(self.params)

    def preprocess_image(self, image):
        image = image.resize((224, 224))
        image = np.array(image) / np.array([255, 255, 255])
        image -= np.array([0.485, 0.456, 0.406])
        image /= np.array([0.229, 0.224, 0.225])
        image = image.transpose((2, 0, 1))
        image = image[np.newaxis, :]
        return image

    def execute(self, inputs):
        inputs = self.preprocess_image(inputs)
        self.mod.set_input("input", tvm.nd.array(inputs.astype(self.dtype)))
        self.mod.run()
        outputs = self.mod.get_output(0)
        return outputs

def init_embedding_processor():
    global mod2
    global mod3
    if HAS_OPENCL == 'false':
        global mx
        import mxnet as mx
        print('no OpenCL support, initializing mxnet')
        mod2 = None
        if os.path.isfile(DATA_RUNTIME_FOLDER + '/model-0000.params'):
            ctx = mx.cpu(0)
            mod3 = get_model(ctx, [112, 112],
                             DATA_RUNTIME_FOLDER + '/model,0', 'fc1')
            print('backup model loaded')
            return mod3
    else:
        print('OpenCL support available')
        if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2'):
            global __t
            global graph_runtime
            try:
                import tvm as __t
                from tvm.contrib import graph_runtime
                loaded_lib = None
                if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2.tar.so'):
                    loaded_lib = __t.module.load(DATA_RUNTIME_FOLDER + '/net2.tar.so')
                else:
                    loaded_lib = __t.module.load(DATA_RUNTIME_FOLDER + '/net2.tar')
                loaded_json = open(DATA_RUNTIME_FOLDER + "/net2").read()
                loaded_params = bytearray(
                    open(DATA_RUNTIME_FOLDER + "/net2.params", "rb").read())
                ctx = __t.cl(0)
                mod2 = graph_runtime.create(loaded_json, loaded_lib, ctx)
                mod2.load_params(loaded_params)
                return mod2
            except Exception:
                # fall through to the MXNet backup model below
                print('error loading net2')
                mod2 = None
        if os.path.isfile(DATA_RUNTIME_FOLDER + '/model-0000.params'):
            global mx
            import mxnet as mx
            ctx = mx.cpu(0)
            mod3 = get_model(ctx, [112, 112],
                             DATA_RUNTIME_FOLDER + '/model,0', 'fc1')
            print('backup model loaded')
            return mod3
        elif os.path.isfile('/root/model-r50-am-lfw/model-0000.params'):
            global mx
            import mxnet as mx
            ctx = mx.cpu(0)
            mod3 = get_model(ctx, [112, 112],
                             '/root/model-r50-am-lfw/model,0', 'fc1')
            print('backup model loaded')
            return mod3

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    f = tvm.build(s, [A, B], device)
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A, B], device, name="expand_dims")
    data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
    out_npy = data_npy.reshape(out_shape)
    data_nd = tvm.nd.array(data_npy, ctx)
    out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
    foo(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A, B], device, name="reshape")
    data_npy = np.random.normal(size=src_shape).astype(A.dtype)
    out_npy = np.reshape(data_npy, newshape=dst_shape)
    data_nd = tvm.nd.array(data_npy, ctx)
    out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
    foo(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A, B], device, name="transpose")
    data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
    out_npy = data_npy.transpose(axes)
    data_nd = tvm.nd.array(data_npy, ctx)
    out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
    foo(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def __init__(self):
    ctx = tvm.cl(0)
    ffi = FFI()
    DATA_RUNTIME_FOLDER = os.getenv('DATA_RUNTIME_FOLDER', '/data/runtime')
    self.darknet_lib = __darknetffi__.dlopen(
        DATA_RUNTIME_FOLDER + '/model/yolo/libdarknet.so')
    self.net = self.darknet_lib.load_network(
        DATA_RUNTIME_FOLDER + "/model/yolo/yolo.cfg", ffi.NULL, 0)
    lib = tvm.module.load(DATA_RUNTIME_FOLDER + '/model/yolo/yolo.tar')
    graph = open(DATA_RUNTIME_FOLDER + "/model/yolo/yolo").read()
    params = bytearray(
        open(DATA_RUNTIME_FOLDER + "/model/yolo/yolo.params", "rb").read())
    self.mod = graph_runtime.create(graph, lib, ctx)
    self.mod.load_params(params)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate")
    data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype)
                 for shape in shapes]
    out_npy = np.concatenate(data_npys, axis=axis)
    data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
    out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
    foo(*(data_nds + [out_nd]))
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def __init__(self):
    ctx = tvm.cl(0)
    self.darknet_lib = __darknetffi__.dlopen('../../model/yolo/libdarknet.so')
    lib = tvm.module.load('../../model/yolo/yolov2.tar')
    graph = open("../../model/yolo/yolov2").read()
    params = bytearray(open("../../model/yolo/yolov2.params", "rb").read())
    self.mod = graph_runtime.create(graph, lib, ctx)
    self.mod.load_params(params)
    print("mod params loaded successfully")
    self.parked_car_boxes = None
    self.free_space_frames = 0
    self.frame_index = 0

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    foo = tvm.build(s, [A] + tensor_l, device, name="split")
    data_npy = np.random.normal(size=src_shape).astype(A.dtype)
    out_npys = np.split(data_npy, indices_or_sections, axis=axis)
    data_nd = tvm.nd.array(data_npy, ctx)
    out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype)
               for out_npy in out_npys]
    foo(*([data_nd] + out_nds))
    for out_nd, out_npy in zip(out_nds, out_npys):
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    f = tvm.build(s, [A, B, C], device)
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    # launch the kernel.
    n, m, l = nn, nn, nn
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    for i in range(2):
        f(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    with tvm.build_config(auto_unroll_max_step=32,
                          auto_unroll_min_depth=0,
                          unroll_explicit=False):
        func1 = tvm.build(s1, [A, W, B], device)
        func1(a, w, b)
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        func2 = tvm.build(s2, [A, W, C], device)
        func2(a, w, c)
        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

def check_device(target):
    num_step = n_num_step
    flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
    dev = tvm.cuda(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    scan_h_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32")
    scan_c_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32")
    Xi2h_np = np.random.normal(
        size=(num_step, batch_size, 4, num_hidden)).astype("float32")
    Wh2h_np = np.random.normal(
        size=(4, num_hidden, num_hidden)).astype("float32")
    scan_h_a = tvm.nd.array(scan_h_np, dev)
    scan_c_a = tvm.nd.array(scan_c_np, dev)
    Xi2h_a = tvm.nd.array(Xi2h_np, dev)
    Wh2h_a = tvm.nd.array(Wh2h_np, dev)
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    dev.sync()
    # measure time cost of second step.
    evaluator = flstm.time_evaluator(flstm.entry_name, dev, 1, repeat=1000)
    eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    print("Time cost=%g" % eval_result.mean)

def check_device(target):
    with tvm.transform.PassContext(
            config={
                "tir.UnrollLoop": {
                    "auto_max_step": 128,
                },
                "tir.detect_global_barrier": detect_global_barrier,
            }):
        f = tvm.build(s, [s_scan, Whh], target)
    dev = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    res_np = np.zeros(
        (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
    Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
    Whh_np[:] = 2.0 / n_num_hidden
    Whh_np[:, n_num_hidden // 2:] = 0
    res_a = tvm.nd.array(res_np, dev)
    Whh_a = tvm.nd.array(Whh_np, dev)
    # Skip first pass as it is compilation
    f(res_a, Whh_a)
    dev.sync()
    # measure time cost of second step.
    tstart = time.time()
    f(res_a, Whh_a)
    dev.sync()
    tgap = time.time() - tstart
    print("Time cost=%g" % tgap)
    # correctness
    if not SKIP_CHECK:
        res_gpu = res_a.asnumpy()
        res_cmp = np.ones_like(res_np).astype("float64")
        Whh_np = Whh_np.astype("float64")
        for t in range(1, n_num_step):
            res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
        for i in range(n_num_step):
            for j in range(n_num_hidden):
                if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                    print("%d, %d: %g vs %g" %
                          (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
        tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)

def check_device(target):
    with tvm.build_config(
            detect_global_barrier=detect_global_barrier,
            auto_unroll_max_step=128,
            unroll_explicit=False):
        f = tvm.build(s, [s_scan, Whh], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    res_np = np.zeros(
        (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
    Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
    Whh_np[:] = 2.0 / n_num_hidden
    Whh_np[:, n_num_hidden // 2:] = 0
    res_a = tvm.nd.array(res_np, ctx)
    Whh_a = tvm.nd.array(Whh_np, ctx)
    # Skip first pass as it is compilation
    f(res_a, Whh_a)
    ctx.sync()
    # measure time cost of second step.
    tstart = time.time()
    f(res_a, Whh_a)
    ctx.sync()
    tgap = time.time() - tstart
    print("Time cost=%g" % tgap)
    # correctness
    if not SKIP_CHECK:
        res_gpu = res_a.asnumpy()
        res_cmp = np.ones_like(res_np).astype("float64")
        Whh_np = Whh_np.astype("float64")
        for t in range(1, n_num_step):
            res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
        for i in range(n_num_step):
            for j in range(n_num_hidden):
                if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                    print("%d, %d: %g vs %g" %
                          (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
        tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)

def check_device(target):
    num_step = n_num_step
    flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    scan_h_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    scan_c_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    Xi2h_np = np.random.normal(
        size=(num_step, batch_size, 4, num_hidden)).astype("float32")
    Wh2h_np = np.random.normal(
        size=(4, num_hidden, num_hidden)).astype("float32")
    scan_h_a = tvm.nd.array(scan_h_np, ctx)
    scan_c_a = tvm.nd.array(scan_c_np, ctx)
    Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
    Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    # measure time cost of second step.
    evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
    eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    print("Time cost=%g" % eval_result.mean)

def opencl_add():
    n = tvm.var("n")
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    print(type(C))
    s = tvm.create_schedule(C.op)
    bx, tx = s[C].split(C.op.axis[0], factor=64)
    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
    fadd_cl = tvm.build(s, [A, B, C], "opencl", name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    ctx = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd_cl(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

#
######################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation features into multiple backends;
# we can also generate OpenCL code, or LLVM code that runs on CPU backends.
#
# The following code blocks generate OpenCL code, create an array on an
# OpenCL device, and verify the correctness of the code.
#
if tgt.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    ctx = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

######################################################################
# Summary
# -------
# This tutorial provides a walk-through of the TVM workflow using
# a vector add example. The general workflow is:
#
# - Describe your computation via a series of operations.
# - Describe how we want to compute using schedule primitives.

# any GPUs, provided that you have compiled the code for that GPU.

################################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation features into multiple backends. We can also
# generate OpenCL code, or LLVM code that runs on CPU backends.
#
# The following code blocks generate OpenCL code, create an array on an
# OpenCL device, and verify the correctness of the code.

if tgt.kind.name.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    dev = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# .. note:: TE Scheduling Primitives
#
#    TVM includes a number of different scheduling primitives:
#
#    - split: splits a specified axis into two axes by the defined factor.
#    - tile: tiles will split a computation across two axes by the defined
#      factors.
#    - fuse: fuses two consecutive axes of one computation.
#
#    A minimal sketch of these three primitives follows below.
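################################################################################
# The sketch below is illustrative and not part of the original tutorial: it
# assumes a recent TVM build that exposes the ``te`` module (as the code above
# does), and the tensor/axis names (``A2``, ``B2``, ``s2``) are made up for
# the example.

import tvm
from tvm import te

n = 1024
A2 = te.placeholder((n, n), name="A2")
B2 = te.compute((n, n), lambda i, j: A2[i, j] + 1.0, name="B2")
s2 = te.create_schedule(B2.op)

# tile: block the two axes into 8x8 tiles (two splits plus a reorder in one step).
xo, yo, xi, yi = s2[B2].tile(B2.op.axis[0], B2.op.axis[1], x_factor=8, y_factor=8)
# fuse: merge the two inner tile axes into a single 64-iteration axis.
xy = s2[B2].fuse(xi, yi)
# split: break the fused axis back into outer/inner loops with an inner factor of 4.
outer, inner = s2[B2].split(xy, factor=4)

# Lower to TIR and print it to inspect the loop nest the schedule produces.
print(tvm.lower(s2, [A2, B2], simple_mode=True))
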
sys.path.insert(0, os.path.join(thisdir, 'tvm_local/nnvm/python'))
sys.path.insert(0, os.path.join(thisdir, 'tvm_local/topi/python'))

import mxnet as mx
import nnvm
import tvm
import numpy as np
import time

print(mx.__file__)
print(nnvm.__file__)
print(tvm.__file__)

target = 'opencl'
target_to_device = {
    'opencl': tvm.cl(0),
    'llvm': tvm.cpu(0),
    'cuda': tvm.gpu(0),
}

######################################################################
# Download the model from the Gluon Model Zoo
# --------------------------------------------
# In this section, we download a pretrained imagenet model and classify an
# image.
from mxnet.gluon.model_zoo.vision import get_model
from symbols.mobilenetv2 import get_symbol
from PIL import Image
from matplotlib import pyplot as plt

model_name = 'models/mobilenetv2-1_0'
img_name = 'data/cat.jpg'

def run_case(dtype, image):
    # Check image
    import os
    import json
    import sys

    STAT_REPEAT = os.environ.get('STAT_REPEAT', '')
    if STAT_REPEAT == '' or STAT_REPEAT is None:
        STAT_REPEAT = 10
    STAT_REPEAT = int(STAT_REPEAT)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    synset = eval(open(os.path.join(CATEG_FILE)).read())

    files = []
    val = {}

    if image is not None and image != '':
        files = [image]
    else:
        ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '')
        if ipath == '':
            print('Error: path to ImageNet dataset is not set!')
            exit(1)
        if not os.path.isdir(ipath):
            print('Error: path to ImageNet dataset was not found!')
            exit(1)

        # get all files
        d = os.listdir(ipath)
        for x in d:
            x1 = x.lower()
            if x1.startswith('ilsvrc2012_val_'):
                files.append(os.path.join(ipath, x))
        files = sorted(files)

        STAT_REPEAT = 1

        # Get correct labels
        ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '')
        fval = open(ival).read().split('\n')
        val = {}
        for x in fval:
            x = x.strip()
            if x != '':
                y = x.split(' ')
                val[y[0]] = int(y[1])

    # FGG: set timers
    import time
    timers = {}

    # Get first shape (expect that will be the same for all)
    dt = time.time()
    image = Image.open(os.path.join(files[0])).resize((224, 224))
    if image.mode != 'RGB':
        image = image.convert('RGB')
    timers['execution_time_load_image'] = time.time() - dt

    dt = time.time()
    img = transform_image(image)
    timers['execution_time_transform_image'] = time.time() - dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model
    from mxnet.gluon.utils import download
    model_path = os.environ['CK_ENV_MODEL_MXNET']
    model_id = os.environ['MXNET_MODEL_ID']
    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype != 'float32':
        params = {k: tvm.nd.array(v.asnumpy().astype(dtype))
                  for k, v in params.items()}

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.mali(),
            shape={"data": data_shape},
            params=params,
            dtype=dtype,
            target_host=None)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    ctx = tvm.cl(0)
    rlib = lib
    rparams = params

    # create graph runtime
    dt = time.time()
    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(
        np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0
    # defaults so the per-image timer dump below never hits an undefined
    # name when no validation labels are available
    accuracy_top1 = 0.0
    accuracy_top5 = 0.0

    # Shuffle files and pre-read JSON with accuracy to continue aggregating it
    # otherwise if FPGA board hangs, we can continue checking random images ...
    import random
    random.shuffle(files)

    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        x = json.load(open('aggregate-ck-timer.json'))
        if 'total_images' in x:
            total_images = x['total_images']
        if 'correct_images_top1' in x:
            correct_images_top1 = x['correct_images_top1']
        if 'correct_images_top5' in x:
            correct_images_top5 = x['correct_images_top5']

    dt1 = time.time()
    for f in files:
        total_images += 1

        print('===============================================================================')
        print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' + f)

        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**rparams)

        # perform some warm up runs
        # print("warm up..")
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs
        tvm_output = module.get_output(0, tvm.nd.empty((1000,), dtype, ctx))

        top1 = np.argmax(tvm_output.asnumpy())

        top5 = []
        atop5 = get_top5(tvm_output.asnumpy())

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        for q in atop5:
            x = q[1]
            y = synset[x]
            top5.append(x)
            print(x, y)

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val) > 0:
            top = val[os.path.basename(f)]

            correct_top1 = False
            if top == top1:
                correct_top1 = True
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')
            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1: ' + ('%.5f' % accuracy_top1))

            correct_top5 = False
            if top in top5:
                correct_top5 = True
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')
            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5: ' + ('%.5f' % accuracy_top5))

        print('')
        print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) + ' sec.')

        timers['total_images'] = total_images
        timers['correct_images_top1'] = correct_images_top1
        timers['accuracy_top1'] = accuracy_top1
        timers['correct_images_top5'] = correct_images_top5
        timers['accuracy_top5'] = accuracy_top5
        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)
        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()

    return

if os.path.isfile(test_image_npy):
    print("File {} exists, skip image preprocessing.".format(test_image_npy))
    img_data = np.load(test_image_npy)
else:
    import cv2
    test_image_path = test_image
    image = cv2.imread(test_image_path)
    img_data = cv2.resize(image, (dshape[2], dshape[3]))
    img_data = img_data[:, :, (2, 1, 0)].astype(np.float32)
    img_data -= np.array([123, 117, 104])
    img_data = np.transpose(np.array(img_data), (2, 0, 1))
    img_data = np.expand_dims(img_data, axis=0)
    np.save(test_image_npy, img_data.astype(dtype))

ctx = tvm.cl()
target = "opencl"

#base = "deploy_ssd_resnet50_512/{}/".format(target)
#base = "deploy_ssd_inceptionv3_512/{}/".format(target)
#base = "deploy_ssd_mobilenet_512/{}/".format(target)
#base = "deploy_ssd_mobilenet_608/{}/".format(target)
#base = "cpu-model/"
base = "./"

path_lib = base + "model.so"
path_graph = base + "model.json"
path_param = base + "model.params"

graph = open(path_graph).read()
params = bytearray(open(path_param, "rb").read())
lib = tvm.module.load(path_lib)

#
######################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation features into multiple backends;
# we can also generate OpenCL code, or LLVM code that runs on CPU backends.
#
# The following code blocks generate OpenCL code, create an array on an
# OpenCL device, and verify the correctness of the code.
#
if tgt == "opencl":
    fadd_cl = tvm.build(s, [A, B, C], "opencl", name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    ctx = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

######################################################################
# Summary
# -------
# This tutorial provides a walk-through of the TVM workflow using
# a vector add example. The general workflow is:
#
# - Describe your computation via a series of operations.
# - Describe how we want to compute using schedule primitives.

def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    # Build the kernel
    f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
    f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
    f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
    # Prepare data
    input_tvm = tvm.nd.array(input_np, ctx)
    filter_tvm = tvm.nd.array(filter_np, ctx)
    scale_tvm = tvm.nd.array(scale_np, ctx)
    shift_tvm = tvm.nd.array(shift_np, ctx)
    depthwise_conv2d_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                 dtype=DepthwiseConv2d.dtype), ctx)
    scale_shift_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(ScaleShift.shape),
                 dtype=ScaleShift.dtype), ctx)
    relu_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
    # Measure time cost of kernel 1 (depthwise_conv2d)
    timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
    tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
    # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
    timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
    tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                      scale_shift_tvm).mean
    # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
    timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
    tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                      relu_tvm).mean
    print("Input shape = " + str(get_const_tuple(Input.shape)))
    print("Filter shape = " + str(get_const_tuple(Filter.shape)))
    print("Stride = (%d, %d)" % (stride_h, stride_w))
    print("padding = %s\n" % padding)
    print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
    print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
          (tcost_1 * 1e6))
    print("average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" %
          (tcost_2 * 1e6))
    print("average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" %
          (tcost_3 * 1e6))
    # correctness
    depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
        input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
    scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
    for c in range(in_channel * channel_multiplier):
        scale_shift_scipy[:, c, :, :] = \
            depthwise_conv2d_scipy[:, c, :, :] * scale_np[c] + shift_np[c]
    relu_scipy = np.maximum(scale_shift_scipy, 0)
    np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                               depthwise_conv2d_scipy, rtol=1e-5)
    np.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                               scale_shift_scipy, rtol=1e-5)
    np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
    print("success")

def run_case(model, dtype):
    # load model
    if model == 'vgg16':
        net, params = nnvm.testing.vgg.get_workload(
            num_layers=16, batch_size=1, image_shape=image_shape, dtype=dtype)
    elif model == 'resnet18':
        net, params = nnvm.testing.resnet.get_workload(
            num_layers=18, batch_size=1, image_shape=image_shape, dtype=dtype)
    elif model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape, dtype=dtype)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(model))

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.mali(),
            shape={"data": data_shape},
            params=params,
            dtype=dtype,
            target_host=args.target_host)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if args.host is not None:
        remote = rpc.connect(args.host, args.port)
        remote.upload(lib_fname)
        ctx = remote.cl(0)
        rlib = remote.load_module('net.tar')
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    else:
        ctx = tvm.cl(0)
        rlib = lib
        rparams = params

    # create graph runtime
    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(
        np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)

    # benchmark
    # print("============================================================")
    # print("model: %s, dtype: %s" % (model, dtype))

    # the num of runs for warm up and test
    num_warmup = 10
    num_test = 60
    if model == 'mobilenet':
        # mobilenet is fast, need more runs for stable measurement
        num_warmup *= 5
        num_test *= 5

    # perform some warm up runs
    # print("warm up..")
    warm_up_timer = module.module.time_evaluator("run", ctx, num_warmup)
    warm_up_timer()

    # test
    # print("test..")
    ftimer = module.module.time_evaluator("run", ctx, num_test)
    prof_res = ftimer()
    # print("cost per image: %.4fs" % prof_res.mean)

    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          (model, dtype, prof_res.mean))

def verify_workloads(ctx, n_times=1, target=None, target_host=None,
                     remote=None):
    for item in workloads:
        cost, gflops = verify_conv2d_nchw(*item, ctx=ctx, target=target,
                                          target_host=target_host,
                                          remote=remote)
        print("%-30s %.6f %.6f" % (item, cost, gflops))

#def tune_workloads(ctx, n_times=1, target=None, target_host=None, remote=None):
#    ret = []
#    for item in workloads:
#        cost, gflops, config = tune_conv2d_nchw(*item, ctx=ctx, target_host=target_host, remote=remote)
#        print(item, cost, gflops, config)
#        ret.append([item, config])
#    for item in ret:
#        print(item, config)

if __name__ == "__main__":
    host = os.environ["TVM_OPENCL_DEVICE_HOST"]
    port = 9090
    #remote = rpc.connect(host, port)

    #target_host = "llvm -target=aarch64-linux-gnu -mattr=+neon"
    target_host = None

    #verify_workloads(remote.cl(), 1000, tvm.target.mali(), target_host, remote)
    verify_workloads(tvm.cl(), 1000,
                     tvm.target.create("opencl -device=mercytest"),
                     target_host, None)