def test_dense():
    shape = (16, 1024)
    weight_shape = (256, 1024)
    bias_shape = (256, )
    inputs = nnvm.symbol.Variable("inputs")
    weights = nnvm.symbol.Variable("weights")
    bias = nnvm.symbol.Variable("bias")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.dense(data=inputs, weight=weights, use_bias=0, units=256)
    z1 = nnvm.symbol.relu(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs": shape, "weights": weight_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target,
                        shape={"inputs": shape, "weights": weight_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=shape)
    b_np = np.random.random(size=weight_shape)
    m.set_input(**{"inputs": a_np, "weights": b_np})
    m.run()
    gt = a_np.dot(b_np.transpose())
    out = m.get_output(0, out=tvm.nd.empty((16, 256)))
    np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-5)
    print("tests")
    print(out)
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_elemwise_mul():
    env = nnpu.get_env()
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    inputs2 = nnvm.symbol.Variable("inputs2")
    shape = (16, 6, 16)
    z = nnvm.symbol.elemwise_mul(inputs1, inputs2)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs1": shape, "inputs2": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target,
                        shape={"inputs1": shape, "inputs2": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random((16, 6, 16))
    b_np = np.random.random((16, 6, 16))
    print("a_np : ")
    print(a_np)
    print("b_np : ")
    print(b_np)
    m.set_input(**{"inputs1": a_np, "inputs2": b_np})
    gt = (a_np.astype("float32") * b_np.astype("float32")).astype("float32")
    m.run()
    out = m.get_output(0, out=tvm.nd.empty((16, 6, 16)))
    np.testing.assert_allclose(out.asnumpy(), gt)
    print("elemwise_mul tests success")
    print(out)
def test_batch_norm():
    input_shape = (1, 4, 4, 16)
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    inputs2 = nnvm.symbol.Variable("inputs2")
    z1 = nnvm.symbol.relu(inputs1)
    # z2 = nnvm.symbol.relu(z1)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs1": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs1": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=(1, 4, 4, 16), low=-32, high=32).astype(np.float32)
    b_np = np.random.uniform(size=(1, 16), low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(inputs1=a_np)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty((1, 4, 4, 16)))
    print(out.asnumpy())
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((4, 16), env.cfg['dtype_w'], 'a')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute((4, 1), lambda i, j: tvm.sum(a_buf[i, k], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    k1 = tvm.reduce_axis((0, 16), 'k1')
    max_buf = tvm.compute((4, 1), lambda i, j: tvm.max(a_buf[i, k1], axis=k1), 'max_buf')
    sph.MarkScope(max_buf)
    max_host, max_dram = nnpu.utils.CopyBufToH(max_buf, 'max', sph)

    k2 = tvm.reduce_axis((0, 16), 'k2')
    min_buf = tvm.compute((4, 1), lambda i, j: tvm.min(a_buf[i, k2], axis=k2), 'min_buf')
    sph.MarkScope(min_buf)
    min_host, min_dram = nnpu.utils.CopyBufToH(min_buf, 'min', sph)

    # create schedule and tensorize
    s = tvm.create_schedule([c_host.op, max_host.op, min_host.op])
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[1], env.intrins.get('VReduceSum', mode='w'))
    s[max_buf].tensorize(s[max_buf].op.axis[1], env.intrins.get('VReduceMax', mode='w'))
    s[min_buf].tensorize(s[min_buf].op.axis[1], env.intrins.get('VReduceMin', mode='w'))

    # build
    print(nnpu.lower(s, [a, c_host, max_host, min_host], simple_mode=True))
    func = nnpu.build(s, [a, c_host, max_host, min_host], 'nnpu', 'llvm', name='nnpu_func')

    # create data and run
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    max_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    min_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    func(a_nd, c_nd, max_nd, min_nd)

    # check results
    gt = np.sum(a_np, axis=(1,), keepdims=True)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    np.testing.assert_allclose(max_nd.asnumpy(), np.max(a_np, axis=(1,), keepdims=True))
    np.testing.assert_allclose(min_nd.asnumpy(), np.min(a_np, axis=(1,), keepdims=True))
    print('test passed')
def test():
    env = nnpu.get_env()
    shape = (8, 16)
    a = tvm.placeholder(shape, env.cfg['dtype_n'], 'a')
    b = tvm.placeholder(shape, env.cfg['dtype_n'], 'b')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    dtype_w = env.cfg['dtype_w']
    k = tvm.reduce_axis((0, 16), 'k')
    dot_buf = tvm.compute(
        (8, ),
        lambda i: tvm.sum(a_buf[i, k].astype(dtype_w) * b_buf[i, k].astype(dtype_w), k),
        'dot_buf')
    sph.MarkScope(dot_buf)
    dot_host, dot_dram = nnpu.utils.CopyBufToH(dot_buf, 'sum', sph)

    s = tvm.create_schedule(dot_host.op)
    sph.Transform(s)
    s[dot_buf].tensorize(s[dot_buf].op.axis[0],
                         env.intrins.get('MRowDot', shape=shape, mode='inc'))

    print(nnpu.lower(s, [a, b, dot_host], simple_mode=True))
    func = nnpu.build(s, [a, b, dot_host], 'nnpu', 'llvm', name='nnpu_func')
    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(8, 16), dtype=a.dtype, low=-32, high=32)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=(8, 16), dtype=b.dtype, low=-32, high=32)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros((8, )).astype(dot_host.dtype), ctx)
    func(a_nd, b_nd, c_nd)

    # print('a = ')
    # print(a_np)
    # print('b = ')
    # print(b_np)
    print(c_nd.asnumpy())
    print('ground truth is')
    gt = np.multiply(a_np, b_np, dtype=dot_host.dtype)
    gt = np.sum(gt, axis=1)
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
def test_log():
    env = nnpu.get_env()
    shape = (1, 22, 22, 16)
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    z = nnvm.symbol.log(inputs)
    z1 = nnvm.symbol.exp(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(shape)
    print(a_np)
    m.set_input(**{"inputs": a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(shape))
    gt = np.exp(np.log(a_np.astype("float32")).astype("float32")).astype("float32")
    print(out)
    np.testing.assert_allclose(out.asnumpy(), gt)
    print("log tests success")
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_relu():
    shape = (2, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.relu(inputs)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=(2, 16)).astype("float32") - 0.5
    m.set_input(**{'inputs': a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty((2, 16)))
    print(a_np)
    print(out.dtype)
    print(out)
    np.testing.assert_allclose(out.asnumpy(), np.maximum(a_np, 0))
    print("tests")
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_conv2d():
    input_shape = (1, 16, 10, 64)
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    inputs1 = nnvm.symbol.Variable("inputs1")
    z1 = nnvm.symbol.conv2d(data=inputs, channels=64, kernel_size=(3, 3),
                            padding=(0, 0), use_bias=False,
                            layout='NHWC', kernel_layout='HWOI')
    z2 = nnvm.symbol.sigmoid(z1)
    z = nnvm.symbol.elemwise_add(z2, inputs1)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs": input_shape, "inputs1": (1, 14, 8, 64)},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    b_np = np.random.uniform(size=(1, 14, 8, 64), low=-32, high=32).astype(np.float32)
    module.set_input(inputs=a_np)
    module.run()
    print(deploy_graph.ir())
    out = module.get_output(0, out=tvm.nd.empty((1, 14, 8, 64)))
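# Note on test_conv2d() above: the graph adds `inputs1` element-wise to the
# sigmoid output, but `b_np` is never fed via set_input and, in the nnpu
# branch, the shape of "inputs1" is not passed to nnvm.compiler.build. Before
# a numerical check against a NumPy conv2d reference could be added, both of
# those would need to be supplied (this is an observation, not a change to the
# test's behaviour).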
def test_onemore():
    shape = (1, 32, 32, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z1 = nnvm.symbol.relu(inputs)
    z = nnvm.symbol.sqrt(z1)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=(1, 32, 32, 16))
    m.set_input(**{'inputs': a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty((1, 32, 32, 16)))
    print(out)
    print(compute_graph.ir())
    print(deploy_graph.ir())
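# test_onemore() above only prints the relu -> sqrt output. A hedged NumPy
# reference that could be used to check it numerically; the helper name
# relu_sqrt_ref is introduced here for illustration only, and np is assumed to
# be numpy as imported at the top of the file.
def relu_sqrt_ref(x):
    """Reference for nnvm.symbol.relu followed by nnvm.symbol.sqrt."""
    return np.sqrt(np.maximum(x, 0).astype("float32"))

# e.g. np.testing.assert_allclose(out.asnumpy(), relu_sqrt_ref(a_np), rtol=1e-5)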
def test():
    env = nnpu.get_env()
    shape = (16, 16)
    flatten_shape = (shape[0] * shape[1], )
    a = tvm.placeholder(flatten_shape, env.cfg['dtype_n'], 'a')
    b = tvm.placeholder(flatten_shape, env.cfg['dtype_n'], 'b')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    sum_buf = tvm.compute(flatten_shape, lambda i: a_buf[i] + b_buf[i], 'sum_buf')
    sph.MarkScope(sum_buf)
    sum_host, sum_dram = nnpu.utils.CopyBufToH(sum_buf, 'sum', sph)

    s = tvm.create_schedule([sum_host.op])
    sph.Transform(s)
    xo, xi = s[sum_buf].split(sum_buf.op.axis[0], 16)
    s[sum_buf].tensorize(xo, env.intrins.get('MAddM', shape=shape, mode='n'))

    print(nnpu.lower(s, [a, b, sum_host], simple_mode=True))
    func = nnpu.build(s, [a, b, sum_host], 'nnpu', 'llvm', name='nnpu_exp')
    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=flatten_shape, dtype=a.dtype, low=0, high=23)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=flatten_shape, dtype=b.dtype, low=0, high=23)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros(flatten_shape).astype(sum_host.dtype), ctx)
    func(a_nd, b_nd, c_nd)

    print('a = ')
    print(a_np)
    print('b = ')
    print(b_np)
    print('a + b = ')
    print(c_nd.asnumpy())
    print("numpy ground truth is")
    print(a_np + b_np)
    np.testing.assert_allclose(c_nd.asnumpy(), a_np + b_np)
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((16, 16), env.cfg['dtype_n'], 'a')
    sph = ScheduleProcHelper()
    Imm = tvm.const(7, env.cfg['dtype_n'])
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    add_buf = tvm.compute((16, 16), lambda i, j: Imm + a_buf[i][j], 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    dtype_w = env.cfg['dtype_w']
    mul_buf = tvm.compute(
        (16, 16), lambda i, j: a_buf[i][j].astype(dtype_w) * Imm.astype(dtype_w), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    rsub_buf = tvm.compute((16, 16), lambda i, j: Imm - a_buf[i][j], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    s = tvm.create_schedule([add_host.op, mul_host.op, rsub_host.op])
    sph.Transform(s)
    s[add_buf].tensorize(s[add_buf].op.axis[0],
                         env.intrins.get('MAddI', shape=(16, 16), imm_value=Imm.value, mode='n'))
    s[mul_buf].tensorize(s[mul_buf].op.axis[0],
                         env.intrins.get('MMulI', shape=(16, 16), imm_value=Imm.value, mode='inc'))
    s[rsub_buf].tensorize(s[rsub_buf].op.axis[0],
                          env.intrins.get('ISubM', shape=(16, 16), imm_value=Imm.value, mode='n'))

    print(nnpu.lower(s, [a, add_host, mul_host, rsub_host], simple_mode=True))
    func = nnpu.build(s, [a, add_host, mul_host, rsub_host], 'nnpu', 'llvm', name='nnpu_vmuli')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, 16), dtype=a.dtype, low=3, high=100)
    a_nd = tvm.nd.array(a_np, ctx)
    add_nd = tvm.nd.array(np.zeros((16, 16)).astype(add_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((16, 16)).astype(mul_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((16, 16)).astype(rsub_host.dtype), ctx)
    func(a_nd, add_nd, mul_nd, rsub_nd)

    print(a_nd.asnumpy())
    print('add result is: ')
    print(add_nd.asnumpy())
    np.testing.assert_allclose(add_nd.asnumpy(), a_np + Imm.value)
    print('mul result is: ')
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np.astype(dtype_w) * Imm.value)
    print('rsub result is: ')
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value - a_np)
    print('test passed')
def test():
    env = nnpu.get_env()
    shape = (16, )
    bigshape = (128, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    assert bigshape[0] % shape[0] == 0, 'the big vctr size is wrong'
    n_sheet = bigshape[0] // shape[0]

    sph = ScheduleProcHelper()
    a = tvm.placeholder(bigshape, dtype_n, 'a')
    b = tvm.placeholder(bigshape, dtype_n, 'b')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    strop = 'VAddV'
    c_buf = tvm.compute(bigshape, lambda *i: a_buf(*i) + b_buf(*i), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'sum', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    # tensorize
    xo, xi = s[c_buf].split(c_buf.op.axis[0], factor=shape[0])
    s[c_buf].reorder(xo, xi)
    s[c_buf].tensorize(xi, env.intrins.get(strop, mode='n'))

    print(nnpu.lower(s, [a, b, c_host], simple_mode=True))
    func = nnpu.build(s, [a, b, c_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=bigshape, dtype=a.dtype, low=-4, high=4)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=bigshape, dtype=b.dtype, low=-4, high=4)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros(bigshape, dtype=c_host.dtype), ctx)
    func(a_nd, b_nd, c_nd)

    print(strop)
    print(c_nd.asnumpy())
    gt = a_np + b_np
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
def test_max_pool2d():
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    target_host = "llvm"
    inputs = nnvm.symbol.Variable("inputs")
    shape = (1, 224, 224, 16)
    kernels = nnvm.symbol.Variable("kernels")
    kernel_shape = (2, 2)
    z = nnvm.symbol.avg_pool2d(inputs, pool_size=(2, 2), strides=(1, 1), layout="NHWC")
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape}, dtype="float32")
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape}, dtype="float32")
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=(1, 224, 224, 16))
    m.set_input(**{"inputs": a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty((1, 223, 223, 16)))
    gt = avg_pooling((1, 224, 224, 16), (1, 223, 223, 16), (2, 2), a_np, (1, 1), "float32")
    np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-7)
    print("avg_pool2d tests success")
    print(gt)
    print(out)
    print("end")
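# The test above relies on an `avg_pooling` NumPy reference that is not defined
# in this section. A minimal sketch matching the call signature used above,
# avg_pooling(in_shape, out_shape, pool_size, data, strides, dtype), for NHWC
# layout; this is an assumption about the helper, not the project's own
# implementation (np is assumed to be numpy, as elsewhere in these tests).
def avg_pooling(in_shape, out_shape, pool_size, data, strides, dtype):
    n, oh, ow, c = out_shape
    kh, kw = pool_size
    sh, sw = strides
    out = np.zeros(out_shape, dtype=dtype)
    for i in range(oh):
        for j in range(ow):
            # average over one pooling window for all batches and channels
            window = data[:, i * sh:i * sh + kh, j * sw:j * sw + kw, :]
            out[:, i, j, :] = np.mean(window, axis=(1, 2), dtype=dtype)
    return out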
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    bigshape = (4, 64)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    sph = ScheduleProcHelper()
    a = tvm.placeholder(bigshape, dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    str_op = 'VAddMerge'
    k = tvm.reduce_axis((0, 4), 'k')
    c_buf = tvm.compute((64, ), lambda i: tvm.sum(a_buf[k, i], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    # tensorize
    ko, ki = s[c_buf].split(c_buf.op.reduce_axis[0], factor=1)
    xo, xi = s[c_buf].split(c_buf.op.axis[0], factor=shape[0])
    s[c_buf].reorder(xo, ko, ki, xi)
    # s[c_buf].tensorize(ki, env.intrins.get(str_op, mode='n'))

    print(nnpu.lower(s, [a, c_host], simple_mode=True))
    exit()
    func = nnpu.build(s, [a, c_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=bigshape, dtype=a.dtype, low=-4, high=4)
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros((64, ), dtype=c_host.dtype), ctx)
    func(a_nd, c_nd)

    print(str_op)
    print(c_nd.asnumpy())
    gt = np.sum(a_np, axis=0, dtype=dtype_w)
    print('ground truth=')
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((4, 16), 'int16', 'a')
    b = tvm.placeholder((16, ), 'int16', 'b')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        (4, 1), lambda i, j: tvm.sum(a_buf[i, k] * b_buf[k], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    print(s[c_buf])
    s[c_buf].tensorize(s[c_buf].op.axis[1], env.intrins.get('VDotV', mode='w'))

    print(nnpu.lower(s, [a, b, c_host], simple_mode=True))
    func = nnpu.build(s, [a, b, c_host], 'nnpu', 'llvm', name='nnpu_func')
    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=(16, ), dtype=b.dtype, low=0, high=64)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    func(a_nd, b_nd, c_nd)

    print(c_nd.asnumpy())
    print("numpy ground truth is")
    print(np.dot(a_np, b_np))
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    a = tvm.placeholder((4, 4, 16), 'int16', 'a')
    # b = tvm.placeholder((16, ), 'int16', 'b')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    # b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    k = tvm.reduce_axis((0, 4), 'k0')
    c_buf = tvm.compute((4, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    ko, ki = s[c_buf].split(c_buf.op.reduce_axis[0], factor=1)
    s[c_buf].reorder(c_buf.op.axis[0], ko, ki, c_buf.op.axis[1])
    s[c_buf].tensorize(ki, env.intrins.get('VAddMerge', mode='w', nDim=3))

    print(nnpu.lower(s, [a, c_host], simple_mode=True))
    func = nnpu.build(s, [a, c_host], 'nnpu', 'llvm', name='nnpu_exp')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 4, 16), dtype=a.dtype, low=-4000, high=4000)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros((4, 16)).astype(c_host.dtype), ctx)
    func(a_nd, c_nd)

    print(c_nd.asnumpy())
    print("numpy ground truth is")
    gt = np.sum(a_np, axis=0)
    print(gt)
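# The test above prints the NumPy ground truth but never asserts against it.
# A hedged final check that could be appended at the end of test(), using the
# c_nd and gt already computed there:
#   np.testing.assert_allclose(c_nd.asnumpy(), gt)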
def test():
    env = nnpu.get_env()
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    shape = (4, 16)
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder((16, ), dtype_n, 'b')
    sph = ScheduleProcHelper()
    a_buf, _ = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, _ = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    sum_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] + b_buf[j], 'sum_buf')
    sph.MarkScope(sum_buf)
    sum_host, _ = nnpu.utils.CopyBufToH(sum_buf, 'sum', sph)

    sub_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] - b_buf[j], 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, _ = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute(
        shape, lambda i, j: a_buf[i, j].astype(dtype_w) * b_buf[j].astype(dtype_w), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([sum_host.op, sub_host.op, mul_host.op])
    sph.Transform(s)
    s[sum_buf].pragma(sum_buf.op.axis[0], 'nnpu.vector',
                      str({'code': 'matrix-vector', 'shape': shape}))
    s[sub_buf].pragma(sub_buf.op.axis[0], 'nnpu.vector',
                      str({'code': 'matrix-vector', 'shape': shape}))
    s[mul_buf].pragma(mul_buf.op.axis[0], 'nnpu.vector',
                      str({'code': 'matrix-vector', 'shape': shape}))

    print(nnpu.lower(s, [a, b, sum_host, sub_host, mul_host], simple_mode=True))
    func = nnpu.build(s, [a, b, sum_host, sub_host, mul_host], 'nnpu', 'llvm', name='nnpu_func')
    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop code: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=(16, ), dtype=b.dtype, low=0, high=64)
    b_nd = tvm.nd.array(b_np, ctx)
    sum_nd = tvm.nd.array(np.zeros(shape).astype(sum_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros(shape).astype(sub_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros(shape).astype(mul_host.dtype), ctx)
    func(a_nd, b_nd, sum_nd, sub_nd, mul_nd)

    gt = a_np + b_np
    np.testing.assert_allclose(sum_nd.asnumpy(), gt)
    gt = a_np - b_np
    np.testing.assert_allclose(sub_nd.asnumpy(), gt)
    gt = a_np.astype(dtype_w) * b_np
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
    print('test passed')
def test():
    env = nnpu.get_env()
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    shape = (4, 64)
    # nvctr_unit = env.cfg['vector_unit']['size']
    nvctr_unit = 32
    # assert shape[0] % nvctr_unit == 0, 'error'
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder(shape, dtype_n, 'b')
    sph = ScheduleProcHelper()
    b_scope = 'buffer0'
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph, dst_scope=b_scope)

    c_buf = tvm.compute(shape, lambda *i: a_buf(*i) + b_buf(*i), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    mul_buf = tvm.compute(
        shape, lambda *i: a_buf(*i).astype(dtype_w) * b_buf(*i).astype(dtype_w), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    gtm_buf = tvm.compute(shape, lambda *i: tvm.max(a_buf(*i), b_buf(*i)), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([c_host.op, mul_host.op, gtm_host.op])
    sph.Transform(s)
    # x = s[c_buf].fuse(*c_buf.op.axis)
    # xo, xi = s[c_buf].split(x, factor=nvctr_unit)
    params = dict()
    params['code'] = 'binary'
    params['size'] = nvctr_unit

    x = s[c_buf].fuse(*c_buf.op.axis)
    xo, xi = s[c_buf].split(x, factor=nvctr_unit)
    s[c_buf].pragma(xi, 'nnpu.vector', str(params))

    x = s[mul_buf].fuse(*mul_buf.op.axis)
    xo, xi = s[mul_buf].split(x, factor=nvctr_unit)
    s[mul_buf].pragma(xi, 'nnpu.vector', str(params))

    x = s[gtm_buf].fuse(*gtm_buf.op.axis)
    xo, xi = s[gtm_buf].split(x, factor=nvctr_unit)
    s[gtm_buf].pragma(xi, 'nnpu.vector', str(params))

    print(tvm.lower(s, [a, b, c_host, mul_host, gtm_host], simple_mode=True))
    print(nnpu.lower(s, [a, b, c_host, mul_host, gtm_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a, b, c_host, mul_host, gtm_host], 'nnpu', 'llvm', name='nnpu_exp')
    print('------------------- device module 1 IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 micro code: ')
    print(func.imported_modules[0].get_source('uop'))
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape, dtype=a.dtype, low=-64, high=63)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=shape, dtype=b.dtype, low=-64, high=63)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros(shape).astype(c_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros(shape).astype(mul_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros(shape).astype(gtm_host.dtype), ctx)
    # print('------------------- device module 1 llvm IR: ')
    # print(func.imported_modules[0].get_source('ll'))
    # print('------------------- device module 1 asm code: ')
    # print(func.imported_modules[0].get_source('asm'))
    func(a_nd, b_nd, c_nd, mul_nd, gtm_nd)

    gt = a_np + b_np
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    gt = np.multiply(a_np, b_np, dtype=mul_host.dtype)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
    gt = np.maximum(a_np, b_np)
    np.testing.assert_allclose(gtm_nd.asnumpy(), gt)
    print('test passed!!')
def test():
    pass

if (False):
    print('-----')

with ScheduleProcHelper():
    env = nnpu.get_env()
    shape = (16, 64)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    a_buf, _ = nnpu.utils.CopyHtoBuf(a_host, 'a')

    vctr_shape = (64, )
    b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
    b_buf, _ = nnpu.utils.CopyHtoBuf(b_host, 'b')

    dtype_w = env.cfg['dtype_w']
    out_shape = (4, 16)
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        out_shape,
        lambda j, i: tvm.sum(a_buf[i, j * 16 + k].astype(dtype_w) *
                             b_buf[j * 16 + k].astype(dtype_w), axis=k))
    utils.MarkScope(c_buf)
    c_host, _ = utils.CopyBufToH(c_buf, 'c')

    s = nnpu.create_schedule(c_host.op)
    # mark variable scopes
    # tensorize
    s[c_buf].tensorize(s[c_buf].op.axis[1],
                       env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc', reduce=True))

    # build
    print(tvm.lower(s, [a_host, b_host, c_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host, c_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a_host, b_host, c_host], 'nnpu', 'llvm', name='nnpu_exp')
    print('function built: ')
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))
    # print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=-32, high=32)
    # a_np = np.ones(shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=vctr_shape, dtype=b_host.dtype, low=-16, high=16)
    # b_np = np.ones(vctr_shape).astype(b_host.dtype)
    b_nd = tvm.nd.array(b_np, ctx)
    out_nd = tvm.nd.array(np.zeros(out_shape).astype(c_host.dtype), ctx)

    # run
    func(a_nd, b_nd, out_nd)
    print('run finished')
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('out=')
    out_np = out_nd.asnumpy()
    out_np = np.sum(out_np, axis=0)
    print(out_np)
    print('numpy ground truth is: ')
    gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w))
    # gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np)
    print(gt)
    np.testing.assert_allclose(out_np, gt)
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    print('a host ' + str(a_host))
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')
    b_buf = tvm.compute(
        shape, lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])), name='b_buf')
    b = tvm.compute(shape, lambda *i: b_buf(*i), name='b')
    b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host')

    s = tvm.create_schedule(b_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)
    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)
    # print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma)
    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b].pragma(s[b].op.axis[0], env.scratchpad_ls)
    s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG', mode='inc'))

    # build
    print(tvm.lower(s, [a_host, b_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log')
    print('function built: ')
    # print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)  # ???
    print('i want to know:')
    print(ctx.exist)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx)

    # run
    func(a_nd, b_nd)
    print('run finished')
    b_np = b_nd.asnumpy()
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('ground truth =')
    gt = np.log(a_np, dtype=b_host.dtype)
    print(gt)
    np.testing.assert_allclose(b_np, gt)
import nnpu
import tvm
import topi
from nnpu.utils import ScheduleProcHelper
import numpy as np
import argparse

parser = argparse.ArgumentParser(description='test of NNPU Op')
parser.add_argument('--sim', type=str, help='the simulator to use',
                    default='S0', choices=['S0', 'S1', 'SC'])
args = parser.parse_args()

env = nnpu.get_env()
assert env.cfg['multi_core'], 'multi core test need multi_core switch on'
nnpu.set_device(env, type=args.sim)

with ScheduleProcHelper():
    env = nnpu.get_env()
    shape1 = (8, 128)    # (8, 128) reshaped & transposed to (16, 8, 8)
    shape2 = (128, 128)  # (128, 128) tiled to (16, 128, 8)
    gemm_shape = (8, 8, 8)
    factor = gemm_shape[1]
    assert shape1[1] == shape2[1], \
        'gemm do dot product between rows, so the shape[1] of inputs should match'
    assert shape1[0] % gemm_shape[0] == 0, \
        'gemm insn require size of input 1 be x{0}'.format(gemm_shape[0])
    assert shape2[0] % gemm_shape[
def test():
    env = nnpu.get_env()
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder((64, ), dtype_n, 'a')
    b = tvm.placeholder((1, ), dtype_n, 'b')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    c_buf = tvm.compute((64, ), lambda i: a_buf[i] + b_buf[0], 'c_buf')
    sph.MarkScope(c_buf)
    c_host, _ = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    sub_buf = tvm.compute((64, ), lambda i: a_buf[i] - b_buf[0], 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, _ = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    rsub_buf = tvm.compute((64, ), lambda i: b_buf[0] - a_buf[i], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, _ = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    mul_buf = tvm.compute(
        (64, ), lambda i: a_buf[i].astype(dtype_w) * b_buf[0].astype(dtype_w), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    div_buf = tvm.compute((64, ), lambda i: a_buf[i] / b_buf[0], 'div_buf')
    sph.MarkScope(div_buf)
    div_host, _ = nnpu.utils.CopyBufToH(div_buf, 'div', sph)

    rdiv_buf = tvm.compute((64, ), lambda i: b_buf[0] / a_buf[i], 'rdiv_buf')
    sph.MarkScope(rdiv_buf)
    rdiv_host, _ = nnpu.utils.CopyBufToH(rdiv_buf, 'rdiv', sph)

    gtm_buf = tvm.compute((64, ), lambda i: tvm.max(a_buf[i], b_buf[0]), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([
        c_host.op, sub_host.op, mul_host.op, rsub_host.op,
        div_host.op, rdiv_host.op, gtm_host.op
    ])
    sph.Transform(s)

    xo, xi = s[c_buf].split(c_buf.op.axis[0], 16)
    s[c_buf].tensorize(xi, env.intrins.get('VAddS', mode='n'))
    xo, xi = s[sub_buf].split(sub_buf.op.axis[0], 16)
    s[sub_buf].tensorize(xi, env.intrins.get('VSubS', mode='n'))
    xo, xi = s[rsub_buf].split(rsub_buf.op.axis[0], 16)
    s[rsub_buf].tensorize(xi, env.intrins.get('SSubV', mode='n'))
    xo, xi = s[mul_buf].split(mul_buf.op.axis[0], 16)
    s[mul_buf].tensorize(xi, env.intrins.get('VMulS', mode='inc'))
    xo, xi = s[div_buf].split(div_buf.op.axis[0], 16)
    s[div_buf].tensorize(xi, env.intrins.get('VDivS', mode='n'))
    xo, xi = s[rdiv_buf].split(rdiv_buf.op.axis[0], 16)
    s[rdiv_buf].tensorize(xi, env.intrins.get('SDivV', mode='n'))
    xo, xi = s[gtm_buf].split(gtm_buf.op.axis[0], 16)
    s[gtm_buf].tensorize(xi, env.intrins.get('VGTMS', mode='n'))

    print(nnpu.lower(s, [a, b, c_host, sub_host, mul_host, rsub_host,
                         div_host, rdiv_host, gtm_host], simple_mode=True))
    func = nnpu.build(s, [a, b, c_host, sub_host, mul_host, rsub_host,
                          div_host, rdiv_host, gtm_host],
                      'nnpu', 'llvm', name='nnpu_exp')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(64, ), dtype=a.dtype, low=1, high=63)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=(1, ), dtype=b.dtype, low=2, high=31)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros((64, )).astype(c_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros((64, )).astype(sub_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((64, )).astype(rsub_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((64, )).astype(mul_host.dtype), ctx)
    div_nd = tvm.nd.array(np.zeros((64, )).astype(div_host.dtype), ctx)
    rdiv_nd = tvm.nd.array(np.zeros((64, )).astype(rdiv_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((64, )).astype(gtm_host.dtype), ctx)

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    func(a_nd, b_nd, c_nd, sub_nd, mul_nd, rsub_nd, div_nd, rdiv_nd, gtm_nd)

    print('a = ')
    print(a_np)
    print('b = ')
    print(b_np)
    print('a + b =')
    print(c_nd.asnumpy())
    print('numpy ground truth =')
    gt = a_np + b_np
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    print('a - b =')
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - b_np)
    print('b - a =')
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), b_np - a_np)
    print('a * b =')
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np * b_np.astype(dtype_w))
    print('a / b =')
    print(div_nd.asnumpy())
    # numpy always rounds down, while in C the quotient is rounded toward zero.
    # np.testing.assert_allclose(div_nd.asnumpy(), a_np / b_np)
    print('b / a =')
    print(rdiv_nd.asnumpy())
    print('max(a, b)=')
    print(gtm_nd.asnumpy())
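# The division check above is skipped because NumPy's integer division rounds
# toward negative infinity while the device follows C semantics and rounds
# toward zero. A hedged sketch of how a matching ground truth could be built;
# the helper name c_style_div is introduced here for illustration only, and np
# is assumed to be numpy as imported at the top of the file.
def c_style_div(num, den):
    """Integer division that rounds toward zero, matching C behaviour."""
    return np.trunc(num.astype(np.float64) / den).astype(num.dtype)

# e.g. np.testing.assert_allclose(div_nd.asnumpy(), c_style_div(a_np, b_np))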
def test_inception_v3():
    def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0)))
        conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel,
                          strides=stride, padding=(0, 0), use_bias=False,
                          layout='NHWC', kernel_layout='HWOI',
                          name="%s%s_conv2d" % (name, suffix))
        bn = sym.batch_norm(data=conv, name="%s%s_batchnorm" % (name, suffix),
                            epsilon=2e-5, axis=3)
        act = sym.relu(data=bn, name="%s%s_relu" % (name, suffix))
        return act

    def Pooling(data, kernel, stride, pad, pool_type, name):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0)))
        if pool_type == 'max':
            return sym.max_pool2d(data=data, pool_size=kernel, strides=stride,
                                  name=name, layout='NHWC')
        if pool_type == 'avg':
            return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride,
                                  name=name, layout='NHWC')
        raise ValueError("Invalid pooling type: " + pool_type)

    def Inception7A(data, num_1x1, num_3x3_red, num_3x3_1, num_3x3_2,
                    num_5x5_red, num_5x5, pool, proj, name):
        # num_1x1 = 64
        # num_3x3_red = 64
        # num_3x3_1 = 96
        # num_3x3_2 = 96
        # num_5x5_red = 48
        # num_5x5 = 64
        tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
        tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
        tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2),
                         name=('%s_tower' % name), suffix='_conv_1')
        tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix="_conv")
        tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1),
                         name=('%s_tower_1' % name), suffix='_conv_1')
        tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1),
                         name=('%s_tower_1' % name), suffix='_conv_2')
        pooling = Pooling(data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7B(data, num_3x3, num_d3x3_red, num_d3x3_1, num_d3x3_2, pool, name):
        tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
                         name=('%s_conv' % name))
        tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
        tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
                          name=('%s_tower' % name), suffix='_conv_1')
        tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
                          name=('%s_tower' % name), suffix='_conv_2')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0),
                          pool_type="max", name=('max_pool_%s_pool' % name))
        concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7C(data, num_1x1, num_d7_red, num_d7_1, num_d7_2,
                    num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
                    pool, proj, name):
        tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
        tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
        tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
                        name=('%s_tower' % name), suffix='_conv_1')
        tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
                        name=('%s_tower' % name), suffix='_conv_2')
        tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
                        name=('%s_tower_1' % name), suffix='_conv_1')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
                        name=('%s_tower_1' % name), suffix='_conv_2')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
                        name=('%s_tower_1' % name), suffix='_conv_3')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
                        name=('%s_tower_1' % name), suffix='_conv_4')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
                     name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7D(data, num_3x3_red, num_3x3, num_d7_3x3_red, num_d7_1,
                    num_d7_2, num_d7_3x3, pool, name):
        tower_3x3 = Conv(data=data, num_filter=num_3x3_red,
                         name=('%s_tower' % name), suffix='_conv')
        tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0),
                         stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
        tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red,
                            name=('%s_tower_1' % name), suffix='_conv')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
                            name=('%s_tower_1' % name), suffix='_conv_1')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
                            name=('%s_tower_1' % name), suffix='_conv_2')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3),
                            stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool,
                          pad=(0, 0), name=('%s_pool_%s_pool' % (pool, name)))
        concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7E(data, num_1x1, num_d3_red, num_d3_1, num_d3_2,
                    num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
                    pool, proj, name):
        tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
        tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
        tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
                          name=('%s_tower' % name), suffix='_mixed_conv')
        tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
                          name=('%s_tower' % name), suffix='_mixed_conv_1')
        tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red,
                            name=('%s_tower_1' % name), suffix='_conv')
        tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
                            name=('%s_tower_1' % name), suffix='_conv_1')
        tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3),
                              pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
        tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1),
                              pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
                     name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_d3_a, tower_d3_b,
                                   tower_3x3_d3_a, tower_3x3_d3_b, cproj], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def get_symbol(data, num_classes=16, **kwargs):
        conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
        conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
        conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
        pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max",
                       pad=(0, 0), name="pool")
        conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
        conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
        pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max",
                        pad=(0, 0), name="pool1")
        in3a = Inception7A(pool1, 64, 64, 96, 96, 48, 64, "avg", 32, "mixed")
        in3b = Inception7A(in3a, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_1")
        in3c = Inception7A(in3b, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_2")
        in3d = Inception7B(in3c, 384, 64, 96, 96, "max", "mixed_3")
        in4a = Inception7C(in3d, 192, 128, 128, 192, 128, 128, 128, 128, 192,
                           "avg", 192, "mixed_4")
        in4b = Inception7C(in4a, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_5")
        in4c = Inception7C(in4b, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_6")
        in4d = Inception7C(in4c, 192, 192, 192, 192, 192, 192, 192, 192, 192,
                           "avg", 192, "mixed_7")
        in4e = Inception7D(in4d, 192, 320, 192, 192, 192, 192, "max", "mixed_8")
        in5a = Inception7E(in4e, 320, 384, 384, 384, 448, 384, 384, 384,
                           "avg", 192, "mixed_9")
        in5b = Inception7E(in5a, 320, 384, 384, 384, 448, 384, 384, 384,
                           "max", 192, "mixed_10")
        pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg",
                       pad=(0, 0), name="global_pool")
        flatten = sym.flatten(data=pool, name="flatten")
        fc1 = sym.dense(data=flatten, units=num_classes, name="fc1")
        softmax = sym.softmax(data=fc1, name="softmax")
        return softmax

    input_shape = (1, 299, 299, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(data=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=(1, 299, 299, 16), low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(ftimer().mean * 10)
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((16, ), env.cfg['dtype_w'], 'a')
    sph = ScheduleProcHelper()
    Imm = tvm.const(5, env.cfg['dtype_w'])
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    # c_buf = tvm.compute((16, ), lambda i: tvm.select(a_buf[i] > Imm, a_buf[i], Imm), 'c_buf')
    c_buf = tvm.compute((16, ), lambda i: Imm + a_buf[i], 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    sub_buf = tvm.compute((16, ), lambda i: a_buf[i] - Imm, 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, sub_dram = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute((16, ), lambda i: a_buf[i] * Imm, 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    div_buf = tvm.compute((16, ), lambda i: a_buf[i] / Imm, 'rdiv_buf')
    sph.MarkScope(div_buf)
    div_host, div_dram = nnpu.utils.CopyBufToH(div_buf, 'rdiv', sph)

    gtm_buf = tvm.compute((16, ), lambda i: tvm.max(a_buf[i], Imm), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    rsub_buf = tvm.compute((16, ), lambda i: Imm - a_buf[i], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    s = tvm.create_schedule([c_host.op, sub_host.op, mul_host.op,
                             div_host.op, gtm_host.op, rsub_host.op])
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[0],
                       env.intrins.get('VAddI', imm_value=Imm.value, mode='w'))
    s[sub_buf].tensorize(s[sub_buf].op.axis[0],
                         env.intrins.get('VSubI', imm_value=Imm.value, mode='w'))
    s[mul_buf].tensorize(s[mul_buf].op.axis[0],
                         env.intrins.get('VMulI', imm_value=Imm.value, mode='w'))
    s[div_buf].tensorize(s[div_buf].op.axis[0],
                         env.intrins.get('VDivI', imm_value=Imm.value, mode='w'))
    s[gtm_buf].tensorize(s[gtm_buf].op.axis[0],
                         env.intrins.get('VGTMI', imm_value=Imm.value, mode='w'))
    s[rsub_buf].tensorize(s[rsub_buf].op.axis[0],
                          env.intrins.get('ISubV', imm_value=Imm.value, mode='w'))

    print(nnpu.lower(s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host],
                     simple_mode=True))
    func = nnpu.build(s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host],
                      'nnpu', 'llvm', name='nnpu_vmuli')
    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=3, high=122)
    # a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    div_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    func(a_nd, c_nd, sub_nd, mul_nd, div_nd, gtm_nd, rsub_nd)

    print('a = ')
    print(a_nd.asnumpy())
    print('a + {0} = '.format(Imm.value))
    print(c_nd.asnumpy())
    print('numpy ground truth =')
    gt = a_np + Imm.value
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    print('a - {0} = '.format(Imm.value))
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - Imm.value)
    print('a * {0} = '.format(Imm.value))
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np * Imm.value)
    print('a / {0} = '.format(Imm.value))
    print(div_nd.asnumpy())
    np.testing.assert_allclose(div_nd.asnumpy(), a_np / Imm.value)
    print('a > {0} ? a : {0} = '.format(Imm.value))
    print(gtm_nd.asnumpy())
    # np.testing.assert_allclose(gtm_nd.asnumpy(), np.maximum(a_np, Imm.value))
    print('{0} - a = '.format(Imm.value))
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value - a_np)
    print('test passed')
def test():
    env = nnpu.get_env()
    nnpu.set_dump(False)

    #==================================#
    # ------ first define shapes ------
    #==================================#

    # input data layout: HWC
    in_shape = (32, 32, 128)
    # pooling window size, height == width.
    cell_shape = 4
    # in this demo we don't do padding, so the input height and width must be
    # divisible by the pooling window size.
    assert in_shape[0] % cell_shape == 0, 'error'
    assert in_shape[1] % cell_shape == 0, 'error'
    nvctr_unit = env.cfg['vector_unit']['size']
    assert in_shape[2] % nvctr_unit == 0, 'channel not divisible to vector unit size'
    out_shape = (in_shape[0] // cell_shape, in_shape[1] // cell_shape, in_shape[2])
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    sph = ScheduleProcHelper()
    str_op = 'VGTMMerge'

    #==================================================================#
    # ------ after all shapes defined, begin compute describing. ------
    #==================================================================#

    a = tvm.placeholder(in_shape, dtype_w, 'a')
    # first copy to scratchpad.
    a_buf, _1 = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    # stage 1, find the maximum pixel in every pooling window.
    # the extents of the two reduction axes are the sizes of the pooling window.
    k1 = tvm.reduce_axis((0, cell_shape), 'k1')
    k2 = tvm.reduce_axis((0, cell_shape), 'k2')
    pooling_buf = tvm.compute(
        out_shape,
        lambda i, j, k: tvm.max(a_buf[i * cell_shape + k1, j * cell_shape + k2, k],
                                axis=[k1, k2]),
        'pooling_buf')
    sph.MarkScope(pooling_buf, 'buffer1')
    # copy back to host.
    step2_host, step2_dram = nnpu.utils.CopyBufToH(pooling_buf, 'pooling', sph)
    # ------ this ends the computation description. ------

    #==================================#
    # ------ begin scheduling ------
    #==================================#

    s = tvm.create_schedule(step2_host.op)
    sph.Transform(s)

    # tensorize
    i, j, k = pooling_buf.op.axis
    k1, k2 = pooling_buf.op.reduce_axis
    # split the reduce_axis by factor 1, to produce a dummy reduce axis.
    # this is a trick to enable tensorize, due to limitation of tvm's tensorize pattern matcher.
    ko, ki = s[pooling_buf].split(k2, factor=1)
    xo, xi = s[pooling_buf].split(k, factor=16)
    # reorder axes.
    # put xo right before ki to eliminate the memory dependency between two consecutive
    # VGTMV instructions.
    s[pooling_buf].reorder(i, j, k1, ko, xo, ki, xi)
    s[pooling_buf].tensorize(ki, env.intrins.get(str_op, scope_out='buffer1', mode='w'))
    # unroll
    # s[pooling_buf].unroll(ko)
    # s[pooling_buf].unroll(xo)

    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    print(nnpu.lower(s, [a, step2_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a, step2_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=in_shape, dtype=a.dtype, low=-128, high=127)
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros(out_shape, dtype=step2_host.dtype), ctx)
    func(a_nd, c_nd)

    # print("pooling-max")
    # print(c_nd.asnumpy())
    # print("nppooling-max")
    gt = max_pooling(in_shape, out_shape, cell_shape, a_np, a.dtype)
    # print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    print('test passed')
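# `max_pooling`, used as the ground truth above, is not defined in this
# section. A minimal NumPy sketch matching the call
# max_pooling(in_shape, out_shape, cell_shape, data, dtype) for HWC layout with
# stride equal to the window size; this is an assumption about the helper's
# behaviour, not the project's own implementation (np is assumed to be numpy).
def max_pooling(in_shape, out_shape, cell, data, dtype):
    h, w, c = out_shape
    out = np.zeros(out_shape, dtype=dtype)
    for i in range(h):
        for j in range(w):
            # maximum over one non-overlapping cell x cell window, per channel
            window = data[i * cell:(i + 1) * cell, j * cell:(j + 1) * cell, :]
            out[i, j, :] = np.max(window, axis=(0, 1))
    return out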
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 2, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 2), 'k')
    add_buf = tvm.compute((2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    k1 = tvm.reduce_axis((0, 2), 'k1')
    mul_buf = tvm.compute((2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([add_host.op, mul_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis))
    s[add_buf].tensorize(ki, env.intrins.get('MAddMerge', shape=shape, mode='w'))

    ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis))
    s[mul_buf].tensorize(ki1, env.intrins.get('MMulMerge', shape=shape, mode='w'))

    print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True))
    func = nnpu.build(s, [a, add_host, mul_host], 'nnpu', 'llvm', name='nnpu_func')
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)
    add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx)
    func(a_nd, add_nd, mul_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    print('reduce mul row = ')
    print(mul_nd.asnumpy())
    gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype)
    print(gt)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
def test():
    env = nnpu.get_env()
    shape = (4, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')
    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 4), 'k')
    add_buf = tvm.compute((16, ), lambda i: tvm.sum(a_buf[k, i], axis=k), 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    # k1 = tvm.reduce_axis((0, 4), 'k1')
    # mul_buf = tvm.compute((16, ), lambda i: tvm.sum(a_buf[k1, i], axis=k1), 'mul_buf')
    # sph.MarkScope(mul_buf)
    # mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    k2 = tvm.reduce_axis((0, 4), 'k2')
    gtm_buf = tvm.compute((16, ), lambda i: tvm.max(a_buf[k2, i], axis=k2), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([add_host.op, gtm_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, s[add_buf].op.axis[0])
    s[add_buf].tensorize(ki, env.intrins.get('VAddMerge', mode='w'))

    # ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    # s[mul_buf].reorder(ko1, ki1, s[mul_buf].op.axis[0])
    # s[mul_buf].tensorize(ki1, env.intrins.get('VMulMerge', mode='w'))

    ko2, ki2 = s[gtm_buf].split(gtm_buf.op.reduce_axis[0], factor=1)
    s[gtm_buf].reorder(ko2, ki2, s[gtm_buf].op.axis[0])
    s[gtm_buf].tensorize(ki2, env.intrins.get('VGTMMerge', mode='w'))

    print(nnpu.lower(s, [a, add_host, gtm_host], simple_mode=True))
    func = nnpu.build(s, [a, add_host, gtm_host], 'nnpu', 'llvm', name='nnpu_func')
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)
    add_nd = tvm.nd.array(np.zeros((16, )).astype(add_host.dtype), ctx)
    # mul_nd = tvm.nd.array(np.zeros((16,)).astype(mul_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((16, )).astype(gtm_host.dtype), ctx)

    print('------------------- device module 1 IR code: ')
    print(func.imported_modules[0].get_source('ir'))
    func(a_nd, add_nd, gtm_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    # print('reduce mul row = ')
    # print(mul_nd.asnumpy())
    # gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype)
    # print(gt)
    # np.testing.assert_allclose(mul_nd.asnumpy(), gt)

    print('reduce max row = ')
    print(gtm_nd.asnumpy())
    gt = np.max(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(gtm_nd.asnumpy(), gt)
def test():
    # Dense layer with bias on the NNPU: out = b (1x16) . a^T (16x16) + bias,
    # tensorized with the GEMM and VAddV intrinsics. Scopes and DMA/scratchpad
    # pragmas are marked by hand here instead of via ScheduleProcHelper.
    env = nnpu.get_env()
    shape = (16, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')

    vctr_shape = (1, 16)
    b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
    b = tvm.compute(vctr_shape, lambda *i: b_host(*i), name='b')
    b_buf = tvm.compute(vctr_shape, lambda *i: b(*i), name='b_buf')

    dtype_w = env.cfg['dtype_w']
    mul_shape = (1, 16)
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        mul_shape,
        lambda i, j: tvm.sum(b_buf[i, k].astype(dtype_w) * a_buf[j, k].astype(dtype_w), axis=k))

    out_shape = (16, )
    bias_host = tvm.placeholder(out_shape, env.cfg['dtype_w'], 'bias_host')
    bias = tvm.compute(out_shape, lambda *i: bias_host(*i), 'bias')
    bias_buf = tvm.compute(out_shape, lambda *i: bias(*i), 'bias_buf')

    #c = tvm.compute(out_shape, lambda *i: c_buf(*i), name='c')
    #c_host = tvm.compute(out_shape, lambda *i: c(*i), name='c_host')

    out_buf = tvm.compute(out_shape, lambda i: c_buf[0, i] + bias_buf[i], 'out_buf')
    out = tvm.compute(out_shape, lambda *i: out_buf(*i), 'out')
    out_host = tvm.compute(out_shape, lambda *i: out(*i), 'out_host')

    s = tvm.create_schedule(out_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)
    s[bias].set_scope(env.dram_scope)
    s[out].set_scope(env.dram_scope)
    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)
    s[c_buf].set_scope(env.uni_scratchpad_scope)
    s[bias_buf].set_scope(env.uni_scratchpad_scope)
    s[out_buf].set_scope(env.uni_scratchpad_scope)

    #print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b].pragma(s[b].op.axis[0], env.dma_copy_pragma)
    s[bias].pragma(s[bias].op.axis[0], env.dma_copy_pragma)
    s[out_host].pragma(s[out_host].op.axis[0], env.dma_copy_pragma)
    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b_buf].pragma(s[b_buf].op.axis[0], env.scratchpad_ls)
    s[bias_buf].pragma(s[bias_buf].op.axis[0], env.scratchpad_ls)
    s[out].pragma(s[out].op.axis[0], env.scratchpad_ls)

    #s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    #s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VEXP', mode='inc'))
    s[c_buf].tensorize(s[c_buf].op.axis[0], env.intrins.get('GEMM', shape=(1, 16, 16), mode='inc'))

    #outer, inner = out_buf.op.axis
    #s[out_buf].reorder(inner, outer)
    #print(outer)
    #print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    s[out_buf].tensorize(s[out_buf].op.axis[0], env.intrins.get('VAddV', mode='w'))

    # build
    print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host, bias_host, out_host], 'nnpu', 'llvm', name='nnpu_exp')
    print('function built: ')
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=vctr_shape, dtype=b_host.dtype, low=0, high=64)
    #b_np = np.random.random(size=vctr_shape).astype(b_host.dtype)
    b_nd = tvm.nd.array(b_np, ctx)
    bias_np = np.random.randint(size=out_shape, dtype=bias_host.dtype, low=0, high=64)
    #bias_np = np.random.random(size=out_shape).astype(bias_host.dtype)
    bias_nd = tvm.nd.array(bias_np, ctx)
    out_nd = tvm.nd.array(np.zeros(out_shape).astype(out_host.dtype), ctx)

    # run
    func(a_nd, b_nd, bias_nd, out_nd)
    print('run finished')
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('bias=')
    print(bias_np)
    print('out=')
    print(out_nd.asnumpy())
    print('numpy ground truth is: ')
    gt = np.dot(b_np.astype(dtype_w),
                a_np.astype(dtype_w).transpose((1, 0))).reshape((16, )) + bias_np
    print(gt)
    np.testing.assert_allclose(out_nd.asnumpy(), gt)

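# The same dense-with-bias pipeline, sketched with the ScheduleProcHelper utilities
# used by the other tests in this file instead of marking scopes and pragmas by hand.
# This assumes CopyHtoBuf/CopyBufToH perform the equivalent host<->DRAM<->scratchpad
# copies shown above; the function name is hypothetical and the sketch is illustrative,
# not a verified drop-in replacement.
def dense_with_helper_sketch(env):
    sph = ScheduleProcHelper()
    a = tvm.placeholder((16, 16), env.cfg['dtype_n'], 'a')
    b = tvm.placeholder((1, 16), env.cfg['dtype_n'], 'b')
    bias = tvm.placeholder((16, ), env.cfg['dtype_w'], 'bias')
    a_buf, _ = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, _ = nnpu.utils.CopyHtoBuf(b, 'b', sph)
    bias_buf, _ = nnpu.utils.CopyHtoBuf(bias, 'bias', sph)

    dtype_w = env.cfg['dtype_w']
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        (1, 16),
        lambda i, j: tvm.sum(b_buf[i, k].astype(dtype_w) * a_buf[j, k].astype(dtype_w), axis=k),
        'c_buf')
    sph.MarkScope(c_buf)
    out_buf = tvm.compute((16, ), lambda i: c_buf[0, i] + bias_buf[i], 'out_buf')
    sph.MarkScope(out_buf)
    out_host, _ = nnpu.utils.CopyBufToH(out_buf, 'out', sph)

    s = tvm.create_schedule(out_host.op)
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[0], env.intrins.get('GEMM', shape=(1, 16, 16), mode='inc'))
    s[out_buf].tensorize(s[out_buf].op.axis[0], env.intrins.get('VAddV', mode='w'))
    # nnpu.lower(s, [a, b, bias, out_host], simple_mode=True) can then be compared
    # against the hand-scheduled IR printed by the test above.
    return s, [a, b, bias, out_host]
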
def test_ib():
    print('aaaa')
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, name='a')
    w = shape[0]
    e = 16

    def build_nms_ir(ten_in, ten_out):
        # Hand-written IR: call the VAddI intrinsic to add the immediate value 10
        # to each vector of the input buffer.
        ib = tvm.ir_builder.create()
        imm_value = 10
        ib.scope_attr(env.nnpu_axis, "coproc_scope", 0)
        p_in = ib.buffer_ptr(ten_in[0])
        p_out = ib.buffer_ptr(ten_out[0])
        #with ib.for_range(0, w, name="k") as k:
        with ib.for_range(0, w // e, name="i") as i:
            ib.emit(
                make_intrin_call(
                    "void", 'VAddI',
                    ten_out[0].access_ptr("w", 'uint32') + i * dtype_bytes(dtype_w),
                    ten_in[0].access_ptr("r", 'uint32') + i * dtype_bytes(dtype_w),
                    tvm.const(imm_value, 'float64'),
                    env.cfg['vector_unit']['size'], 3))
        stmt = ib.get()
        return stmt

    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    sph.MarkScope(a_buf)
    out = tvm.extern(a_buf.shape, [a_buf],
                     build_nms_ir,
                     in_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     out_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     dtype=dtype_w,
                     name="test_ir")
    sph.MarkScope(out)
    out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph)
    s = tvm.create_schedule([out_host.op])
    sph.Transform(s)

    print(tvm.lower(s, [a, out_host], simple_mode=True))
    print(nnpu.lower(s, [a, out_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=0, high=127)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(16, ).astype(out_host.dtype), ctx)
    func(a_nd, b_nd)
    print('a = ')
    print(a_np)
    print('VAddI output = ')
    print(b_nd.asnumpy())
    return

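# test_ib above only prints the device output. Assuming VAddI adds its float
# immediate (10) to every element of the input vector, the result could also be
# checked numerically. Hypothetical helper, not part of the original test:
def check_vaddi_result(a_np, out_np, imm_value=10):
    np.testing.assert_allclose(out_np, a_np + imm_value)
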
def test_vgg():
    def get_feature(internel_layer, layers, filters, batch_norm=False):
        """
        Get VGG feature body as stacks of convolutions.
        layers : [1, 1, 2, 2, 2]
        filters : [64, 128, 256, 512, 512]
        """
        for i, num in enumerate(layers):
            """
            i = 0, num = 1
            i = 1, num = 1
            i = 2, num = 2
            i = 3, num = 2
            i = 4, num = 2
            """
            for j in range(num):
                internel_layer = sym.pad(data=internel_layer,
                                         pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
                internel_layer = sym.conv2d(data=internel_layer,
                                            kernel_size=(3, 3),
                                            channels=filters[i],
                                            layout='NHWC',
                                            kernel_layout='HWOI',
                                            name="conv%s_%s" % (i + 1, j + 1))
                if batch_norm:
                    internel_layer = sym.batch_norm(data=internel_layer,
                                                    axis=3,
                                                    name="bn%s_%s" % (i + 1, j + 1))
                internel_layer = sym.relu(data=internel_layer,
                                          name="relu%s_%s" % (i + 1, j + 1))
            internel_layer = sym.max_pool2d(data=internel_layer,
                                            pool_size=(2, 2),
                                            strides=(2, 2),
                                            layout="NHWC",
                                            name="pool%s" % (i + 1))
        return internel_layer

    def get_classifier(input_data, num_classes):
        """
        Get VGG classifier layers as fully connected layers.
        """
        flatten = sym.flatten(data=input_data, name="flatten")
        fc1 = sym.dense(data=flatten, units=32, name="fc1")
        relu1 = sym.relu(data=fc1, name="relu1")
        drop1 = sym.dropout(data=relu1, rate=0.5, name="drop1")
        fc2 = sym.dense(data=drop1, units=32, name="fc2")
        relu2 = sym.relu(data=fc2, name="relu2")
        drop2 = sym.dropout(data=relu2, rate=0.5, name="drop2")
        fc3 = sym.dense(data=drop2, units=num_classes, name="fc3")
        return fc3

    def get_symbol(datas, num_classes, num_layers=11, batch_norm=False):
        """
        Parameters
        ----------
        num_classes : int, default 16
            Number of classification classes.
        num_layers : int
            Number of layers for the variant of VGG. Options are 11, 13, 16, 19.
        batch_norm : bool, default False
            Use batch normalization.
        """
        vgg_spec = {
            11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])
        }
        if num_layers not in vgg_spec:
            raise ValueError(
                "Invalid num_layers {}. Choices are 11, 13, 16, 19.".format(num_layers))
        layers, filters = vgg_spec[num_layers]
        feature = get_feature(datas, layers, filters, batch_norm)
        classifier = get_classifier(feature, num_classes)
        symbol = sym.softmax(data=classifier, name="softmax")
        return symbol

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(compute_graph,
                                                            target,
                                                            shape={"data": input_shape},
                                                            dtype="float32",
                                                            target_host=target_host)
        else:
            nnpu.set_device(nnpu.get_env(), type='SC')
            with ScheduleProcHelper():
                with nnpu.build_config():
                    deploy_graph, lib, params = nnvm.compiler.build(compute_graph,
                                                                    target,
                                                                    shape={"data": input_shape},
                                                                    dtype="float32",
                                                                    target_host=target_host)
    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    # module.run()
    # run (and time) the module before fetching its output
    prof_res = ftimer()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(prof_res.mean * 10)

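# test_vgg above only prints the network output. Since the graph ends in
# sym.softmax over 16 classes, each row of the (1, 16) output should sum to 1;
# a hypothetical sanity check (name assumed, not part of the original test):
def check_softmax_output(out_np):
    np.testing.assert_allclose(out_np.sum(axis=1), np.ones(out_np.shape[0]), rtol=1e-3)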