def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.check_and_update_host_consist(target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # check invalidity of template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        opts = build_option or {}
        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
            opts["tir.add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]

        # if target is vta, we need to use vta build
        if hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta":
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            with tvm.ir.transform.PassContext(config=opts):
                func = build(s, args, target_host=task.target_host, runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def _run(env, remote):
    # declare
    n = 21
    m = 20
    pad_before = [0, 1, 0, 0]
    pad_after = [1, 3, 0, 0]
    x = tvm.placeholder((n, m, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype)
    x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
    # insert no-op that won't be optimized away
    y_buf = tvm.compute(
        (n + pad_before[0] + pad_after[0], m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
        lambda *i: x_buf(*i) >> 0, "y_buf")
    y = tvm.compute(
        (n + pad_before[0] + pad_after[0], m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
        lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
    # schedule
    s = tvm.create_schedule(y.op)
    s[x_buf].set_scope(env.acc_scope)
    s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
    s[y_buf].set_scope(env.acc_scope)
    s[y_buf].pragma(y_buf.op.axis[0], env.alu)
    s[y].pragma(y.op.axis[0], env.dma_copy)

    # build
    with vta.build_config():
        mod = vta.build(s, [x, y], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    mod.save(temp.relpath("padded_load.o"))
    remote.upload(temp.relpath("padded_load.o"))
    f = remote.load_module("padded_load.o")
    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(1, 2, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
    y_np = np.zeros((n + pad_before[0] + pad_after[0],
                     m + pad_before[1] + pad_after[1],
                     env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
    y_np[pad_before[0]:pad_before[0] + n, pad_before[1]:pad_before[1] + m, :] = x_np
    x_nd = tvm.nd.array(x_np, ctx)
    y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Padded load execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    with target:
        s, args = task.instantiate(config)

        # check invalidity of template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        opts = build_option or {}
        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
            opts["add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]
        if cuda_arch:
            set_cuda_target_arch(cuda_arch)

        # if target is vta, we need to use vta build
        if hasattr(measure_input.target, 'device_name') and \
                measure_input.target.device_name == 'vta':
            # pylint: disable=import-outside-toplevel
            import vta
            func = vta.build(s, args, target_host=task.target_host)
        else:
            with build_config(**opts):
                func = build(s, args, target_host=task.target_host)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def verify(s):
    mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
    temp = util.tempdir()
    mod.save(temp.relpath("gemm.o"))
    remote.upload(temp.relpath("gemm.o"))
    f = remote.load_module("gemm.o")
    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
    w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
    y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
    x_nd = tvm.nd.array(x_np, ctx)
    w_nd = tvm.nd.array(w_np, ctx)
    y_nd = tvm.nd.array(y_np, ctx)
    y_np = y_np.astype(env.acc_dtype)
    for b in range(o):
        for i in range(m):
            for j in range(n):
                y_np[b, i, :] += np.dot(x_np[b, j, :].astype(env.acc_dtype),
                                        w_np[i, j].T.astype(env.acc_dtype))
    y_np = np.right_shift(y_np, 8)
    y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype)
    if env.TARGET == "sim":
        simulator.clear_stats()
        f(x_nd, w_nd, y_nd)
        print(simulator.stats())
    else:
        f(x_nd, w_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())
def _run(env, remote):
    m = 8
    n = 10
    # compute
    a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                        lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    max_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: tvm.max(a_buf(*i), 0), "res_buf")  # relu
    min_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: tvm.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
                          "max_buf")  # relu
    res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                      lambda *i: min_buf(*i).astype(env.inp_dtype), "min_buf")  # SRAM->DRAM
    # schedule
    s = tvm.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[max_buf].set_scope(env.acc_scope)  # SRAM
    s[min_buf].set_scope(env.acc_scope)  # SRAM
    s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
    s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    # build
    with vta.build_config():
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    ctx = remote.ext_dev(0)
    a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
    if env.TARGET == "tsim":
        simulator.tsim_init("libvta_hw")
    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.asnumpy())
    if env.TARGET == "tsim":
        print("Relu test took {} clock cycles".format(simulator.tsim_cycles()))
def _run(env, remote):
    m = 2
    n = 8
    imm_shift = np.random.randint(0, 8)
    imm_scale = np.random.randint(1, 5)
    # compute
    a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                        lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    res_shift = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                            lambda *i: a_buf(*i) + imm_shift, "res_shift")  # compute
    res_scale = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                            lambda *i: res_shift(*i) >> imm_scale, "res_scale")  # compute
    res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                      lambda *i: res_scale(*i).astype(env.inp_dtype), "res")  # SRAM->DRAM
    # schedule
    s = tvm.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[res_shift].set_scope(env.acc_scope)  # SRAM
    s[res_scale].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
    s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    # build
    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    ctx = remote.ext_dev(0)
    a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.right_shift((a_np + imm_shift), imm_scale)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
    if env.TARGET == "tsim":
        simulator.tsim_init("libvta_hw")
    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.asnumpy())
    if env.TARGET == "tsim":
        print("Shift/scale test took {} clock cycles".format(simulator.tsim_cycles()))
def _run(env, remote):
    m = 8
    n = 10
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
    )  # DRAM->SRAM
    max_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf"
    )  # relu
    min_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
        "max_buf",
    )  # relu
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: min_buf(*i).astype(env.inp_dtype),
        "min_buf",
    )  # SRAM->DRAM
    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[max_buf].set_scope(env.acc_scope)  # SRAM
    s[min_buf].set_scope(env.acc_scope)  # SRAM
    s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
    s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM

    # build
    with vta.build_config():
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Relu execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _run(env, remote):
    m = 2
    n = 8
    imm_shift = np.random.randint(0, 8)
    imm_scale = np.random.randint(1, 5)
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
    )  # DRAM->SRAM
    res_shift = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i) + imm_shift, "res_shift"
    )  # compute
    res_scale = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i) >> imm_scale, "res_scale"
    )  # compute
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res"
    )  # SRAM->DRAM
    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[res_shift].set_scope(env.acc_scope)  # SRAM
    s[res_scale].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
    s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    # build
    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.right_shift((a_np + imm_shift), imm_scale)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Shift and scale execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _run(env, remote):
    n = 6
    x = tvm.placeholder((n, n, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype)
    x_buf = tvm.compute((n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x(*i), "x_buf")
    # insert no-op that won't be optimized away
    y_buf = tvm.compute((n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x_buf(*i) >> 0, "y_buf")
    y = tvm.compute((n, n, env.BATCH, env.BLOCK_OUT),
                    lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
    # schedule
    s = tvm.create_schedule(y.op)
    s[x_buf].set_scope(env.acc_scope)
    s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
    s[y_buf].set_scope(env.acc_scope)
    s[y_buf].pragma(y_buf.op.axis[0], env.alu)
    s[y].pragma(y.op.axis[0], env.dma_copy)

    # verification
    with vta.build_config():
        m = vta.build(s, [x, y], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    m.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
    y_np = x_np.astype(y.dtype)
    x_nd = tvm.nd.array(x_np, ctx)
    y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Save load execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def verify(s, check_correctness): mod = vta.build(s, [data, kernel, bias, res], "ext_dev", env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.ext_dev(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() bias_orig = (np.random.uniform(size=(wl.out_filter, )) * 4).astype("int32") bias_orig = np.abs(bias_orig) data_packed = data_orig.reshape(batch_size // env.BATCH, env.BATCH, wl.in_filter // env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose( (0, 2, 4, 5, 1, 3)) kernel_packed = kernel_orig.reshape(wl.out_filter // env.BLOCK_OUT, env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN, env.BLOCK_IN, wl.hkernel, wl.wkernel).transpose( (0, 2, 4, 5, 1, 3)) bias_packed = bias_orig.reshape(1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, ctx) kernel_arr = tvm.nd.array(kernel_packed, ctx) bias_arr = tvm.nd.array(bias_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) res_unpack = res_arr.asnumpy().transpose( (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.canon_target_and_host(target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # check invalidity of template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        # if target is vta, we need to use vta build
        if hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta":
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            current_pass_context: tvm.ir.transform.PassContext = (
                tvm.ir.transform.PassContext.current()
            )
            current_config = dict(current_pass_context.config)
            if build_option is not None:
                current_config.update(build_option)

            if "tir.add_lower_pass" in current_config:
                current_add_lower_pass = list(current_config["tir.add_lower_pass"])
            else:
                current_add_lower_pass = []
            if check_gpu:
                current_add_lower_pass.append((2, gpu_verify_pass(**check_gpu)))
            current_config["tir.add_lower_pass"] = current_add_lower_pass

            with tvm.ir.transform.PassContext(
                opt_level=current_pass_context.opt_level,
                required_pass=current_pass_context.required_pass,
                disabled_pass=current_pass_context.disabled_pass,
                instruments=current_pass_context.instruments,
                config=current_config,
            ):
                func = build(s, args, target_host=task.target_host, runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def verify(s, name=None):
    # Build with the CSE pass disabled as otherwise it would complicate the test
    with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
        mod = vta.build(s, [x, w, y], tvm.target.Target("ext_dev", host=env.target_host))
    temp = utils.tempdir()
    mod.save(temp.relpath("gemm.o"))
    remote.upload(temp.relpath("gemm.o"))
    f = remote.load_module("gemm.o")
    # verify
    dev = remote.ext_dev(0)
    x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
    w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
    y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
    x_nd = tvm.nd.array(x_np, dev)
    w_nd = tvm.nd.array(w_np, dev)
    y_nd = tvm.nd.array(y_np, dev)
    y_np = y_np.astype(env.acc_dtype)
    for b in range(o):
        for i in range(m):
            for j in range(n):
                y_np[b, i, :] += np.dot(
                    x_np[b, j, :].astype(env.acc_dtype),
                    w_np[i, j].T.astype(env.acc_dtype))
    y_np = np.right_shift(y_np, 8)
    y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, w_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("GEMM schedule:{} execution statistics:".format(name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def verify(s, check_correctness): mod = vta.build(s, [data, kernel, bias, res], "ext_dev", env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.ext_dev(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") bias_orig = np.abs(bias_orig) data_packed = data_orig.reshape( batch_size//env.BATCH, env.BATCH, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) kernel_packed = kernel_orig.reshape( wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) bias_packed = bias_orig.reshape( 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, ctx) kernel_arr = tvm.nd.array(kernel_packed, ctx) bias_arr = tvm.nd.array(bias_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) res_unpack = res_arr.asnumpy().transpose( (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def verify(s, name=None):
    mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
    temp = util.tempdir()
    mod.save(temp.relpath("gemm.o"))
    remote.upload(temp.relpath("gemm.o"))
    f = remote.load_module("gemm.o")
    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
    w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
    y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
    x_nd = tvm.nd.array(x_np, ctx)
    w_nd = tvm.nd.array(w_np, ctx)
    y_nd = tvm.nd.array(y_np, ctx)
    y_np = y_np.astype(env.acc_dtype)
    for b in range(o):
        for i in range(m):
            for j in range(n):
                y_np[b, i, :] += np.dot(
                    x_np[b, j, :].astype(env.acc_dtype),
                    w_np[i, j].T.astype(env.acc_dtype))
    y_np = np.right_shift(y_np, 8)
    y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, w_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("GEMM schedule:{} execution statistics:".format(name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))

# Send the executable over RPC
remote.upload(temp.relpath("gemm.o"))

# Load the compiled module
f = remote.load_module("gemm.o")

######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from
# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# Compile the TVM module
my_gemm = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="my_gemm")
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and weight arrays randomly in the int range of (-128, 128]
data_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(data.dtype)
weight_np = np.random.randint(-128, 128, size=(out_channels, in_channels)).astype(weight.dtype)
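# The 2D data and weight arrays then need to be packed into the blocked 4D layout
# that the VTA core expects. The short sketch below is illustrative only: it mirrors
# the reshape/transpose packing pattern used by the dense benchmark later in this
# section, and the exact variable names are assumptions rather than this tutorial's.
data_packed_sketch = data_np.reshape(
    batch_size // env.BATCH, env.BATCH, in_channels // env.BLOCK_IN, env.BLOCK_IN
).transpose((0, 2, 1, 3))  # (batch, channel) blocks become the innermost dims
weight_packed_sketch = weight_np.reshape(
    out_channels // env.BLOCK_OUT, env.BLOCK_OUT, in_channels // env.BLOCK_IN, env.BLOCK_IN
).transpose((0, 2, 1, 3))  # same blocking applied to the weight matrix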
def run_group_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.nn.group_conv2d_nchw
        fschedule = topi.generic.schedule_group_conv2d_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.group_conv2d_packed
        fschedule = vta.top.schedule_group_conv2d_packed

    # Derive shapes depending upon packing
    CI_G = wl.in_filter // wl.groups
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT,
                      1, 1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, (wl.hstride, wl.wstride),
                       padding, (1, 1), wl.groups, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \
        wl.out_filter * wl.in_filter // wl.groups

    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), \
            1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad, wl.groups).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH, env.BATCH,
            wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
            wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT, env.BLOCK_OUT,
            CI_G // env.BLOCK_IN, env.BLOCK_IN,
            wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT,
            1, 1, env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res], target=target,
                        target_host=env.target_host, name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res], target=target,
                        target_host=env.target_host, name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(
    -128, 128, size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
def run_pooling(env, remote, wl, target, check_correctness=True, print_ir=False, samples=10):

    # Workload assertions
    assert wl.hpad == wl.wpad
    pool_type = 'max'

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        # pooling_fcompute = topi.arm_cpu.pooling_nchw_spatial_pack
        pooling_fcompute = topi.nn.pool
        # pooling_fschedule = topi.arm_cpu.schedule_pooling_nchw_spatial_pack
        pooling_fschedule = topi.generic.schedule_pool
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        pooling_fcompute = vta.top.pooling_packed
        pooling_fschedule = vta.top.schedule_pooling_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    # output shape
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT,
                      1, 1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = topi.nn.pool(data, kernel=[3, 3], stride=[2, 2], padding=padding,
                           pool_type=pool_type, layout="NCHW")
        # res = topi.right_shift(res, 8)
        # res = topi.add(res, bias)
        # res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        # res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = pooling_fschedule([res], layout)
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # get output shape
    _, oc, oh, ow = get_const_tuple(res.shape)

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.pooling.verify_nchw")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), \
            1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        pad_shape = (wl.batch, wl.in_filter, wl.height + wl.hpad * 2, wl.width + wl.wpad * 2)
        pad_np = np.zeros(shape=pad_shape).astype(data.dtype)
        no_zero = (range(wl.batch), range(wl.in_filter),
                   (range(wl.hpad, wl.height + wl.hpad)),
                   (range(wl.wpad, wl.width + wl.wpad)))
        pad_np[np.ix_(*no_zero)] = a_np
        b_shape = (wl.batch, oc, oh, ow)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        kw, kh = 3, 3
        sw, sh = 2, 2
        for i in range(oh):
            for j in range(ow):
                b_np[:, :, i, j] = np.max(
                    pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw], axis=(2, 3))
        b_np = np.maximum(b_np, 0.0)
        return a_np, pad_np, b_np

    # Data in original format
    data_np, _, res_ref = get_ref_data()

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, res], target=target,
                        target_host=env.target_host, name="pooling")
    else:
        mod = tvm.build(s, [data, res], target=target,
                        target_host=env.target_host, name="pooling")
    temp = util.tempdir()
    mod.save(temp.relpath("pooling.o"))
    remote.upload(temp.relpath("pooling.o"))
    f = remote.load_module("pooling.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("pooling", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, res_arr)
        print(cost)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        res_orig = np.maximum(res_orig, 0.0)
        res_ref = res_ref.astype(env.out_dtype)
        res_orig = res_orig.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s POOLING TEST %s: Time cost = %g sec/op" % (device, status, cost.mean))

    return correct, cost, stats
# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# Compile the TVM module
my_gemm = vta.build(
    s, [data, weight, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm"
)
temp = utils.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and weight arrays randomly in the int range of (-128, 128]
data_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(data.dtype)
weight_np = np.random.randint(-128, 128, size=(out_channels, in_channels)).astype(weight.dtype)

# Apply packing to the data and weight arrays from a 2D to a 4D packed layout
data_packed = data_np.reshape(
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be directly called from the python side.
#
# In the following line, we use :code:`tvm.build` to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs) as well as the target language
# we want to compile to.
#
my_vadd = vta.build(
    s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_vadd"
)

######################################################################
# Saving the Module
# ~~~~~~~~~~~~~~~~~
# TVM lets us save our module into a file so it can be loaded back later. This
# is called ahead-of-time compilation and allows us to save some compilation
# time.
# More importantly, this allows us to cross-compile the executable on our
# development machine and send it over to the Pynq FPGA board over RPC for
# execution.

# Write the compiled module into an object file.
temp = utils.tempdir()
my_vadd.save(temp.relpath("vadd.o"))
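# For context, the saved object file is then shipped to the board and loaded back
# as a module. The two lines below are a sketch of that step, mirroring the
# upload/load pattern used by the other snippets in this section; they assume an
# RPC session named `remote` has already been established.
remote.upload(temp.relpath("vadd.o"))  # send the cross-compiled object to the board
f = remote.load_module("vadd.o")       # load it back as a callable module over RPC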
def check_alu(tvm_op, np_op=None, use_imm=False):
    """Test ALU"""
    m = 8
    n = 8
    imm = np.random.randint(1, 5)
    # compute
    a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                        lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    if use_imm:
        res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm_op(a_buf(*i), imm), "res_buf")  # compute
    else:
        b = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
        b_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                            lambda *i: b(*i), "b_buf")  # DRAM->SRAM
        res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm_op(a_buf(*i), b_buf(*i)), "res_buf")  # compute
    res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                      lambda *i: res_buf(*i).astype(env.inp_dtype), "res")  # SRAM->DRAM
    # schedule
    s = tvm.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_buf].set_scope(env.acc_scope)  # SRAM
    s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    if not use_imm:
        s[b_buf].set_scope(env.acc_scope)  # SRAM
        s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

    if not remote:
        return

    # build
    with vta.build_config():
        if use_imm:
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        else:
            mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
    temp = util.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    ctx = remote.ext_dev(0)
    a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    if use_imm:
        res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
    else:
        b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
        res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
    if env.TARGET == "tsim":
        simulator.tsim_init("libvta_hw")
    if use_imm:
        f(a_nd, res_nd)
    else:
        b_nd = tvm.nd.array(b_np, ctx)
        f(a_nd, b_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.asnumpy())
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):
    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            batch_size // env.BATCH, env.BATCH, in_feat // env.BLOCK_IN, env.BLOCK_IN
        ).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(
            out_feat // env.BLOCK_OUT, env.BLOCK_OUT, in_feat // env.BLOCK_IN, env.BLOCK_IN
        ).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res], target=target,
                        target_host=env.target_host, name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res], target=target,
                        target_host=env.target_host, name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("dense", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
def run_conv2d_transpose(
    env, remote, wl, target, check_correctness=True, print_ir=False, samples=4
):
    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.arm_cpu.conv2d_transpose_nchw
        fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.conv2d_transpose_packed
        fschedule = vta.top.schedule_conv2d_transpose_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(
            data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad)
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
    num_ops = (
        2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(
            w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
        ).astype(kernel.dtype)
        r_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
            (wl.o_hpad, wl.o_wpad),
        ).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH, env.BATCH,
            wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
            wl.height, wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
            wl.out_filter // env.BLOCK_OUT, env.BLOCK_OUT,
            wl.hkernel, wl.wkernel,
        ).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, res],
                target=target,
                target_host=env.target_host,
                name="conv2d_transpose",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, res],
            target=target,
            target_host=env.target_host,
            name="conv2d_transpose",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d_transpose", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be directly called from the python side.
#
# In the following line, we use :code:`tvm.build` to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs) as well as the target language
# we want to compile to.
#
my_vadd = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_vadd")

######################################################################
# Saving the Module
# ~~~~~~~~~~~~~~~~~
# TVM lets us save our module into a file so it can be loaded back later. This
# is called ahead-of-time compilation and allows us to save some compilation
# time.
# More importantly, this allows us to cross-compile the executable on our
# development machine and send it over to the Pynq FPGA board over RPC for
# execution.

# Write the compiled module into an object file.
temp = util.tempdir()
my_vadd.save(temp.relpath("vadd.o"))
######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(
        s, [data, kernel, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_conv"
    )
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(
    -128, 128, size=(batch_size, in_channels, height, width)).astype(data.dtype)
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(
    -128, 128, size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
    """Test ALU"""
    m = 8
    n = 8
    imm = np.random.randint(1, 5)
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
    )  # DRAM->SRAM
    if use_imm:
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf"
        )  # compute
    else:
        b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
        b_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf"
        )  # DRAM->SRAM
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
            "res_buf",
        )  # compute
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: res_buf(*i).astype(env.inp_dtype),
        "res",
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_buf].set_scope(env.acc_scope)  # SRAM
    s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    if not use_imm:
        s[b_buf].set_scope(env.acc_scope)  # SRAM
        s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

    if not remote:
        return

    # build
    with vta.build_config():
        if use_imm:
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        else:
            mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")
    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    if use_imm:
        res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
    else:
        b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
        res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    if use_imm:
        f(a_nd, res_nd)
    else:
        b_nd = tvm.nd.array(b_np, dev)
        f(a_nd, b_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("ALU {} execution statistics:".format(test_name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
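# For context, a test driver would call check_alu once per ALU operator, passing a
# TE expression for the on-device compute and a NumPy counterpart for the reference.
# The calls below are an illustrative sketch only; the specific operator/name
# pairings are assumptions rather than the ones used by this test file.
check_alu(lambda x, y: x + y, np.add, test_name="ADD")                     # tensor-tensor add
check_alu(lambda x, y: x + y, np.add, use_imm=True, test_name="ADD_IMM")   # add with immediate
check_alu(tvm.te.max, np.maximum, test_name="MAX")                         # element-wise max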