def test_bfloat_add_and_cast_2():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(
        topi.cast(X, dtype="custom[bfloat]16") +
        topi.cast(Y, dtype="custom[bfloat]16"),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])
    ctx = tvm.context(tgt, 0)
    # Used float32 calculator at http://www.weitz.de/ieee/. Generated
    # unconstrained float32s for the operands and copied them in to x and y.
    # Then, to simulate float32->bfloat16 conversion implemented by the mybfloat
    # library, I cut off all but 7 bits of the mantissa. I then added the
    # numbers. To simulate bfloat16 add implemented in mybfloat, I cut off all
    # but 7 bits of the result's mantissa. I then copied that value into
    # z_expected.
    x = tvm.nd.array(
        np.array([1.2348297, -1.0298302E25, 1.2034023E-30]).astype("float32"),
        ctx=ctx)
    y = tvm.nd.array(
        np.array([-2.4992788, -9.888288E19, 9.342338E-29]).astype("float32"),
        ctx=ctx)
    z_expected = np.array([-1.25, -1.027587E25, 9.426888E-29]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)
    built_cast(x, y, z)
    assert np.array_equal(z_expected, z.asnumpy())

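# A minimal NumPy sketch (illustration only, not part of the test above) of the manual
# derivation described in that comment: dropping all but the top 7 mantissa bits of a
# float32 is the same as zeroing its low 16 bits. The function name below is hypothetical.
import numpy as np

def truncate_to_bfloat(x):
    """Truncate float32 values to a 7-bit mantissa (bfloat16-style, no rounding)."""
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

# Truncate the operands, add, then truncate the sum again, as the comment describes.
x_t = truncate_to_bfloat([1.2348297, -1.0298302E25, 1.2034023E-30])
y_t = truncate_to_bfloat([-2.4992788, -9.888288E19, 9.342338E-29])
print(truncate_to_bfloat(x_t + y_t))  # approximately [-1.25, -1.027587e25, 9.426888e-29]
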
def test_bfloat_add_and_cast_1():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(
        topi.cast(X, dtype="custom[bfloat]16") +
        topi.cast(Y, dtype="custom[bfloat]16"),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])
    ctx = tvm.context(tgt, 0)
    # Used float32 calculator at http://www.weitz.de/ieee/. Generated float32s
    # with at most 7-bit mantissas which, when added, produce a result with at
    # most 7-bit mantissas. This is to ensure there are no errors due to
    # float32->bfloat16 conversions.
    x = tvm.nd.array(
        np.array([4.4103796E-32, 14942208.0, 1.78125]).astype("float32"),
        ctx=ctx)
    y = tvm.nd.array(
        np.array([-3.330669E-14, 19660800.0, 2.25]).astype("float32"),
        ctx=ctx)
    z_expected = np.array([-3.330669E-14, 34603008.0, 4.03125]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)
    built_cast(x, y, z)
    assert np.array_equal(z_expected, z.asnumpy())

def get_promoted(op):
    a = te.placeholder((100, ), dtype='bfloat16')
    b = te.placeholder((100, ), dtype='bfloat16')
    c = te.compute((100, ), lambda i: topi.cast(
        op(topi.cast(a[i], 'float'), topi.cast(b[i], 'float')),
        'bfloat16')
    )
    s = te.create_schedule(c.op)
    func = tvm.driver.build_module.form_irmodule(s, [a, b, c], "main", None)["main"]
    return func.body

def get_promoted(op):
    a = te.placeholder((100, ), dtype="bfloat16")
    b = te.placeholder((100, ), dtype="bfloat16")
    c = te.compute(
        (100, ),
        lambda i: topi.cast(
            op(topi.cast(a[i], "float"), topi.cast(b[i], "float")), "bfloat16"),
    )
    s = te.create_schedule(c.op)
    func = tvm.driver.build_module.schedule_to_module(
        s, [a, b, c], "main", None)["main"]
    return func.body

def check(t0, t1, factor):
    if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # compute
    n = 128
    A = te.placeholder((n, ), dtype=t0, name="A")
    B = te.placeholder((n, ), dtype=t1, name="B")
    C = te.compute((n, ), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")

    # schedule
    s = tvm.te.create_schedule(C.op)
    ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
    s[C].vectorize(ib)
    s[C].bind(ob, tx)
    func = tvm.build(s, [A, B, C], "cuda")

    # correctness
    dev = tvm.gpu(0)
    low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10)
    a_np = np.random.randint(low, high, size=n).astype(A.dtype)
    b_np = np.random.randint(low, high, size=n).astype(B.dtype)
    c_np = (a_np + b_np).astype(A.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    b_nd = tvm.nd.array(b_np, dev)
    c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev)
    func(a_nd, b_nd, c_nd)
    tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3)

def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d_transpose_nchw(
            Input=data,
            Filter=kernel,
            strides=strides,
            padding=padding,
            out_dtype=env.acc_dtype,
            output_padding=opadding,
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_conv2d_transpose_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, res]

def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups):
    CI_G = CI // groups
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI_G // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.group_conv2d_nchw(data, kernel, strides, padding, dilation,
                                        groups, env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_group_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]

def verify(from_dtype, to_dtype, low=-100, high=100):
    shape = (5, 4)
    A = te.placeholder(shape, dtype=from_dtype, name="A")
    B = topi.cast(A, to_dtype)

    if from_dtype == "bool":
        a_np = np.random.choice([True, False], size=shape)
    else:
        a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
    if to_dtype == "bool":
        a_np = a_np - a_np[2, 3]
    b_np = a_np.astype(to_dtype)

    for device in get_all_backend():
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            continue
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = tvm.topi.testing.get_injective_schedule(device)(B)
        foo = tvm.build(s, [A, B], device)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx)
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np)

def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d(input=data,
                             filter=kernel,
                             padding=padding,
                             strides=strides,
                             dilation=dilation,
                             layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN),
                             out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]

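# Reference-side sketch in NumPy (illustration only) of the requantization steps applied
# to the accumulator above: shift right by WGT_WIDTH, clip into the non-negative part of
# the signed OUT_WIDTH range, then cast down to the output dtype. With OUT_WIDTH = 8 the
# clip bound (1 << (OUT_WIDTH - 1)) - 1 is 127. The widths and values below are hypothetical.
import numpy as np

acc = np.array([-70000, -3, 5000, 2**20], dtype=np.int32)  # hypothetical acc_dtype values
WGT_WIDTH, OUT_WIDTH = 8, 8                                # hypothetical bit widths
out = np.clip(acc >> WGT_WIDTH, 0, (1 << (OUT_WIDTH - 1)) - 1).astype(np.int8)
print(out)  # [  0   0  19 127]
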
def upsampling(data, scale_h, scale_w, layout="NCHW", method='nearest_neighbor',
               align_corners=False, output_shape=None):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    inputs : tvm.te.Tensor
        inputs is a 4-D tensor with shape
        [batch, channel, in_height, in_width]
        or [batch, in_height, in_width, channel]

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCHW" or "NHWC"

    method : {"bilinear", "nearest_neighbor", "bicubic"}
        Method to be used for upsampling.

    Returns
    -------
    output : tvm.te.Tensor
        4-D with shape [batch, channel, in_height*scale_h, in_width*scale_w]
        or [batch, in_height*scale, in_width*scale, channel]
    """
    base_layout = layout[0:4]
    if base_layout == "NCHW":
        if not output_shape:  # static case
            scaled_h = data.shape[2] * scale_h
            scaled_w = data.shape[3] * scale_w
            reshape_size = (simplify(topi.cast(te.round(scaled_h), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[3].dtype)))
        else:  # dynamic case -- we don't need to scale; already done in shape func
            reshape_size = (simplify(topi.cast(te.round(output_shape[2]), output_shape[2].dtype)),
                            simplify(topi.cast(te.round(output_shape[3]), output_shape[3].dtype)))
    elif layout == "NHWC":
        if not output_shape:  # static case
            scaled_h = data.shape[1] * scale_h
            scaled_w = data.shape[2] * scale_w
            reshape_size = (simplify(topi.cast(te.round(scaled_h), data.shape[1].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[2].dtype)))
        else:  # dynamic case
            reshape_size = (simplify(topi.cast(te.round(output_shape[1]), output_shape[1].dtype)),
                            simplify(topi.cast(te.round(output_shape[2]), output_shape[2].dtype)))
    else:
        raise ValueError("not support this layout {} yet".format(layout))
    coord_trans = "align_corners" if align_corners else "asymmetric"
    return topi.image.resize(data, reshape_size, layout=layout,
                             method=method, coordinate_transformation_mode=coord_trans,
                             output_shape=output_shape)

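# A minimal usage sketch (assuming this function is exposed as tvm.topi.nn.upsampling, with
# the signature shown above, and a plain TE build flow); it doubles the spatial dims of an
# NCHW tensor with nearest-neighbor interpolation.
import tvm
from tvm import te, topi

data = te.placeholder((1, 16, 32, 32), name="data", dtype="float32")
out = topi.nn.upsampling(data, scale_h=2.0, scale_w=2.0,
                         layout="NCHW", method="nearest_neighbor")
s = te.create_schedule(out.op)
mod = tvm.build(s, [data, out], target="llvm")  # output shape is (1, 16, 64, 64)
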
def test_bfloat_add_and_cast_FloatImm():
    X = te.placeholder((3, ), name="X")
    Z = topi.cast(
        topi.add(
            topi.cast(X, dtype="custom[bfloat]16"),
            tvm.tir.FloatImm("custom[bfloat]16", 1.5)),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Z])
    ctx = tvm.context(tgt, 0)
    x = tvm.nd.array(np.array([0.0, 1.0, 1.5]).astype("float32"), ctx=ctx)
    z_expected = np.array([1.5, 2.5, 3.0]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)
    built_cast(x, z)
    assert np.array_equal(z_expected, z.asnumpy())

def to16(v):
    uint32_v = tvm.tir.call_intrin("uint32", "tir.reinterpret", v)
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                        uint32_v, tvm.tir.const(16, "uint32"))
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.bitwise_and",
                                        rounding_bias, tvm.tir.const(1, "uint32"))
    rounding_bias = rounding_bias + tvm.tir.const(0x7FFF, "uint16")
    uint32_v = uint32_v + rounding_bias
    uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                   uint32_v, tvm.tir.const(16, "uint32"))
    return topi.cast(uint32_v, "uint16")

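# A NumPy sketch (illustration only) of the same round-to-nearest-even conversion the TIR
# intrinsics above express: reinterpret float32 as uint32, add a rounding bias of 0x7FFF
# plus the lowest kept mantissa bit, then take the high 16 bits as the bfloat16 payload.
# The function name below is hypothetical.
import numpy as np

def fp32_to_bf16_bits(x):
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    rounding_bias = ((bits >> np.uint32(16)) & np.uint32(1)) + np.uint32(0x7FFF)
    return ((bits + rounding_bias) >> np.uint32(16)).astype(np.uint16)

print(fp32_to_bf16_bits([1.0, 3.1415927]))  # [16256 16457], i.e. 0x3F80 and 0x4049
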
def _topi_multiply(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    A, B = args[:2]

    with tvm.target.vta():
        res = vta.top.op.multiply_packed(*args, **kwargs)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == "vta":
        s = vta.top.op.schedule_multiply_packed([res])
    else:
        s = te.create_schedule([res.op])

    return s, [A, B, res]

def test_cast(target, dev, cast_ref_data, from_dtype, to_dtype):
    m = te.var("m")
    l = te.var("l")
    A = te.placeholder((m, l), dtype=from_dtype, name="A")
    B = topi.cast(A, to_dtype)

    a_np, b_np = cast_ref_data

    with tvm.target.Target(target):
        s = tvm.topi.testing.get_injective_schedule(target)(B)
    foo = tvm.build(s, [A, B], target)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.empty(b_np.shape, dtype=to_dtype, device=dev)
    foo(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np)

def _topi_nn_conv2d(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    A, W = args[:2]

    with tvm.target.vta():
        res = vta.top.conv2d_packed(*args, **kwargs)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == 'vta':
        s = vta.top.schedule_conv2d_packed([res])
    else:
        s = te.create_schedule([res.op])

    return s, [A, W, res]

def _topi_nn_conv2d(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    args = deserialize_args(args)
    A, W = args[:2]

    with tvm.target.vta():
        res = topi.nn.conv2d(*args, **kwargs)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [A, W, res]

def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW", method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel"):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    inputs : tvm.te.Tensor
        inputs is a 5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale, in_height*scale, in_width*scale]
        or [batch, in_depth*scale, in_height*scale, in_width*scale, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        out_shape = (simplify(topi.cast(te.round(data.shape[2] * scale_d), data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_h), data.shape[3].dtype)),
                     simplify(topi.cast(te.round(data.shape[4] * scale_w), data.shape[4].dtype)))
    elif layout == "NDHWC":
        out_shape = (simplify(topi.cast(te.round(data.shape[1] * scale_d), data.shape[1].dtype)),
                     simplify(topi.cast(te.round(data.shape[2] * scale_h), data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_w), data.shape[3].dtype)))
    else:
        raise ValueError("not support this layout {} yet".format(layout))
    return topi.image.resize3d(data, out_shape, layout=layout, method=method,
                               coordinate_transformation_mode=coordinate_transformation_mode)

def dense(N, CI, CO):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.dense(data, kernel, None, "int32")
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_dense([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, res]

def verify(from_dtype, to_dtype, low=-100, high=100):
    shape = (5, 4)
    A = te.placeholder(shape, dtype=from_dtype, name="A")
    B = topi.cast(A, to_dtype)

    if from_dtype == "bool":
        a_np = np.random.choice([True, False], size=shape)
    else:
        a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
    if to_dtype == "bool":
        a_np = a_np - a_np[2, 3]
    b_np = a_np.astype(to_dtype)

    for target, dev in tvm.testing.enabled_targets():
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s = tvm.topi.testing.get_injective_schedule(target)(B)
        foo = tvm.build(s, [A, B], target)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.empty(shape=shape, dtype=to_dtype, device=dev)
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np)

def to16(v): return topi.cast(v, "bfloat16")
def to32(v): return topi.cast(v, "float")
def run_conv2d_transpose(
    env, remote, wl, target, check_correctness=True, print_ir=False, samples=4
):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.arm_cpu.conv2d_transpose_nchw
        fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.conv2d_transpose_packed
        fschedule = vta.top.schedule_conv2d_transpose_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(
            data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad)
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(
            w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
        ).astype(kernel.dtype)
        r_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
            (wl.o_hpad, wl.o_wpad),
        ).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.hkernel,
            wl.wkernel,
        ).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, res],
                target=target,
                target_host=env.target_host,
                name="conv2d_transpose",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, res],
            target=target,
            target_host=env.target_host,
            name="conv2d_transpose",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d_transpose", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats

def to32(v): uint32_v = topi.cast(v, "uint32") uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_left", uint32_v, tvm.tir.const(16, "uint32")) return tvm.tir.call_intrin("float32", "tir.reinterpret", uint32_v)
def to32(v):
    return topi.cast(v, 'float')

def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW", method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel", output_shape=None):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    inputs : tvm.te.Tensor
        inputs is a 5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    output_shape: tvm.tir.container.Array, optional
        Shape to return. If left None will be inferred
        (If shape is determined dynamically, pass out_dtype.shape as output_shape)

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale, in_height*scale, in_width*scale]
        or [batch, in_depth*scale, in_height*scale, in_width*scale, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        if not output_shape:  # static case
            scaled_d = data.shape[2] * scale_d
            scaled_h = data.shape[3] * scale_h
            scaled_w = data.shape[4] * scale_w
            resize_shape = (simplify(topi.cast(te.round(scaled_d), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(scaled_h), data.shape[3].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[4].dtype)))
        else:  # dynamic case -- don't need to scale; already done in shape func
            resize_shape = (simplify(topi.cast(te.round(output_shape[2]), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(output_shape[3]), data.shape[3].dtype)),
                            simplify(topi.cast(te.round(output_shape[4]), data.shape[4].dtype)))
    elif layout == "NDHWC":
        if not output_shape:  # static case
            scaled_d = data.shape[1] * scale_d
            scaled_h = data.shape[2] * scale_h
            scaled_w = data.shape[3] * scale_w
            resize_shape = (simplify(topi.cast(te.round(scaled_d), data.shape[1].dtype)),
                            simplify(topi.cast(te.round(scaled_h), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[3].dtype)))
        else:  # dynamic case
            resize_shape = (simplify(topi.cast(te.round(output_shape[1]), data.shape[1].dtype)),
                            simplify(topi.cast(te.round(output_shape[2]), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(output_shape[3]), data.shape[3].dtype)))
    else:
        raise ValueError("not support this layout {} yet".format(layout))
    return topi.image.resize3d(data, resize_shape, layout=layout, method=method,
                               coordinate_transformation_mode=coordinate_transformation_mode)

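# A minimal usage sketch (assuming this function is exposed as tvm.topi.nn.upsampling3d with
# the signature shown above); it doubles depth, height, and width of an NCDHW tensor.
import tvm
from tvm import te, topi

data = te.placeholder((1, 8, 4, 16, 16), name="data", dtype="float32")
out = topi.nn.upsampling3d(data, scale_d=2.0, scale_h=2.0, scale_w=2.0,
                           layout="NCDHW", method="nearest_neighbor")
s = te.create_schedule(out.op)
mod = tvm.build(s, [data, out], target="llvm")  # output shape is (1, 8, 8, 32, 32)
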
def to16(v):
    return topi.cast(v, 'bfloat16')

def run_group_conv2d(env, remote, wl, target,
                     check_correctness=True, print_ir=False, samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.nn.group_conv2d_nchw
        fschedule = topi.generic.schedule_group_conv2d_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.group_conv2d_packed
        fschedule = vta.top.schedule_group_conv2d_packed

    # Derive shapes depending upon packing
    CI_G = wl.in_filter // wl.groups
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT,
                      1, 1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, (wl.hstride, wl.wstride), padding, (1, 1),
                       wl.groups, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \
        wl.out_filter * wl.in_filter // wl.groups

    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), \
            1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad, wl.groups).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.out_filter // env.BLOCK_OUT, env.BLOCK_OUT,
                                      CI_G // env.BLOCK_IN, env.BLOCK_IN,
                                      wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT,
                                  1, 1, env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS"
          % (device, status, cost.mean, gops))

    return correct, cost, stats

def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):
    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        r_np = np.dot(a_np.astype(env.acc_dtype),
                      w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(batch_size // env.BATCH, env.BATCH,
                                  in_feat // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(out_feat // env.BLOCK_OUT, env.BLOCK_OUT,
                                      in_feat // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("dense", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats