def test_bfloat_add_and_cast_1():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(
        topi.cast(X, dtype="custom[bfloat]16") +
        topi.cast(Y, dtype="custom[bfloat]16"),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])
    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated float32s
    # with at most 7-bit mantissas which, when added, produce a result with at
    # most 7-bit mantissas. This is to ensure there are no errors due to
    # float32->bfloat16 conversions.
    x = tvm.nd.array(
        np.array([4.4103796E-32, 14942208.0, 1.78125]).astype("float32"), ctx=ctx)
    y = tvm.nd.array(
        np.array([-3.330669E-14, 19660800.0, 2.25]).astype("float32"), ctx=ctx)
    z_expected = np.array([-3.330669E-14, 34603008.0, 4.03125]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
def test_bfloat_add_and_cast_2():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(
        topi.cast(X, dtype="custom[bfloat]16") +
        topi.cast(Y, dtype="custom[bfloat]16"),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])
    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated
    # unconstrained float32s for the operands and copied them in to x and y.
    # Then, to simulate float32->bfloat16 conversion implemented by the mybfloat
    # library, I cut off all but 7 bits of the mantissa. I then added the
    # numbers. To simulate bfloat16 add implemented in mybfloat, I cut off all
    # but 7 bits of the result's mantissa. I then copied that value into
    # z_expected.
    x = tvm.nd.array(
        np.array([1.2348297, -1.0298302E25, 1.2034023E-30]).astype("float32"), ctx=ctx)
    y = tvm.nd.array(
        np.array([-2.4992788, -9.888288E19, 9.342338E-29]).astype("float32"), ctx=ctx)
    z_expected = np.array([-1.25, -1.027587E25, 9.426888E-29]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
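# The comments in both tests above describe simulating float32->bfloat16 conversion by
# discarding all but the top 7 mantissa bits. A minimal NumPy sketch of that truncation,
# assuming the mybfloat library truncates rather than rounds (the helper name is
# illustrative and not part of the test suite):
import numpy as np

def truncate_fp32_to_bf16(a):
    """Zero out the low 16 bits of each float32, leaving a 7-bit mantissa."""
    bits = np.asarray(a, dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

# e.g. 1.2348297 loses its low mantissa bits and becomes a bfloat16-representable value
print(truncate_fp32_to_bf16([1.2348297, -2.4992788]))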
def _transform(theta, input_dim, out_size, input_shape, dtype):
    num_batch = input_shape[0]
    height = input_shape[1]
    width = input_shape[2]
    num_channels = input_shape[3]

    theta = topi.reshape(theta, (num_batch, 2, 3))
    theta = topi.cast(theta, dtype)
    out_height = out_size[0]
    out_width = out_size[1]

    grid = _meshgrid(out_height, out_width)
    grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
    grid = topi.cast(grid, dtype=dtype)

    k = tvm.reduce_axis((0, 3), 'k')
    T_g = tvm.compute(
        (num_batch, 2, out_height * out_width),
        lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis=k),
        name='T_g')

    x_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 0, k], name='x_s')
    y_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 1, k], name='y_s')

    x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
    y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

    input_transformed = _interpolate(input_dim, input_shape, x_s_flat, y_s_flat,
                                     out_size, dtype)
    output = topi.reshape(input_transformed,
                          [num_batch, out_height, out_width, num_channels])
    return output
def upsampling(data, scale_h, scale_w, layout="NCHW", method='nearest_neighbor',
               align_corners=False):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    inputs : tvm.Tensor
        inputs is a 4-D tensor with shape
        [batch, channel, in_height, in_width]
        or [batch, in_height, in_width, channel]

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCHW" or "NHWC"

    method : {"bilinear", "nearest_neighbor", "bicubic"}
        Method to be used for upsampling.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, in_height*scale_h, in_width*scale_w]
        or [batch, in_height*scale, in_width*scale, channel]
    """
    base_layout = layout[0:4]
    if base_layout == "NCHW":
        out_shape = (simplify(topi.cast(tvm.round(data.shape[2] * scale_h),
                                        data.shape[2].dtype)),
                     simplify(topi.cast(tvm.round(data.shape[3] * scale_w),
                                        data.shape[3].dtype)))
    elif layout == "NHWC":
        out_shape = (simplify(topi.cast(tvm.round(data.shape[1] * scale_h),
                                        data.shape[1].dtype)),
                     simplify(topi.cast(tvm.round(data.shape[2] * scale_w),
                                        data.shape[2].dtype)))
    else:
        raise ValueError("not support this layout {} yet".format(layout))
    coord_trans = "align_corners" if align_corners else "asymmetric"
    return topi.image.resize(data, out_shape, layout=layout, method=method,
                             coordinate_transformation_mode=coord_trans)
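# For reference, the output spatial extent above is just round(input_extent * scale),
# cast back to the shape's dtype. A plain-Python illustration (the numbers are made up):
in_height, in_width = 32, 32
scale_h, scale_w = 1.5, 2.0
out_height = int(round(in_height * scale_h))  # 48
out_width = int(round(in_width * scale_w))    # 64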
def get_promoted(op):
    a = te.placeholder((100, ), dtype='bfloat16')
    b = te.placeholder((100, ), dtype='bfloat16')
    c = te.compute((100, ), lambda i: topi.cast(
        op(topi.cast(a[i], 'float'),
           topi.cast(b[i], 'float')),
        'bfloat16'))
    s = te.create_schedule(c.op)
    func = tvm.driver.build_module.form_irmodule(
        s, [a, b, c], "main", None)["main"]
    return func.body
def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W,
                  env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1,
                  env.BATCH, env.BLOCK_OUT)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d(input=data, filter=kernel, padding=padding,
                             strides=strides, dilation=dilation,
                             layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN),
                             out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, bias, res]
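# The shift/clip/cast tail above is the usual VTA requantization pattern. A hedged NumPy
# sketch of the same arithmetic, with 8-bit widths standing in for env.WGT_WIDTH,
# env.OUT_WIDTH and env.out_dtype (the accumulator values are made up):
import numpy as np

acc = np.array([-70000, 3000, 40000, 900000], dtype=np.int32)
shifted = acc >> 8                            # topi.right_shift(res, env.WGT_WIDTH)
clipped = np.clip(shifted, 0, (1 << 7) - 1)   # my_clip(res, 0, (1 << OUT_WIDTH - 1) - 1)
out = clipped.astype(np.int8)                 # topi.cast(res, env.out_dtype)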
def verify(from_dtype, to_dtype, low=-100, high=100):
    shape = (5, 4)
    A = tvm.placeholder(shape, dtype=from_dtype, name="A")
    B = topi.cast(A, to_dtype)

    if from_dtype == "bool":
        a_np = np.random.choice([True, False], size=shape)
    else:
        a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
    if to_dtype == "bool":
        a_np = a_np - a_np[2, 3]
    b_np = a_np.astype(to_dtype)

    for device in get_all_backend():
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            continue
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(B)
        foo = tvm.build(s, [A, B], device)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx)
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np)
def check(t0, t1):
    if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # compute
    n = 128
    A = te.placeholder((n, ), dtype=t0, name='A')
    B = te.placeholder((n, ), dtype=t1, name='B')
    C = te.compute((n, ), lambda i: A[i] + topi.cast(B[i], A.dtype), name='C')

    # schedule
    s = tvm.te.create_schedule(C.op)
    ob, ib = s[C].split(s[C].op.axis[0], nparts=32)
    _, iib = s[C].split(ib, factor=4)
    s[C].vectorize(iib)
    s[C].bind(ob, tx)
    func = tvm.build(s, [A, B, C], "cuda")

    # correctness
    ctx = tvm.gpu(0)
    low, high = (0, 20) if t0.startswith('u') or t1.startswith('u') else (-10, 10)
    a_np = np.random.randint(low, high, size=n).astype(A.dtype)
    b_np = np.random.randint(low, high, size=n).astype(B.dtype)
    c_np = (a_np + b_np).astype(A.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), ctx)
    func(a_nd, b_nd, c_nd)
    tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3)
def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups):
    CI_G = CI // groups
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W,
                  env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI_G // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1,
                  env.BATCH, env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.group_conv2d_nchw(data, kernel, strides, padding,
                                        dilation, groups, env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_group_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]
def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W,
                  env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d_transpose_nchw(Input=data, Filter=kernel,
                                            strides=strides, padding=padding,
                                            out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.current_target().device_name == 'vta':
        s = topi.generic.schedule_conv2d_transpose_nchw([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, res]
def Cast(device="llvm", lib_path="./", ndim=None, src_dtype=None, dst_dtype=None):
    '''
    cast

    Args:
        device: target device the library is built for
        lib_path: directory where the generated library is written
        ndim: rank of the input tensor
        src_dtype: dtype of the input tensor
        dst_dtype: dtype the input is cast to

    Returns:
        None
    '''
    shape = [tvm.var("n" + str(i)) for i in range(ndim)]
    opname = "Cast_ndim%d_%s_%s" % (ndim, src_dtype, dst_dtype)
    print(opname)

    # define compute
    in_tensor = tvm.placeholder(shape, dtype=src_dtype, name='in_tensor')
    out_tensor = topi.cast(in_tensor, dst_dtype)
    tensor_list = [in_tensor, out_tensor]
    s = topi.generic.schedule_injective(out_tensor)
    Genlib(s, tensor_list, device, opname, lib_path)
def test_bfloat_add_and_cast_FloatImm():
    X = te.placeholder((3, ), name="X")
    Z = topi.cast(
        topi.add(topi.cast(X, dtype="custom[bfloat]16"),
                 tvm.tir.FloatImm("custom[bfloat]16", 1.5)),
        dtype="float")
    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Z])
    ctx = tvm.context(tgt, 0)

    x = tvm.nd.array(np.array([0.0, 1.0, 1.5]).astype("float32"), ctx=ctx)
    z_expected = np.array([1.5, 2.5, 3.0]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, z)

    assert np.array_equal(z_expected, z.asnumpy())
def _compute_offset(in_tensor, in_shape, out_shape, attr_list, nz_format_flag):
    """
    apply the offset and cast the result to int8

    Parameters
    ----------
    in_tensor : input tensor
    in_shape : the shape of input tensor
    out_shape : the shape of output tensor
    attr_list : the attr list
    nz_format_flag : the format of input tensor

    Returns
    -------
    res tensor
    """
    offset = attr_list[0]
    reform_flag = attr_list[1]
    scale = attr_list[2]
    if offset != 0 or scale == 1:
        offset_value = tvm.const(offset, "float16")
        if reform_flag:
            offset_ub = _reform_by_vadds(in_tensor, in_shape, out_shape,
                                         offset_value, nz_format_flag)
        else:
            offset_ub = tvm.compute(
                out_shape,
                lambda *indice: in_tensor(*indice) + offset_value,
                name="offset_ub")
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(offset_ub(*indice), "int8"),
            name='cast_i8_ub')
    else:
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(in_tensor(*indice), "int8"),
            name='cast_i8_ub')
    return cast_i8_ub
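# A rough NumPy analogue of the offset branch above, assuming a plain truncating cast to
# int8 (the rounding mode of the real hardware cast may differ; values are hypothetical):
import numpy as np

x = np.array([1.25, -3.5, 7.9], dtype=np.float16)   # stand-in for an input tile
offset_value = np.float16(2.0)                      # stand-in for attr_list[0]
offset_ub = x + offset_value
cast_i8_ub = offset_ub.astype(np.int8)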
def to16(v):
    uint32_v = tvm.tir.call_intrin("uint32", "tir.reinterpret", v)
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                        uint32_v, tvm.tir.const(16, "uint32"))
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.bitwise_and",
                                        rounding_bias, tvm.tir.const(1, "uint32"))
    rounding_bias = rounding_bias + tvm.tir.const(0x7FFF, "uint16")
    uint32_v = uint32_v + rounding_bias
    uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                   uint32_v, tvm.tir.const(16, "uint32"))
    return topi.cast(uint32_v, 'uint16')
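# The helper above performs round-to-nearest-even float32->bfloat16 conversion at the bit
# level: add 0x7FFF plus the lowest kept bit, then shift the top half down. The same trick
# in NumPy, for reference (a sketch only, not part of the TIR test):
import numpy as np

def np_fp32_to_bf16_rne(a):
    """Round-to-nearest-even bfloat16 conversion, returning the 16-bit pattern."""
    bits = np.asarray(a, dtype=np.float32).view(np.uint32)
    rounding_bias = ((bits >> np.uint32(16)) & np.uint32(1)) + np.uint32(0x7FFF)
    return ((bits + rounding_bias) >> np.uint32(16)).astype(np.uint16)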
def pick_single_out(outs):
    type_level = {"bool": 1, "int8": 2, "int32": 3, "float16": 4, "float32": 5}
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    ori_out = outs
    fake_out = False
    fork_node = []
    if len(outs) > 1:
        outs, fork_node = check_multi_out(outs)
        if len(outs) > 1:
            fake_op = outs[0]
            highest_type = outs[0].dtype
            for node in outs[1:]:
                if node.dtype != highest_type:
                    if type_level[highest_type] > type_level[node.dtype]:
                        node = topi.cast(node, highest_type)
                    else:
                        highest_type = node.dtype
                        fake_op = topi.cast(fake_op, highest_type)
                fake_op = topi.add(node, fake_op)
            fake_out = True
            outs = [fake_op]
    tmp_out = [op for op in ori_out if op not in outs]
    return outs, tmp_out, fake_out, fork_node
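# Under the type_level table above, bool < int8 < int32 < float16 < float32, so mixed
# outputs are cast up to the widest dtype before being summed into the single fake output.
# A tiny illustration of that promotion rule (pure Python, not from the source):
type_level = {"bool": 1, "int8": 2, "int32": 3, "float16": 4, "float32": 5}

def promote(dt_a, dt_b):
    """Pick the higher-ranked dtype, mirroring the comparison in pick_single_out."""
    return dt_a if type_level[dt_a] > type_level[dt_b] else dt_b

assert promote("int8", "float16") == "float16"
assert promote("float32", "int32") == "float32"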
def _topi_nn_conv2d(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    A, W = args[:2]

    with tvm.target.vta():
        res = vta.top.conv2d_packed(*args, **kwargs)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == 'vta':
        s = vta.top.schedule_conv2d_packed([res])
    else:
        s = te.create_schedule([res.op])
    return s, [A, W, res]
def _topi_nn_conv2d(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    args = deserialize_args(args)
    A, W = args[:2]

    with tvm.target.vta():
        res = topi.nn.conv2d(*args, **kwargs)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.current_target().device_name == 'vta':
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = tvm.create_schedule([res.op])
    return s, [A, W, res]
def dense(N, CI, CO):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN,
                    env.BLOCK_OUT, env.BLOCK_IN)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.dense(data, kernel, None, 'int32')
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_dense([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, res]
def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True): data_shape = (batch_size, wl.in_filter, wl.height, wl.width) kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) res_conv = topi.nn.conv2d(data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride), out_dtype="int32") res = topi.right_shift(res_conv, 8) res = my_clip(res, 0, 127) res = topi.cast(res, "int8") # To compute number of ops, use a x2 factor for FMA num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter a_shape = (batch_size, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) stride = (wl.hstride, wl.wstride) data_dtype = data.dtype kernel_dtype = kernel.dtype acc_dtype = env.acc_dtype assert wl.hpad == wl.wpad padding = wl.hpad @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") def get_ref_data(): a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) a_np = np.abs(a_np) w_np = np.abs(w_np) b_np = topi.testing.conv2d_nchw_python(a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) return a_np, w_np, b_np def verify(s, check_correctness): mod = tvm.build(s, [data, kernel, res], target_host=env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.cpu(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_orig, ctx) kernel_arr = tvm.nd.array(kernel_orig, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, res_arr) res_unpack = res_arr.asnumpy() if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost def conv_normal(print_ir): print("----- CONV2D CPU End-to-End Test-------") s = topi.generic.schedule_conv2d_nchw([res]) if print_ir: print(tvm.lower(s, [data, kernel, res], simple_mode=True)) cost = verify(s, True) gops = (num_ops / cost.mean) / float(10**9) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) conv_normal(False)
def to32(v):
    uint32_v = topi.cast(v, "uint32")
    uint32_v = tvm.tir.call_pure_intrin("uint32", "tir.shift_left", uint32_v,
                                        tvm.tir.const(16, "uint32"))
    return tvm.tir.call_pure_intrin("float32", "tir.reinterpret", uint32_v)
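# to32 widens a bfloat16 bit pattern by shifting it into the top half of a uint32 and
# reinterpreting the result as float32. An equivalent NumPy sketch (the helper name is
# illustrative):
import numpy as np

def np_bf16_to_fp32(b16):
    """Place the 16 bfloat16 bits in the high half of a uint32, then reinterpret."""
    bits = np.asarray(b16, dtype=np.uint16).astype(np.uint32) << np.uint32(16)
    return bits.view(np.float32)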
def to32(v):
    return topi.cast(v, 'float')
def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4): # Workload assertions assert wl.hpad == wl.wpad # Perform packing only if we are targeting the accelerator if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) b_shape = (wl.batch, wl.out_filter, 1, 1) if data_pack: data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) else: data_shape = a_shape kernel_shape = w_shape bias_shape = b_shape data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) # Define base computation schedule with target: res = topi.nn.conv2d( data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), layout, env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule s = topi.generic.schedule_conv2d_nchw([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) # Derive number of ops fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") def get_ref_data(): # derive min max for act, wgt, and bias types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2) a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) r_np = topi.testing.conv2d_nchw_python( a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype) return a_np, w_np, b_np, r_np # Data in original format data_np, kernel_np, bias_np, res_ref = get_ref_data() if data_pack: data_np = data_np.reshape( wl.batch//env.BATCH, env.BATCH, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) kernel_np = kernel_np.reshape( wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) bias_np = bias_np.reshape( wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) # Build if "vta" in target.keys: mod = vta.build(s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d") else: mod = tvm.build(s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) 
remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") ctx = remote.context(str(target)) res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) bias_arr = tvm.nd.array(bias_np, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} cost = None if env.TARGET in ["sim", "tsim"]: # Check if we're in local RPC mode (allows us to rebuild the # runtime on the fly when varying the VTA designs) local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: if env.TARGET == "sim": remote.get_function("vta.simulator.profiler_clear")() else: remote.get_function("vta.tsim.profiler_clear")() cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) if env.TARGET == "sim": stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) else: stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) else: simulator.clear_stats() cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) stats = simulator.stats() else: cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) # Check correctness correct = False if check_correctness: res_orig = res_arr.asnumpy() if data_pack: res_orig = res_orig.transpose( (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) bias_np = bias_np.transpose( (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1) res_ref = res_ref >> env.WGT_WIDTH res_ref += bias_np res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) res_ref = res_ref.astype(env.out_dtype) correct = np.allclose(res_orig, res_ref) gops = (num_ops / cost.mean) / float(10 ** 9) status = "PASSED" if correct else "FAILED" if "arm_cpu" in target.keys: device = "CPU" elif "vta" in target.keys: device = "VTA" print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats
def to16(v):
    return topi.cast(v, 'bfloat16')
def compute_cast(attrs, inputs, _):
    """Compute definition of cast"""
    dtype = attrs.get_string("dtype")
    return topi.cast(inputs[0], dtype)
def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW",
                 method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel"):
    """Perform upsampling on the data.
       Nearest neighbor and trilinear upsampling are supported.

    Parameters
    ----------
    inputs : tvm.te.Tensor
        inputs is a 5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode : string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale, in_height*scale, in_width*scale]
        or [batch, in_depth*scale, in_height*scale, in_width*scale, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        out_shape = (simplify(topi.cast(te.round(data.shape[2] * scale_d),
                                        data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_h),
                                        data.shape[3].dtype)),
                     simplify(topi.cast(te.round(data.shape[4] * scale_w),
                                        data.shape[4].dtype)))
    elif layout == "NDHWC":
        out_shape = (simplify(topi.cast(te.round(data.shape[1] * scale_d),
                                        data.shape[1].dtype)),
                     simplify(topi.cast(te.round(data.shape[2] * scale_h),
                                        data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_w),
                                        data.shape[3].dtype)))
    else:
        raise ValueError("not support this layout {} yet".format(layout))
    return topi.image.resize3d(data, out_shape, layout=layout, method=method,
                               coordinate_transformation_mode=coordinate_transformation_mode)
def _interpolate(im, im_shape, x, y, out_size, dtype):
    num_batch = im_shape[0]
    height = im_shape[1]
    width = im_shape[2]
    channels = im_shape[3]

    out_height = out_size[0]
    out_width = out_size[1]
    max_y = int(im_shape[1] - 1)
    max_x = int(im_shape[2] - 1)

    # [-1, 1] -> [0, width - 1]
    x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)),
                      width / tvm.const(2, dtype=dtype))
    y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)),
                      height / tvm.const(2, dtype=dtype))

    # do sampling
    dim3 = out_height * out_width * num_batch

    x0 = topi.cast(topi.floor(x), 'int32')
    y0 = topi.cast(topi.floor(y), 'int32')
    x1 = topi.add(x0, tvm.const(1, dtype="int32"))
    y1 = topi.add(y0, tvm.const(1, dtype="int32"))

    x0 = topi.clip(x0, 0, max_x)
    x1 = topi.clip(x1, 0, max_x)
    y0 = topi.clip(y0, 0, max_y)
    y1 = topi.clip(y1, 0, max_y)

    dim2 = width
    dim1 = width * height

    base = tvm.compute(
        (dim3,),
        lambda i: (i // (out_height * out_width)) * width * height,
        name='base')
    base_y0 = topi.add(base, topi.multiply(y0, dim2))
    base_y1 = topi.add(base, topi.multiply(y1, dim2))

    idx_a = topi.add(base_y0, x0)
    idx_b = topi.add(base_y1, x0)
    idx_c = topi.add(base_y0, x1)
    idx_d = topi.add(base_y1, x1)

    im_flat = topi.reshape(im, (num_batch * height * width, channels))
    im_flat = topi.cast(im_flat, dtype)

    Ia = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_a[i], j], name='Ia')
    Ib = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_b[i], j], name='Ib')
    Ic = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_c[i], j], name='Ic')
    Id = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_d[i], j], name='Id')

    x0_f = topi.cast(x0, dtype)
    x1_f = topi.cast(x1, dtype)
    y0_f = topi.cast(y0, dtype)
    y1_f = topi.cast(y1, dtype)

    wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y1_f, y)), 1)
    wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y, y0_f)), 1)
    wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y1_f, y)), 1)
    wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y, y0_f)), 1)

    output = topi.add(
        topi.add(
            topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),
            topi.multiply(wc, Ic)),
        topi.multiply(wd, Id))
    return output
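# The four weights computed above are the standard bilinear coefficients; for an in-range
# sample point they sum to one. A small NumPy check of that fact (the sample coordinates
# are made up and purely illustrative):
import numpy as np

x, y = 2.3, 4.7
x0, y0 = int(np.floor(x)), int(np.floor(y))
x1, y1 = x0 + 1, y0 + 1
wa = (x1 - x) * (y1 - y)    # weight for the (y0, x0) corner
wb = (x1 - x) * (y - y0)    # weight for the (y1, x0) corner
wc = (x - x0) * (y1 - y)    # weight for the (y0, x1) corner
wd = (x - x0) * (y - y0)    # weight for the (y1, x1) corner
assert np.isclose(wa + wb + wc + wd, 1.0)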
# the op to a Call to a function of the provided name, e.g. BFloat16Add_wrapper.
tvm.datatype.register_op(tvm.datatype.create_lower_func("FloatToBFloat16_wrapper"),
                         "Cast", target, "bfloat", "float")
tvm.datatype.register_op(tvm.datatype.create_lower_func("BFloat16ToFloat_wrapper"),
                         "Cast", target, "float", "bfloat")
tvm.datatype.register_op(tvm.datatype.create_lower_func("BFloat16Add_wrapper"),
                         "Add", target, "bfloat")

# The basic program, but with casts to a custom datatype.
# Note how we specify the custom datatype: we indicate it using the special
# `custom[...]` syntax.
# Additionally, note the "16" after the datatype: this is the bitwidth of the
# custom datatype. This tells TVM that each instance of bfloat is 16 bits wide.
Z = topi.cast(
    topi.cast(X, dtype="custom[bfloat]16") + topi.cast(Y, dtype="custom[bfloat]16"),
    dtype="float32")

# Compile for LLVM (schedule, lower, and build)
schedule = tvm.create_schedule([Z.op])
lowered_func = tvm.lower(schedule, [X, Y, Z])
# Here, we manually lower custom datatypes. Soon, this will be incorporated
# directly into the TVM lower and build process.
lowered_func = tvm.ir_pass.LowerCustomDatatypes(lowered_func, target)
built_program = tvm.build(lowered_func, target=target)

# Finally, create a new array to hold the output and run the program.
z_bfloat = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=context)
built_program(x, y, z_bfloat)
def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (1, wl.out_filter//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) res_conv = vta.top.packed_conv2d( data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride)) res = topi.right_shift(res_conv, 8) res = topi.add(res, bias) res = my_clip(res, 0, 127) res = topi.cast(res, "int8") # To compute number of ops, use a x2 factor for FMA num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter a_shape = (batch_size, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) stride = (wl.hstride, wl.wstride) data_dtype = data.dtype kernel_dtype = kernel.dtype acc_dtype = env.acc_dtype assert wl.hpad == wl.wpad padding = wl.hpad @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") def get_ref_data(): a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) a_np = np.abs(a_np) w_np = np.abs(w_np) b_np = topi.testing.conv2d_nchw_python( a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) return a_np, w_np, b_np def verify(s, check_correctness): mod = vta.build(s, [data, kernel, bias, res], "ext_dev", env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.ext_dev(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") bias_orig = np.abs(bias_orig) data_packed = data_orig.reshape( batch_size//env.BATCH, env.BATCH, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) kernel_packed = kernel_orig.reshape( wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) bias_packed = bias_orig.reshape( 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, ctx) kernel_arr = tvm.nd.array(kernel_packed, ctx) bias_arr = tvm.nd.array(bias_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) res_unpack = res_arr.asnumpy().transpose( (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost def conv_normal(print_ir): print("----- CONV2D End-to-End 
Test-------") with vta.build_config(): s = vta.top.schedule_packed_conv2d([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) cost = verify(s, True) gops = (num_ops / cost.mean) / float(10 ** 9) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) conv_normal(False)
def tanh_split_input_by_val(shape, input_x, symbol):
    """
    split input into two tensors by 0.5

    shape : tensor shape
    input_x : tensor
    symbol : tensor symbol

    return: res, operations, scope
    """
    res = {}
    operation = {}
    scope = {}

    dtype_x = input_x.dtype
    const_zero = tvm.const(0.0, dtype="float16")
    const_0 = tvm.const(0.5, dtype="float16")

    key = "input_abs_" + symbol
    input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key)
    res[key] = input_abs
    operation[key] = "vector_abs"
    scope[key] = cce.scope_ubuf

    # vcmp only support fp16
    if dtype_x == "float32":
        key = "cmp_val_fp16_" + symbol
        cmp_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_abs(*i), "float16"), name=key)
        res[key] = cmp_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_val_fp16_" + symbol
        input_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_x(*i), "float16"), name=key)
        res[key] = input_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_gt_fp16_" + symbol
        input_gt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) > const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_gt_fp16
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_fp16_" + symbol
        input_lt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) <= const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_lt_fp16
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape, lambda *i: topi.cast(input_gt_fp16(*i), "float32"), name=key)
        res[key] = input_gt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape, lambda *i: topi.cast(input_lt_fp16(*i), "float32"), name=key)
        res[key] = input_lt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf
    else:
        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape,
            lambda *i: tvm.select(input_abs(*i) > const_0,
                                  input_x(*i), const_zero),
            name=key)
        res[key] = input_gt
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape,
            lambda *i: tvm.select(input_abs(*i) <= const_0,
                                  input_x(*i), const_zero),
            name=key)
        res[key] = input_lt
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

    return res, operation, scope
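# For clarity, the split above is the elementwise equivalent of masking on |x| > 0.5.
# A NumPy analogue with made-up values:
import numpy as np

x = np.array([-2.0, -0.25, 0.1, 3.0], dtype=np.float32)
input_gt = np.where(np.abs(x) > 0.5, x, 0.0)    # part handled by the "gt" branch
input_lt = np.where(np.abs(x) <= 0.5, x, 0.0)   # part handled by the "le" branch
assert np.array_equal(input_gt + input_lt, x)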
def tanh_compute_high_performance(shape, input_x, symbol):
    """
    the function of tanh

    Parameters
    ----------
    shape : tensor shape
    input_x : tensor
    symbol : tensor symbol

    Returns
    -------
    res, operation, scope
    """
    res = {}
    operation = {}
    scope = {}

    dtype_x = input_x.dtype
    const_one = tvm.const(1, dtype=dtype_x)
    const_neg_two = tvm.const(-2, dtype=dtype_x)
    const_fp32_min = tvm.const(2**(-126), dtype=dtype_x)

    key = "input_abs_" + symbol
    input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key)
    res[key] = input_abs
    operation[key] = "vector_abs"
    scope[key] = cce.scope_ubuf

    key = "power_val_" + symbol
    power_val = tvm.compute(shape, lambda *i: input_abs(*i) * const_neg_two, name=key)
    res[key] = power_val
    operation[key] = "vector_muls"
    scope[key] = cce.scope_ubuf

    if dtype_x == "float32":
        key = "exp_val_fp16_" + symbol
        exp_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(power_val(*i), "float16"), name=key)
        res[key] = exp_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "exp_val_" + symbol
        exp_val = tvm.compute(shape, lambda *i: tvm.exp(exp_val_fp16(*i)), name=key)
        res[key] = exp_val
        operation[key] = "vector_exp"
        scope[key] = cce.scope_ubuf

        key = "exp_val_fp32_" + symbol
        exp_val_fp32 = tvm.compute(
            shape, lambda *i: topi.cast(exp_val(*i), "float32"), name=key)
        res[key] = exp_val_fp32
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        exp_val_true = exp_val_fp32
    else:
        key = "exp_val_" + symbol
        exp_val = tvm.compute(shape, lambda *i: tvm.exp(power_val(*i)), name=key)
        res[key] = exp_val
        operation[key] = "vector_exp"
        scope[key] = cce.scope_ubuf

        exp_val_true = exp_val

    key = "up_val_tmp_" + symbol
    up_val_tmp = tvm.compute(shape, lambda *i: exp_val_true(*i) * input_x(*i), name=key)
    res[key] = up_val_tmp
    operation[key] = "vector_mul"
    scope[key] = cce.scope_ubuf

    key = "up_val_" + symbol
    up_val = tvm.compute(shape, lambda *i: input_x(*i) - up_val_tmp(*i), name=key)
    res[key] = up_val
    operation[key] = "vector_sub"
    scope[key] = cce.scope_ubuf

    key = "input_tmp_" + symbol
    input_tmp = tvm.compute(shape, lambda *i: input_abs(*i) + const_fp32_min, name=key)
    res[key] = input_tmp
    operation[key] = "vector_adds"
    scope[key] = cce.scope_ubuf

    key = "down_val_tmp_" + symbol
    down_val_tmp = tvm.compute(shape, lambda *i: exp_val_true(*i) + const_one, name=key)
    res[key] = down_val_tmp
    operation[key] = "vector_adds"
    scope[key] = cce.scope_ubuf

    key = "down_val_" + symbol
    down_val = tvm.compute(shape, lambda *i: down_val_tmp(*i) * input_tmp(*i), name=key)
    res[key] = down_val
    operation[key] = "vector_mul"
    scope[key] = cce.scope_ubuf

    ub_rec = tvm.compute(shape, lambda *i: const_one / down_val(*i),
                         name="ub_rec_" + symbol)
    res["ub_rec_" + symbol] = ub_rec
    operation["ub_rec_" + symbol] = "vector_rec"
    scope["ub_rec_" + symbol] = cce.scope_ubuf

    iter_num = 1
    tensor_list, scope_list, emit_list = newton_iteration(
        shape, ub_rec, down_val, symbol, iter_num)
    res.update(tensor_list)
    operation.update(emit_list)
    scope.update(scope_list)

    newton_res = tensor_list["tmp_" + symbol + str(iter_num - 1)]

    ub_tanh = tvm.compute(shape, lambda *i: up_val(*i) * newton_res(*i),
                          name="ub_tanh_" + symbol)
    res["ub_tanh_" + symbol] = ub_tanh
    operation["ub_tanh_" + symbol] = "vector_mul"
    scope["ub_tanh_" + symbol] = cce.scope_ubuf

    return res, operation, scope
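# The chain of computes above evaluates tanh through the identity
#   tanh(x) = x * (1 - exp(-2|x|)) / ((1 + exp(-2|x|)) * (|x| + eps)),
# with eps = 2**-126 guarding the division at x = 0 and the reciprocal refined by one
# Newton iteration. A NumPy check of the identity itself (not of the schedule):
import numpy as np

x = np.linspace(-4.0, 4.0, 9).astype(np.float32)
eps = np.float32(2.0 ** -126)
e = np.exp(-2.0 * np.abs(x))
approx = x * (1.0 - e) / ((1.0 + e) * (np.abs(x) + eps))
assert np.allclose(approx, np.tanh(x), atol=1e-6)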
def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True): data_shape = (batch_size, wl.in_filter, wl.height, wl.width) kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) res_conv = topi.nn.conv2d( data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride), dilation=(1, 1), out_dtype="int32") res = topi.right_shift(res_conv, 8) res = my_clip(res, 0, 127) res = topi.cast(res, "int8") # To compute number of ops, use a x2 factor for FMA num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter a_shape = (batch_size, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) stride = (wl.hstride, wl.wstride) data_dtype = data.dtype kernel_dtype = kernel.dtype acc_dtype = env.acc_dtype assert wl.hpad == wl.wpad padding = wl.hpad @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") def get_ref_data(): a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) a_np = np.abs(a_np) w_np = np.abs(w_np) b_np = topi.testing.conv2d_nchw_python( a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) return a_np, w_np, b_np def verify(s, check_correctness): mod = tvm.build(s, [data, kernel, res], target_host=env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.cpu(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_orig, ctx) kernel_arr = tvm.nd.array(kernel_orig, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, res_arr) res_unpack = res_arr.asnumpy() if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost def conv_normal(print_ir): print("----- CONV2D CPU End-to-End Test-------") s = topi.generic.schedule_conv2d_nchw([res]) if print_ir: print(tvm.lower(s, [data, kernel, res], simple_mode=True)) cost = verify(s, True) gops = (num_ops / cost.mean) / float(10 ** 9) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) conv_normal(False)
def run_gemm(env, remote, target, batch_size, in_feat, out_feat, check_correctness=True, print_ir=True, samples=4): # Perform packing only if we are targeting the accelerator if "arm_cpu" in target.keys: data_pack = False elif "vta" in target.keys: data_pack = True # Derive shapes depending upon packing a_shape = (batch_size, in_feat) w_shape = (out_feat, in_feat) if data_pack: data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN, env.BATCH, env.BLOCK_IN) kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) fcompute = vta.top.dense_packed fschedule = vta.top.schedule_dense_packed else: data_shape = a_shape kernel_shape = w_shape fcompute = topi.x86.dense_nopack fschedule = topi.x86.schedule_dense_nopack data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: res = fcompute( data, kernel, None, env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) # Derive number of ops num_ops = 2 * batch_size * in_feat * out_feat # @memoize("vta.tests.test_benchmark_topi.dense.verify") def get_ref_data(): # derive min max for act, wgt types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype) return a_np, w_np, r_np # Data in original format data_np, kernel_np, res_ref = get_ref_data() if data_pack: data_np = data_np.reshape( batch_size//env.BATCH, env.BATCH, in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) kernel_np = kernel_np.reshape( out_feat//env.BLOCK_OUT, env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) # Build if "vta" in target.keys: mod = vta.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="dense") else: mod = tvm.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="dense") temp = util.tempdir() mod.save(temp.relpath("dense.o")) remote.upload(temp.relpath("dense.o")) f = remote.load_module("dense.o") ctx = remote.context(str(target)) res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("dense", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} cost = None if env.TARGET in ["sim", "tsim"]: # Check if we're in local RPC mode (allows us to rebuild the # runtime on the fly when varying the VTA designs) local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: if env.TARGET == "sim": remote.get_function("vta.simulator.profiler_clear")() else: remote.get_function("vta.tsim.profiler_clear")() cost = time_f(data_arr, kernel_arr, res_arr) if env.TARGET == "sim": stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) else: stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) else: simulator.clear_stats() cost = time_f(data_arr, kernel_arr, 
res_arr) stats = simulator.stats() else: cost = time_f(data_arr, kernel_arr, res_arr) # Check correctness correct = False if check_correctness: res_orig = res_arr.asnumpy() if data_pack: res_orig = res_orig.reshape(batch_size, out_feat) res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) res_ref = res_ref.astype(env.out_dtype) correct = np.allclose(res_orig, res_ref) gops = (num_ops / cost.mean) / float(10 ** 9) status = "PASSED" if correct else "FAILED" if "arm_cpu" in target.keys: device = "CPU" elif "vta" in target.keys: device = "VTA" print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats