def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M, N, K, dtype): if dtype == "float16": pytest.xfail("float16 is not supported.") x = te.placeholder((x_batch, M, K), name="x") y = te.placeholder((y_batch, N, K), name="y") def get_ref_data(): a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) c_np = tvm.topi.testing.batch_matmul(a_np, b_np) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() target_hexagon = tvm.target.hexagon("v68") with tvm.target.Target(target_hexagon): fcompute = topi.nn.batch_matmul fschedule = topi.hexagon.schedule_batch_matmul out = fcompute(x, y) s = fschedule([out]) out_shape = out.shape func = tvm.build( s, [x, y, out], tvm.target.Target(target_hexagon, host=target_hexagon), name="batch_matmul", ) mod = hexagon_session.load_module(func) dev = hexagon_session.device a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev) mod["batch_matmul"](a, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
def check_target(target, dev): print("Running on target: %s" % target) fcompute, fschedule = tvm.topi.testing.dispatch( target, _conv3d_ncdhw_implement) with tvm.target.Target(target): C = fcompute(A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4, atol=1e-6)
def test_softmax(target, dev, shape, dtype, ref_data, softmax_operation): target = tvm.target.Target(target) if target.kind.name == "vulkan" and dtype == "float64": # https://www.khronos.org/registry/SPIR-V/specs/1.0/GLSL.std.450.html pytest.xfail("Vulkan GLSL.std.450 does not support 64-bit floats") A = te.placeholder(shape, dtype=dtype, name="A") topi_op = configs[softmax_operation]["topi"] B = topi_op(A, axis=1) with tvm.target.Target(target): fschedule = tvm.topi.testing.dispatch(target, _softmax_schedule) s = fschedule(B) a_np, b_np = ref_data a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): b = topi.vision.rcnn.roi_pool_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale) s_func = tvm.topi.testing.dispatch(device, _roi_pool_schedule) s = s_func(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) f = tvm.build(s, [a, rois, b], device) f(tvm_a, tvm_rois, tvm_b) tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-4)
def check_device(device): ctx = tvm.context(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv3d_ndhwc_tensorcore_implement ) C = fcompute(A, W, stride, padding, dilation, "float32") if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def conv1d_strategy(attrs, inputs, out_type, target): """conv1d generic strategy""" logger.warning("conv1d is not optimized for this platform.") layout = attrs.data_layout dilation = get_const_tuple(attrs.dilation) if dilation[0] < 1: raise ValueError("dilation should be a positive value") strategy = _op.OpStrategy() if layout == "NCW": strategy.add_implementation( wrap_compute_conv1d(topi.nn.conv1d_ncw), wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), name="conv1d_ncw.generic", ) elif layout == "NWC": strategy.add_implementation( wrap_compute_conv1d(topi.nn.conv1d_nwc), wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), name="conv1d_nwc.generic", ) else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy
def check_device(target, dev): print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( target, _batch_matmul_implement) out = fcompute(x, y) if not dynamic: s = fschedule([out]) out_shape = out.shape else: s = te.create_schedule(out.op) out_shape = (batch_size, M, N) if debug: print(tvm.lower(s, [x, y, out], simple_mode=True)) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev) f = tvm.build(s, [x, y, out], target, name="dense") f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def verify_relu(m, n, dtype="float32"): A = te.placeholder((m, n), name="A", dtype=dtype) B = topi.nn.relu(A) a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) def check_target(target, dev): if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because %s does not have fp16 support" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): s = tvm.topi.testing.get_elemwise_schedule(target)(B) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) foo = tvm.build(s, [A, B], target, name="relu") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for target, dev in tvm.testing.enabled_targets(): check_target(target, dev)
def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NHWC", out_dtype="int32") s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def resize1d_shape_func(attrs, inputs, _): """ Shape function for resize2d op. """ layout = attrs.layout width_axis = channel_axis = 1 for i, letter in enumerate(layout): if letter == "N": batch_axis = i if letter == "W": width_axis = i if letter == "C": channel_axis = i size = get_const_tuple(attrs.size) return [ _resize1d_shape_func( inputs[0], convert(size), convert(batch_axis), convert(width_axis), convert(channel_axis), ) ]
def test_batch_matmul_int8(self, hexagon_session: Session, x_batch, y_batch, M, N, K): dtype = "int8" out_dtype = "int8" assert x_batch == y_batch or x_batch == 1 or y_batch == 1 x = te.placeholder((x_batch, M, K), name="x", dtype=dtype) y = te.placeholder((y_batch, N, K), name="y", dtype=dtype) def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype) b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype) c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype=out_dtype) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() target_hexagon = tvm.target.hexagon("v68") with tvm.target.Target(target_hexagon): fcompute = topi.nn.batch_matmul fschedule = topi.hexagon.schedule_batch_matmul out = fcompute(x, y) s = fschedule([out]) func = tvm.build( s, [x, y, out], tvm.target.Target(target_hexagon, host=target_hexagon), name="batch_matmul_int8", ) mod = hexagon_session.load_module(func) dev = hexagon_session.device a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev) mod["batch_matmul_int8"](a, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
def test_dense(hexagon_session, batch_size, in_dim, out_dim, use_bias, in_dtype, out_dtype, dense_ref_data): if in_dtype == "float16": pytest.xfail("float16 is not supported.") if "int" in in_dtype: tol = {"atol": 0, "rtol": 0} elif in_dtype == "float32": tol = {"rtol": 1e-5, "atol": 1e-5} A = te.placeholder((batch_size, in_dim), name="A", dtype=in_dtype) B = te.placeholder((out_dim, in_dim), name="B", dtype=in_dtype) C = te.placeholder((out_dim, ), name="C", dtype=out_dtype) a_np, b_np, c_np, d_np = dense_ref_data fcompute = topi.nn.dense fschedule = topi.hexagon.schedule_dense target_hexagon = tvm.target.hexagon("v68") with tvm.target.Target(target_hexagon): D = fcompute(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = fschedule([D]) func = tvm.build(s, [A, B, C, D], tvm.target.Target(target_hexagon, host=target_hexagon), name="dense") mod = hexagon_session.load_module(func) dev = hexagon_session.device a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(c_np, dev) d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) mod["dense"](a, b, c, d) tvm.testing.assert_allclose(d.numpy(), d_np, **tol)
def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( target, _group_conv2d_nchw_implement) C = fcompute(A, W, stride, padding, dtype, output_padding, groups) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], target, name="group_conv2d_transpose_%d_%d_%s_%d_%s_%s_%s_%s_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding, groups, ), ) func(a, w, c) c = c.numpy() for measurement, reference in zip(c, c_np): tvm.testing.assert_allclose(measurement, reference, rtol=1e-5)
def split_shape_func(attrs, inputs, _): """ Shape function for split op. """ if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)): indices_or_sections = get_const_int(attrs.indices_or_sections) assert indices_or_sections > 0, "Slice count must be > 0" else: indices_or_sections = list(get_const_tuple(attrs.indices_or_sections)) assert sorted(indices_or_sections)[0] > 0 and indices_or_sections == sorted( indices_or_sections ), "split_indices must be sorted" axis = get_const_int(attrs.axis) if axis < 0: axis += get_const_int(inputs[0].shape[0]) num_out = ( indices_or_sections if isinstance(indices_or_sections, int) else len(indices_or_sections) + 1 ) param_is_indices = isinstance(indices_or_sections, int) if param_is_indices: indices_or_sections = [indices_or_sections] return [ _split_shape_func( inputs[0], convert(i), convert(indices_or_sections), convert(param_is_indices), convert(axis), ) for i in range(num_out) ]
def verify_reorg(batch, in_size, in_channel, stride): """Verify reorg operator by comparing outputs from tvm and numpy implementation""" in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A") B = topi.vision.reorg(A, stride) a_shape = get_const_tuple(A.shape) dtype = A.dtype def get_ref_data_reorg(): a_np = np.random.uniform(size=a_shape).astype(dtype) b_np = tvm.topi.testing.reorg_python(a_np, stride) return a_np, b_np a_np, b_np = get_ref_data_reorg() def check_device(device): """Cheching devices is enabled or not""" ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): s_func = tvm.topi.testing.dispatch(device, _reorg_schedule) s = s_func([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ["llvm", "cuda"]: check_device(device)
def crop_and_resize_func(attrs, inputs, _): """ Shape function for crop_and_resize op. """ layout = attrs.layout height_axis = width_axis = channel_axis = 1 for i, letter in enumerate(layout): if letter == "H": height_axis = i if letter == "W": width_axis = i if letter == "C": channel_axis = i crop_size = get_const_tuple(attrs.crop_size) return [ _crop_and_resize_func( inputs[0], inputs[1], convert(crop_size), convert(height_axis), convert(width_axis), convert(channel_axis), ) ]
def squeeze_shape_func(attrs, inputs, _): """ Shape function for squeeze op. """ axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis) keep_axes = [] remove_axes = [] if axis is not None: for i in range(inputs[0].shape[0].value): if i not in axis: keep_axes.append(i) else: remove_axes.append(i) # Due to current relay type system, it is possible even # a static kernel function needs shape function. To handle # this case, we allow axis to be None in squeeze shape func # for now. # TODO(kevinthesun): Enhance relay type system to avoid this. if keep_axes: out = _squeeze_shape_func(inputs[0], convert(keep_axes), convert(remove_axes)) else: out = te.compute((), lambda *indices: 0) return [out]
def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement) b = fcompute( a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio, mode=mode, ) s = fschedule(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) f = tvm.build(s, [a, rois, b], device) f(tvm_a, tvm_rois, tvm_b) tvm_val = tvm_b.asnumpy() tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4)
def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skipping %s becuase it is not enabled" % device) print("Running on target: %s" % device) with tvm.target.Target(device): C = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NCHW", out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)
def verify_conv3d_ndhwc( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, devices="cuda", ): """Test the conv3d with tensorcore for ndhwc layout""" pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d( padding, (kernel, kernel, kernel)) padding_sum = pad_front + pad_top + pad_left + pad_back + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_depth = in_height = in_width = in_size A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name="A") W = te.placeholder((kernel, kernel, kernel, in_channel, num_filter), name="W") bias = te.placeholder((1, 1, 1, 1, num_filter), name="bias") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv3d_ndhwc.verify_conv3d_ndhwc") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv3d_ndhwc_python(a_np, dw_np, stride, padding) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv3d_ndhwc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, "float32") if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol) check_device(devices)
def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True, add_relu=False): """verify function of pool_grad""" iw = ih kw = kh sw = sh pt, pl, pb, pr = padding A = te.placeholder((n, ic, ih, iw), name="A") B = topi.nn.pool2d( A, kernel=[kh, kw], stride=[sh, sw], dilation=[1, 1], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad, ) dtype = A.dtype bshape = get_const_tuple(B.shape) ashape = get_const_tuple(A.shape) if ceil_mode: assert bshape[2] == int( math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1) assert bshape[3] == int( math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1) else: assert bshape[2] == int( math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1) assert bshape[3] == int( math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1) OutGrad = te.placeholder(bshape, name="OutGrad") PoolGrad = topi.nn.pool_grad( OutGrad, A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad, ) if add_relu: PoolGrad = topi.nn.relu(PoolGrad) a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype) out_grad_np = np.random.uniform(low=0.001, size=bshape).astype(dtype) pool_grad_np = tvm.topi.testing.pool_grad_nchw( a_np, out_grad_np, pool_size=(kh, kw), strides=(sh, sw), padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad, ) if add_relu: pool_grad_np = np.maximum(pool_grad_np, 0.0) def check_target(target, dev): print("Running on target: %s" % target) with tvm.target.Target(target): s_func = tvm.topi.testing.dispatch(target, _pool_grad_schedule) s = s_func(PoolGrad) a = tvm.nd.array(a_np, dev) out_grad = tvm.nd.array(out_grad_np, dev) pool_grad = tvm.nd.array( np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), dev) f = tvm.build(s, [A, OutGrad, PoolGrad], target) f(a, out_grad, pool_grad) tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5) for target, dev in tvm.testing.enabled_targets(): check_target(target, dev)
def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype="uint32", out_dtype="int16", unipolar=True): """Bitserial dense implementation. TODO: Why are these separate Parameters ---------- data : tvm.te.Tensor 2-D with shape [batch, in_dim] weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] or 3-D with shape [out_dim, weight_bits, in_dim] Returns ------- output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) if len(weight.shape) == 2: weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) else: weight_packed = weight Y, DB, K = get_const_tuple(data_packed.shape) X, WB, _ = get_const_tuple(weight_packed.shape) ######## Search space x, y = cfg.axis(X), cfg.axis(Y) db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K) ko, ki = cfg.define_split("tile_k", k, num_outputs=2) yo, yi = cfg.define_split("tile_y", y, num_outputs=2) xo, xi = cfg.define_split("tile_x", x, num_outputs=2) cfg.define_reorder( "reorder_0", [yo, xo, ko, yi, wb, db, ki, xi], policy="candidate", candidate=[[yo, xo, ko, yi, wb, db, ki, xi], [yo, xo, yi, ko, wb, db, ki, xi]], ) cfg.define_annotate("ann_reduce", [db, wb], policy="try_unroll") cfg.define_annotate("ann_spatial", [yi, xi], policy="try_unroll_vec") ###### Compute rule VX = cfg["tile_x"].size[-1] wvshape = (X // VX, WB, VX, K) oshape = (Y, X) k = te.reduce_axis((0, K), name="k") db = te.reduce_axis((0, DB), name="db") wb = te.reduce_axis((0, WB), name="wb") # Tile data and weights weight_vec = te.compute( wvshape, lambda xo, wb, vx, k: weight_packed[xo * VX + vx][wb][k], name="weight_vec") idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod matmul_unipolar = te.compute( oshape, lambda i, j: te.sum( (tvm.tir.popcount(weight_vec[idxdiv( j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - tvm. tir.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] )).astype(out_dtype) << (db + wb).astype(out_dtype), axis=[wb, db, k], ), tag="bitserial_dense_unipolar", ) matmul = te.compute( oshape, lambda i, j: te.sum( tvm.tir.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[ i, db, k]).astype(out_dtype) << (db + wb).astype(out_dtype), axis=[wb, db, k], ), tag="bitserial_dense", ) # binary ops cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype)) if unipolar: return matmul_unipolar return matmul
def conv2d_grad(orig, grad): """Gradient of conv2d""" attrs = orig.attrs data, weight = orig.args data_shape = get_const_tuple(data.checked_type.shape) weight_shape = get_const_tuple(weight.checked_type.shape) _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape) batch, in_channel, in_h, in_w = data_shape out_channel, _, filter_h, filter_w = weight_shape # infer output_padding fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( get_const_tuple(attrs.padding), (filter_h, filter_w)) stride_h, stride_w = get_const_tuple(attrs.strides) dilation_h, dilation_w = get_const_tuple(attrs.dilation) out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w output_padding = (in_h - out_h, in_w - out_w) assert attrs.data_layout == "NCHW", "only support NCHW data layout" assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout" assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout" backward_data = _nn.conv2d_transpose( grad, weight, strides=attrs.strides, padding=attrs.padding, dilation=attrs.dilation, groups=attrs.groups, output_padding=output_padding, ) grad = tile(grad, [1, in_channel // attrs.groups, 1, 1]) grad = reshape(grad, [-1, 1, 0, 0]) # batch * oc * ic // groups, 1, oh, ow data = reshape(data, [1, -1, 0, 0]) # 1, batch * ic, ih, iw backward_weight = _nn.conv2d( data, grad, strides=attrs.dilation, padding=attrs.padding, dilation=attrs.strides, groups=in_channel * batch, ) # infer shape of backward_weight padded_weight_grad_h = (in_h - (grad_h - 1) * stride_h - 1 + fpad_top + fpad_bottom) // dilation_h + 1 padded_weight_grad_w = (in_w - (grad_w - 1) * stride_w - 1 + fpad_left + fpad_right) // dilation_w + 1 backward_weight = reshape( backward_weight, [ batch, in_channel // attrs.groups, out_channel, padded_weight_grad_h, padded_weight_grad_w, ], ) backward_weight = _sum(backward_weight, axis=0) backward_weight = transpose(backward_weight, [1, 0, 2, 3]) assert padded_weight_grad_h >= filter_h assert padded_weight_grad_w >= filter_w if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w: backward_weight = strided_slice( backward_weight, begin=[0, 0, 0, 0], end=[out_channel, in_channel // attrs.groups, filter_h, filter_w], ) return [backward_data, backward_weight]
def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True): """verify function of pool""" iw = ih kw = kh sw = sh pt, pl, pb, pr = padding layout = "NCHW" A = te.placeholder((n, ic, ih, iw), name="A") B = topi.nn.pool( A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad, ) B = topi.nn.relu(B) dtype = A.dtype bshape = get_const_tuple(B.shape) ashape = get_const_tuple(A.shape) if ceil_mode: assert bshape[2] == int( math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1) assert bshape[3] == int( math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1) else: assert bshape[2] == int( math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1) assert bshape[3] == int( math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1) a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype) pad_np = np.zeros(shape=(n, ic, ih + pt + pb, iw + pl + pr)).astype(dtype) no_zero = (range(n), range(ic), (range(pt, ih + pt)), (range(pl, iw + pl))) pad_np[np.ix_(*no_zero)] = a_np _, oc, oh, ow = get_const_tuple(B.shape) b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype) if pool_type == "avg": for i in range(oh): for j in range(ow): if count_include_pad: b_np[:, :, i, j] = np.mean(pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw], axis=(2, 3)) else: pad_count = np.sum(pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw] > 0, axis=(2, 3)) b_np[:, :, i, j] = np.sum( pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw], axis=(2, 3)) / np.maximum(pad_count, 1) elif pool_type == "max": for i in range(oh): for j in range(ow): b_np[:, :, i, j] = np.max(pad_np[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw], axis=(2, 3)) b_np = np.maximum(b_np, 0.0) def check_device(device, ctx): print("Running on target: %s" % device) with tvm.target.Target(device): s_func = tvm.topi.testing.dispatch(device, _pool_schedule) s = s_func(B, layout) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=2e-5, atol=1e-5) for device, ctx in tvm.testing.enabled_targets(): check_device(device, ctx)
def verify_group_conv2d_nchw( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False, ): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A") W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( target, _group_conv2d_nchw_implement) C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) for target in ["llvm", "cuda"]: check_target(target)
def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
def verify_group_conv2d_NCHWc_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False, ): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8") W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name="W", dtype="int8") bias = te.placeholder( (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8" ) def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype) # convert to NCHWc _, _, out_height, out_width = c_np.shape c_np = c_np.reshape( (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)).transpose(0, 1, 3, 4, 2) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, ), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) for target in ["cuda"]: check_target(target)
def conv2d_winograd_nhwc_auto_scheduler_test(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1): tile_size = 4 inputs = te.placeholder((N, H, W, CI), name="inputs") N, H, W, CI = get_const_tuple(inputs.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" KH = KW = kernel_size HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride assert HSTR == 1 and WSTR == 1 and KH == KW data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, _ = winograd_transform_matrices(m, r, "float32") H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW kshape = (alpha, alpha, CI, CO) kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") idxdiv = te.indexdiv idxmod = te.indexmod # pack input tile input_tile = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod( idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu][ci], name="input_tile", ) # transform data r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") data_pack = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name="data_pack", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"] }, ) # do batch gemm ci = te.reduce_axis((0, CI), name="ci") bgemm = te.compute( (alpha, alpha, P, CO), lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[ eps][nu][ci][co], axis=[ci]), name="bgemm", ) # inverse transform r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") inverse = te.compute( (m, m, P, CO), lambda vh, vw, p, co: te.sum( bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name="inverse", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"] }, ) # output output = te.compute( (N, H, W, CO), lambda n, h, w, co: inverse[idxmod(h, m), idxmod(w, m), n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), co], name="conv2d_winograd", ) return [inputs, kernel_pack, output]
def verify_deformable_conv2d_nchw( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, deformable_groups=1, groups=1, ): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % ( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, deformable_groups, groups, )) A = te.placeholder((batch, in_channel, in_size, in_size), name="A") out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1 Offset = te.placeholder( (batch, deformable_groups * kernel * kernel * 2, out_size, out_size), name="offset") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") a_shape = get_const_tuple(A.shape) offset_shape = get_const_tuple(Offset.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize( "topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nchw" ) def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) offset_np = np.random.randn(*offset_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np = tvm.topi.testing.deformable_conv2d_nchw_python( a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups) return a_np, offset_np, w_np, c_np a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) fcompute, fschedule = tvm.topi.testing.dispatch( device, _deformable_conv2d_nchw_implement) with tvm.target.Target(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) a = tvm.nd.array(a_np, dev) offset = tvm.nd.array(offset_np, dev) w = tvm.nd.array(w_np, dev) c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) for device in ["llvm", "cuda"]: check_device(device)
def test_util(): x = tvm.tir.const(100, "int32") assert utils.get_const_int(x) == 100 assert utils.get_const_tuple((x, x)) == (100, 100)