def test_tensor_comm_reducer():
    """Build a 1-D reduction stage using a user-defined commutative reducer."""
    rows = te.size_var("m")
    cols = te.size_var("n")
    data = te.placeholder((rows, cols), name="A")
    axis_k = te.reduce_axis((0, cols), "k")

    def combine(x, y):
        # Plain addition: equivalent to te.sum, but built by hand.
        return x + y

    def identity(t):
        # Additive identity in the requested dtype.
        return tvm.tir.const(0, dtype=t)

    custom_sum = te.comm_reducer(combine, identity)
    reduced = te.compute((rows,), lambda i: custom_sum(data[i, axis_k], axis=axis_k))
def test_rfactor_argmax():
    """Cross-thread argmax via rfactor: the reduction is split, partially
    reduced by a factored stage, then finished inside one thread block."""

    def fcombine(x, y):
        # Tuple combiner: keep the (index, value) pair whose value is larger.
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        # Identity: invalid index (-1) paired with the smallest value of t1.
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    nn = 1027
    mm = 10
    n = tvm.runtime.convert(nn)
    m = tvm.runtime.convert(mm)
    A0 = te.placeholder((m, n), name="A0", dtype="int32")   # index tensor
    A1 = te.placeholder((m, n), name="A1", dtype="float32")  # value tensor
    k = te.reduce_axis((0, n))
    B0, B1 = te.compute((m, ), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name="B")

    # schedule
    s = te.create_schedule(B0.op)
    nthread = 16
    ko, kf = s[B0].split(k, factor=nthread)
    # rfactor produces a pair of partial-reduction tensors (BF1 is implied
    # by the tuple reduction and not scheduled separately).
    BF0, BF1 = s.rfactor(B0, kf)
    bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread)
    s[B0].bind(bx, te.thread_axis("blockIdx.x"))
    s[B0].bind(ty, te.thread_axis("threadIdx.y"))
    tx = s[B0].op.reduce_axis[0]
    thread_x = te.thread_axis("threadIdx.x")
    s[B0].bind(tx, thread_x)
    s[BF0.op].compute_at(s[B0], tx)
    # Only thread 0 writes the final result of the cross-thread reduction.
    s[B0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(s, args=[A0, A1, B0, B1])
        fargmax = tvm.build(fapi, target=device, name="argmax")

        # Row-constant indices 0..nn-1 so argmax over values maps directly
        # to the numpy reference result.
        np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target("cuda")
    check_target("vulkan")
    check_target("rocm")
def test_warp_reduction2():
    """Reduce (sum, product) tuples with warp threads and validate vs numpy.

    Fix: ``np.product`` was deprecated and removed in NumPy 2.0; use
    ``np.prod`` instead.
    """

    def fcombine(x, y):
        # Tuple reducer: elementwise (add, multiply) over the pair.
        return x[0] + y[0], x[1] * y[1]

    def fidentity(t0, t1):
        # Identities: 0 for the sum slot, 1 for the product slot.
        return tvm.tir.const(0, t0), tvm.tir.const(1, t1)

    add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer")

    # compute
    m = 16
    n = 256
    A0 = te.placeholder((m, n), name="A0", dtype="float32")
    # NOTE(review): "Al" looks like a typo for "A1"; kept as-is because the
    # placeholder name appears in the generated kernel signature.
    A1 = te.placeholder((m, n), name="Al", dtype="float32")
    k = te.reduce_axis((0, n), "k")
    T0, T1 = te.compute(
        (m, ), lambda i: add_mul_reducer((A0[i, k], A1[i, k]), axis=k), name="T"
    )

    nthdx, nthdy = 32, 2
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, nthdx), "threadIdx.x")
    thread_y = te.thread_axis((0, nthdy), "threadIdx.y")

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # schedule: threadIdx.x cooperates on the reduction axis (warp
        # reduction); threadIdx.y / blockIdx.x tile the output rows.
        s = te.create_schedule(T0.op)
        ko, _ = s[T0].split(k, nparts=nthdx)
        xo, xi = s[T0].split(s[T0].op.axis[0], factor=nthdy)
        s[T0].bind(ko, thread_x)
        s[T0].bind(xi, thread_y)
        s[T0].bind(xo, block_x)

        # validation
        dev = tvm.device(device, 0)
        a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype)
        a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype)
        t0_np = np.zeros((m, ), dtype=A0.dtype)
        t1_np = np.zeros((m, ), dtype=A1.dtype)
        a0 = tvm.nd.array(a0_np, dev)
        a1 = tvm.nd.array(a1_np, dev)
        t0 = tvm.nd.array(t0_np, dev)
        t1 = tvm.nd.array(t1_np, dev)
        func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction")
        func(a0, a1, t0, t1)
        t0_np = np.sum(a0_np, axis=1)
        t1_np = np.prod(a1_np, axis=1)
        tvm.testing.assert_allclose(t0.numpy(), t0_np, rtol=1e-3, atol=1e-3)
        tvm.testing.assert_allclose(t1.numpy(), t1_np, rtol=1e-3, atol=1e-3)

    check_target("cuda")
    check_target("rocm")
def common_reduce(name, args=(0,)):
    """Create a commutative reducer that combines elements with the extern
    function ``name``.

    Parameters
    ----------
    name : str
        Name of the pure extern function used to combine two elements.
    args : tuple or list or scalar
        ``args[0]`` is the identity value of the reduction; remaining
        entries are forwarded as extra arguments to the extern call.
        A bare scalar is accepted as shorthand for ``(scalar,)``.

    Returns
    -------
    The reducer created by ``te.comm_reducer``.
    """
    if not isinstance(args, (tuple, list)):
        args = (args,)

    def reduce_op(x, y):
        # Both operands must share a dtype so the extern call is well-typed.
        assert x.dtype == y.dtype, (
            "Reducing elements that don't have same data type: %s v.s. %s"
            % (x.dtype, y.dtype)
        )
        return tir.call_pure_extern(x.dtype, name, x, y, *args[1:])

    return te.comm_reducer(reduce_op, lambda t: tir.const(args[0], dtype=t), name=name)
def f(n):
    """Reduce X over a length-n axis with a max-clamped custom sum reducer."""
    reduce_iv = te.reduce_axis((0, n))

    def make_init(dtype):
        # Identity element: 0 when n > 1, otherwise n itself cast to dtype.
        return tvm.tir.Select(n > 1, tvm.tir.const(0, dtype), n.astype(dtype))

    def combine(x, y):
        # Every partial sum is clamped from below by n (as float32).
        return tvm.te.max(x + y, n.astype("float32"))

    clamped_sum = te.comm_reducer(combine, make_init, name="sum")
    return clamped_sum(X[reduce_iv], axis=reduce_iv)
def test_tensor_reduce_multiout_with_cond():
    """Tuple-output reduction gated by a `where` predicate on the axis."""

    def pairwise_combine(a, b):
        return a[0] + b[0], a[1] + b[1]

    def pairwise_identity(t0, t1):
        return tvm.tir.const(0, t0), tvm.tir.const(1, t1)

    mysum = te.comm_reducer(pairwise_combine, pairwise_identity, name="mysum")
    rows = te.var("m")
    cols = te.var("n")
    idx = te.placeholder((rows, cols), name="idx", dtype="int32")
    val = te.placeholder((rows, cols), name="val", dtype="int32")
    red_k = te.reduce_axis((0, cols), "k")
    # Only even positions along the reduction axis contribute.
    cond = te.floormod(red_k, 2) == 0
    T0, T1 = te.compute(
        (rows,),
        lambda i: mysum((idx[i, red_k], val[i, red_k]), axis=red_k, where=cond),
        name="T",
    )
def test_argmax():
    """Tuple-input argmax reduction on CPU, checked against numpy.

    Modernized: ``tvm.runtime.enabled`` -> ``tvm.testing.device_enabled``,
    ``tvm.context`` -> ``tvm.device`` and ``.asnumpy()`` -> ``.numpy()``,
    matching the current-API version of this test elsewhere in the suite.
    """

    def fcombine(x, y):
        # Keep the (index, value) pair whose value is larger.
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        # Identity: invalid index (-1) paired with the smallest t1 value.
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    m = te.size_var("m")
    n = te.size_var("n")
    idx = te.placeholder((m, n), name="idx", dtype="int32")
    val = te.placeholder((m, n), name="val", dtype="float32")
    k = te.reduce_axis((0, n), "k")
    T0, T1 = te.compute((m, ), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")
    s = te.create_schedule(T0.op)

    def check_target():
        device = "cpu"
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)
        fapi = tvm.lower(s, args=[idx, val, T0, T1])
        fargmax = tvm.build(fapi, target="llvm", name="argmax")

        mm = 12
        nn = 16
        # Row-constant indices so argmax over values matches numpy directly.
        np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype("float32")
        np_res = np.argmax(np_val, axis=1)
        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target()
def test_argmax():
    """Exercise a tuple-input argmax reduction on CPU against numpy."""

    def fcombine(x, y):
        take_first = x[1] >= y[1]
        return (
            tvm.tir.Select(take_first, x[0], y[0]),
            tvm.tir.Select(take_first, x[1], y[1]),
        )

    def fidentity(t0, t1):
        # Invalid index (-1) paired with the smallest representable value.
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    rows = te.size_var("m")
    cols = te.size_var("n")
    idx = te.placeholder((rows, cols), name="idx", dtype="int32")
    val = te.placeholder((rows, cols), name="val", dtype="float32")
    red_k = te.reduce_axis((0, cols), "k")
    T0, T1 = te.compute(
        (rows,), lambda i: argmax((idx[i, red_k], val[i, red_k]), axis=red_k), name="T"
    )
    sched = te.create_schedule(T0.op)

    def check_target():
        device = "cpu"
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)
        fapi = tvm.lower(sched, args=[idx, val, T0, T1])
        fargmax = tvm.build(fapi, target="llvm", name="argmax")

        n_rows, n_cols = 12, 16
        np_idx = np.repeat(
            np.arange(n_cols, dtype="int32").reshape(1, n_cols), n_rows, axis=0
        )
        np_val = np.random.uniform(size=(n_rows, n_cols)).astype("float32")
        np_res = np.argmax(np_val, axis=1)
        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(n_rows, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(n_rows, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target()
def test_argmax():
    """Test argmax."""

    def fcombine(tensor_x, tensor_y):
        # Keep the (index, value) pair whose value is larger.
        lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0])
        rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1])
        return lhs, rhs

    def fidentity(tensor1, tensor2):
        # Identity: invalid index (-1) with the smallest value of tensor2's type.
        return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    size_var_m = te.size_var("m")
    size_var_n = te.size_var("n")
    idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32")
    val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32")
    axis_k = te.reduce_axis((0, size_var_n), "k")
    result_t0, result_t1 = te.compute(
        (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T"
    )
    schedule = te.create_schedule(result_t0.op)

    def check_target():
        device = "cpu"
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)
        fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1])
        fargmax = tvm.build(fapi, target="llvm", name="argmax")

        height = 12
        width = 16
        # Row-constant indices 0..width-1 so the reduced index equals numpy's argmax.
        np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0)
        np_val = np.random.uniform(size=(height, width)).astype("float32")
        np_res = np.argmax(np_val, axis=1)
        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target()
def _pool(i, c, ph, pw):
    """Max-pool one output cell (ph, pw) of channel c for ROI i.

    Closure variables (defined in the enclosing scope): `rois`, `data`,
    `spatial_scale`, `pooled_size_h`, `pooled_size_w`, `height`, `width`,
    `dtype`.  Assumes each ROI row is laid out as
    (batch_index, start_w, start_h, end_w, end_h) — TODO confirm against caller.
    """
    roi = rois[i]
    batch_index = roi[0].astype("int32")
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
    # Scale ROI coordinates from input space to feature-map space.
    roi_start_h = te.round(roi_start_h * spatial_scale).astype("int32")
    roi_start_w = te.round(roi_start_w * spatial_scale).astype("int32")
    roi_end_h = te.round(roi_end_h * spatial_scale).astype("int32")
    roi_end_w = te.round(roi_end_w * spatial_scale).astype("int32")

    # force malformed ROIs to be 1x1
    roi_h = tvm.te.max(roi_end_h - roi_start_h + 1, tvm.tir.const(1, "int32"))
    roi_w = tvm.te.max(roi_end_w - roi_start_w + 1, tvm.tir.const(1, "int32"))

    # Size of one pooling bin in feature-map coordinates.
    bin_h = roi_h.astype(dtype) / pooled_size_h
    bin_w = roi_w.astype(dtype) / pooled_size_w

    # use epsilon to prevent floating point precision loss in floor/ceil
    epsilon = tvm.tir.const(0.00001, dtype)
    hstart = te.floor(ph * bin_h + epsilon).astype("int32")
    wstart = te.floor(pw * bin_w + epsilon).astype("int32")
    hend = te.ceil((ph + 1) * bin_h - epsilon).astype("int32")
    wend = te.ceil((pw + 1) * bin_w - epsilon).astype("int32")
    # Clamp the bin to the feature map bounds.
    hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height)
    wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width)
    hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height)
    wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width)

    non_empty = tvm.tir.all(hstart < hend, wstart < wend)
    # Identity of the max: dtype-min for a non-empty bin, 0 for an empty one,
    # so empty bins pool to 0 rather than -inf.
    min_value = lambda dtype: tvm.tir.if_then_else(
        non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype))
    # pylint: disable=unnecessary-lambda
    _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y), min_value, name="max")
    rh = te.reduce_axis((0, hend - hstart), "rh")
    rw = te.reduce_axis((0, wend - wstart), "rw")
    return _max(data[batch_index, c, hstart + rh, wstart + rw], axis=[rh, rw])
def te_argmax_val_idx():
    """Build an argmax over (value, index) tuples; returns [val, idx, max_val, max_idx]."""

    def f_combine(left, right):
        keep_left = left[0] >= right[0]
        return (
            tvm.tir.Select(keep_left, left[0], right[0]),
            tvm.tir.Select(keep_left, left[1], right[1]),
        )

    def f_identity(dtype0: tvm.DataType, dtype1: tvm.DataType):
        # Smallest value paired with an invalid index.
        return tvm.te.min_value(dtype0), tvm.tir.const(-1, dtype1)

    argmax = te.comm_reducer(f_combine, f_identity, name="argmax")
    rows = te.var("m")
    cols = te.var("n")
    val = te.placeholder((rows, cols), name="val", dtype="float32")
    idx = te.placeholder((rows, cols), name="idx", dtype="int32")
    red = te.reduce_axis((0, cols), "k")
    max_val, max_idx = te.compute(
        (rows,), lambda i: argmax((val[i, red], idx[i, red]), axis=red), name="argmax"
    )
    return [val, idx, max_val, max_idx]
def test_inline_multi_reduce():
    """Inline an intermediate stage that feeds a tuple (argmax) reduction."""

    def select_larger(a, b):
        winner = a[1] >= b[1]
        return tvm.tir.Select(winner, a[0], b[0]), tvm.tir.Select(winner, a[1], b[1])

    def larger_identity(idx_typ, val_typ):
        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)

    argmax = te.comm_reducer(select_larger, larger_identity, name="argmax")
    rows = te.var("m")
    cols = te.var("n")
    val = te.placeholder((rows, cols), name="val", dtype="float32")
    val1 = te.compute((rows, cols), lambda i, j: val[i, j] + 1, name="val1")
    val2 = te.compute((rows, cols), lambda i, j: te.exp(val1[i, j]), name="val2")
    red = te.reduce_axis((0, cols), "k")
    # The reduce-axis variable itself supplies the index operand of the tuple.
    T_idx, T_val = te.compute(
        (rows,), lambda i: argmax((red.var, val2[i, red]), axis=red), name="T"
    )
    sched = te.create_schedule(T_idx.op)
    sched[val1].compute_inline()
    sched = sched.normalize()
    bounds = tvm.te.schedule.InferBound(sched)
    stmt = tvm.te.schedule.ScheduleOps(sched, bounds)
def test_tensor_comm_reducer_overload():
    """A comm_reducer is also callable on plain scalar expressions."""
    lhs_var = te.size_var("m")
    rhs_var = te.size_var("n")

    def combine(x, y):
        return x + y

    def identity(t):
        return tvm.tir.const(0, dtype=t)

    mysum = te.comm_reducer(combine, identity)
    # Scalar overload path: no reduce axis involved.
    sum_res = mysum(lhs_var, rhs_var)
def test_warp_reduction2():
    """Test warp reductions.

    Fix: ``np.product`` was deprecated and removed in NumPy 2.0; use
    ``np.prod`` instead.
    """

    def fcombine(tensor1, tensor2):
        # Tuple reducer: elementwise (add, multiply) over the pair.
        return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1]

    def fidentity(tensor1, tensor2):
        # Identities: 0 for the sum slot, 1 for the product slot.
        return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2)

    add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer")

    # compute
    num_m = 16
    num_n = 256
    placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32")
    # NOTE(review): "Al" looks like a typo for "A1"; kept as-is because the
    # placeholder name appears in the generated kernel signature.
    placeholder_a1 = te.placeholder((num_m, num_n), name="Al", dtype="float32")
    axis_k = te.reduce_axis((0, num_n), "k")
    result0, result1 = te.compute(
        (num_m,),
        lambda i: add_mul_reducer(
            (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k
        ),
        name="T",
    )

    nthdx, nthdy = 32, 2
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, nthdx), "threadIdx.x")
    thread_y = te.thread_axis((0, nthdy), "threadIdx.y")

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # schedule: threadIdx.x cooperates on the reduction axis (warp
        # reduction); threadIdx.y / blockIdx.x tile the output rows.
        schedule = te.create_schedule(result0.op)
        axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx)
        axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy)
        schedule[result0].bind(axis_ko, thread_x)
        schedule[result0].bind(axis_xi, thread_y)
        schedule[result0].bind(axis_xo, block_x)

        # validation
        dev = tvm.device(device, 0)
        a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype)
        a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype)
        t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype)
        t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype)
        buff_a0 = tvm.nd.array(a0_np, dev)
        buff_a1 = tvm.nd.array(a1_np, dev)
        buff_t0 = tvm.nd.array(t0_np, dev)
        buff_t1 = tvm.nd.array(t1_np, dev)
        func = tvm.build(
            schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction"
        )
        func(buff_a0, buff_a1, buff_t0, buff_t1)
        t0_np = np.sum(a0_np, axis=1)
        t1_np = np.prod(a1_np, axis=1)
        tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3)
        tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3)

    check_target("cuda")
    check_target("rocm")
def test_basic_operation():
    """Gradient checks for elementary te operations via check_grad."""
    np.random.seed(0)
    shape = (10, 10)
    x = te.var("x", dtype='float32')
    k = te.reduce_axis((0, 10), name="k")
    l = te.reduce_axis((0, 10), name="l")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    # Identity and simple arithmetic.
    B = te.compute(shape, lambda i, j: A0[i, j], name='B')
    check_grad(B, [A0])

    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name='B')
    check_grad(B, A0)

    # Piecewise-constant ops: gradient is zero everywhere it exists.
    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    # Smooth nonlinearities.
    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(
        shape, lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0)

    # sqrt needs positive inputs; restrict the sampled data range.
    B = te.compute(shape, lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0, data_range=(0.1, 10))

    B = te.compute(shape, lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), name='B')
    check_grad(B, A0, data_range=(-4, 4))

    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name='B')
    check_grad(B, A0)

    # Reductions over axis k.
    B = te.compute((10, ), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), name='B')
    check_grad(B, [A0, A1])

    # Reduction with a clipped index inside the body.
    B = te.compute(shape, lambda i, j: te.sum(
        A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k), name='B')
    check_grad(B, A0)

    # Custom (product) commutative reducer.
    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    B = te.compute((10, 10), lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), name='B')
    check_grad(B, A0)

    # Gradient through a topi op (tensordot of two derived tensors).
    X = te.placeholder((10, ), name='X')
    A = te.compute((10, ), lambda i: X[i] + X[9 - i])
    B = te.compute((10, ), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
# Tuple-valued compute: one stage producing two output tensors.
# (A0 and A1 are placeholders defined earlier in this script.)
B0, B1 = te.compute((m, n), lambda i, j: (A0[i,j]+2, A1[i,j]*3), name='B')
s = te.create_schedule(B0.op)
print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))

# x and y are the operands of reduction; each of them is a tuple of index
# and value.
def fcombine(x, y):
    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
    return lhs, rhs

# our identity element also needs to be a tuple, so `fidentity` accepts
# two types as inputs.
def fidentity(t0, t1):
    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

# describe the reduction computation
m = te.var("m")
n = te.var("n")
idx = te.placeholder((m, n), name="idx", dtype="int32")
val = te.placeholder((m, n), name="val", dtype="int32")
k = te.reduce_axis((0, n), "k")
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")

# the generated IR code would be:
s = te.create_schedule(T0.op)
print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))
print(tvm.lower(s, [Input, Filter, Output], simple_mode=True)) ###################################################################### # .. _general-reduction: # # Define General Commutative Reduction Operation # ---------------------------------------------- # Besides the built-in reduction operations like :any:`te.sum`, # :any:`tvm.te.min` and :any:`tvm.te.max`, you can also define your # commutative reduction operation by :any:`te.comm_reducer`. # n = te.var("n") m = te.var("m") product = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="product") A = te.placeholder((n, m), name="A") k = te.reduce_axis((0, m), name="k") B = te.compute((n, ), lambda i: product(A[i, k], axis=k), name="B") ###################################################################### # .. note:: # # Sometimes we would like to perform reduction that involves multiple # values like :code:`argmax`, which can be done by tuple inputs. # See :ref:`reduction-with-tuple-inputs` for more detail. ###################################################################### # Summary # -------
def test_rfactor_argmax():
    """Test rfactor argmax"""

    def fcombine(tensor0, tensor1):
        # Keep the (index, value) pair whose value is larger.
        lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0])
        rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1])
        return lhs, rhs

    def fidentity(tensor0, tensor1):
        # Identity: invalid index (-1) with the smallest value of tensor1's type.
        return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    num_width = 1027
    num_height = 10
    width = tvm.runtime.convert(num_width)
    height = tvm.runtime.convert(num_height)
    placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32")
    placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32")
    axis_k = te.reduce_axis((0, width))
    result_b0, result_b1 = te.compute(
        (height,),
        lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k),
        name="B",
    )

    # schedule
    schedule = te.create_schedule(result_b0.op)
    nthread = 16
    _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread)
    # rfactor splits the reduction into a per-thread partial stage.
    rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf)
    axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread)
    schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x"))
    schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y"))
    axis_tx = schedule[result_b0].op.reduce_axis[0]
    thread_x = te.thread_axis("threadIdx.x")
    schedule[result_b0].bind(axis_tx, thread_x)
    schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx)
    # Only thread 0 stores the final cross-thread result.
    schedule[result_b0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1])
        fargmax = tvm.build(fapi, target=device, name="argmax")

        # Row-constant indices so the reduced index matches numpy's argmax.
        np_idx = np.repeat(
            np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0
        )
        np_val = np.random.uniform(size=(num_height, num_width)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target("cuda")
    check_target("vulkan")
    check_target("rocm")
def measure_bandwidth_sum(
    total_item,
    item_per_thread,
    stride,
    base_type,
    bits,
    lanes,
    target,
    target_host,
    remote,
    dev,
    n_times,
):
    """measure memory bandwidth of gpu by product reduction for a given type

    The IR for measurement is

    for each thread
        for i in 1..num_per_thread:
            y[global_id] = y[global_id] * x[base + i * stride]

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of elements each thread accumulates
    stride: int
        stride in memory access
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    remote: tvm.rpc.RPCSession
        remote rpc session
    dev: Device
        the device of array
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GBPS: float
        gigabyte per second
    """
    target, target_host = Target.check_and_update_host_consist(target, target_host)

    n, m = total_item, item_per_thread
    n //= lanes

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    k = te.reduce_axis((0, m), name="k")

    x = te.placeholder((n,), dtype=dtype, name="x")
    # NOTE(review): this reducer multiplies (product reduction, matching the
    # docstring) even though it is labeled name="sum" in the generated IR.
    op = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="sum")
    # Each output element folds m strided loads, so every byte of x is read once.
    y = te.compute(
        (n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)
    )
    s = te.create_schedule(y.op)

    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
    s[y].bind(yo, te.thread_axis("blockIdx.x"))
    s[y].bind(yi, te.thread_axis("threadIdx.x"))
    s[y].unroll(k)

    try:
        func = tvm.build(s, [x, y], target)

        x = tvm.nd.empty((n,), dtype=dtype, device=dev)
        y = tvm.nd.empty((n // m,), dtype=dtype, device=dev)

        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
        time = time_f(x, y).mean
    except tvm._ffi.base.TVMError:
        # build error (occur when device does not support half)
        return -1

    # Bandwidth = bytes actually transferred / elapsed seconds, in GB/s.
    return 1.0 * (total_item * bits / 8) / 1e9 / time