import numpy as np

import tvm
import tvm.testing
from tvm import te, topi
from tvm.contrib.nvcc import have_fp16
from tvm.contrib.pickle_memoize import memoize


def verify_elemwise_sum(num_args, dtype):
    shape = (3, 5, 4)
    tvm_placeholders = []
    for i in range(num_args):
        tvm_placeholders.append(te.placeholder(shape, name="data" + str(i), dtype=dtype))
    esum = topi.elemwise_sum(tvm_placeholders)
    s = te.create_schedule([esum.op])

    @memoize("topi.tests.test_topi_elemwise_sum")
    def get_ref_data():
        np_nd = [np.random.uniform(0, 10, size=shape).astype(dtype) for i in range(num_args)]
        return np_nd

    np_nd = get_ref_data()

    def check_device(device):
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        dev = tvm.device(device, 0)
        out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev)
        f = tvm.build(s, tvm_placeholders + [esum], device, name="elemwise_sum")
        tvm_nd = [tvm.nd.array(nd, dev) for nd in np_nd] + [out]
        f(*tvm_nd)
        # Reference result: sum the inputs along the stacking axis.
        np_out = np.sum(np.array(np_nd), axis=0)
        tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)

    for device in ["llvm"]:
        check_device(device)
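# A minimal driver for the verifier above; a sketch, assuming a pytest-style
# entry point. The argument counts and dtypes are illustrative choices, not
# taken from the original suite.
def test_elemwise_sum():
    verify_elemwise_sum(1, "float32")  # degenerate single-input sum
    verify_elemwise_sum(5, "float32")  # several inputs
    verify_elemwise_sum(4, "int32")    # non-float dtype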
def check(device, dtype, m=32, n=32):
    if not tvm.testing.device_enabled(device):
        print("Skipping", device)
        return
    dev = tvm.device(device, 0)
    if dtype == "float16" and not have_fp16(dev.compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # Declare e = (a + b) + (a * b), then reduce it to a scalar.
    a = te.placeholder((m, n), name="a", dtype=dtype)
    b = te.placeholder((m, n), name="b", dtype=dtype)
    c = a + b
    d = a * b
    e = topi.elemwise_sum([c, d])
    g = topi.sum(e)
    with tvm.target.Target(device):
        sg = topi.cuda.schedule_reduce(g)
        func = tvm.build(sg, [a, b, g], device)

    # Compare against the numpy reference.
    a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
    b_np = np.random.uniform(size=(m, n)).astype(b.dtype)
    g_np = np.sum(np.add(a_np * b_np, a_np + b_np))
    a_nd = tvm.nd.array(a_np, dev)
    b_nd = tvm.nd.array(b_np, dev)
    g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
    func(a_nd, b_nd, g_nd)
    tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3)
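# A minimal driver for check() above; a sketch, assuming a pytest-style entry
# point. Since the schedule comes from topi.cuda, only a CUDA target is
# exercised here; the dtype grid is an illustrative choice, not from the
# original file.
def test_fused_reduce():
    for dtype in ["float32", "float16"]:
        check("cuda", dtype)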
c = a + b  # same as topi.broadcast_add
d = a * b  # same as topi.broadcast_mul

######################################################################
# Overloaded with the same syntax, TOPI handles broadcasting a primitive
# (`int`, `float`) to a tensor, as in :code:`d - 3.14`.

######################################################################
# Generic schedules and fusing operations
# ---------------------------------------
# Up to now, we have seen an example of how TOPI can save us from writing
# explicit computations in lower level API. But it doesn't stop here: we still
# did the scheduling as before. TOPI also provides higher level scheduling
# recipes depending on a given context. For example, for CUDA, we can schedule
# the following series of operations ending with :code:`topi.sum` using only
# :code:`topi.cuda.schedule_reduce`.
#
e = topi.elemwise_sum([c, d])
f = e / 2.0
g = topi.sum(f)
with tvm.target.cuda():
    sg = topi.cuda.schedule_reduce(g)
    print(tvm.lower(sg, [a, b], simple_mode=True))

######################################################################
# As you can see, the scheduled stages of computation have been accumulated,
# and we can examine them by
#
print(sg.stages)

######################################################################
# We can test the correctness by comparing with the :code:`numpy` result as
# follows.
#
func = tvm.build(sg, [a, b, g], "cuda")
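######################################################################
# A sketch of that comparison, assuming :code:`a` and :code:`b` were declared
# earlier with a concrete shape :code:`(x, y)` and that a CUDA device is
# available; the tolerance is an illustrative choice.
#
dev = tvm.cuda(0)
a_np = np.random.uniform(size=(x, y)).astype(a.dtype)
b_np = np.random.uniform(size=(x, y)).astype(b.dtype)
# Reference result: f = ((a + b) + (a * b)) / 2, reduced to a scalar.
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, dev)
b_nd = tvm.nd.array(b_np, dev)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
func(a_nd, b_nd, g_nd)
tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)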