def run(dtype):
    """Build an element-wise ``te.fmod`` kernel and check it on GPU targets.

    A 1-D compute ``C[i] = fmod(A[i], B[i])`` over a symbolic length is
    scheduled once; ``check_device`` then builds it for one target string,
    runs it on random data and compares the output with NumPy.

    Parameters
    ----------
    dtype : str
        Element type used for both input placeholders.
    """
    size_var_n = te.size_var("n")
    placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype)
    placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype)
    result_c = te.compute(
        placeholder_a.shape,
        lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)),
        name="C",
    )
    schedule = te.create_schedule(result_c.op)

    # Split the only axis so the two resulting axes can be bound to the
    # block / thread tags below.
    num_thread = 8
    axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread)

    def check_device(device):
        """Build and verify the kernel for ``device``; skip if not enabled."""
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)

        target = tvm.target.Target(device)
        if "cpu" not in target.keys:
            schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x"))
            schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x"))
        fmod = tvm.build(
            schedule,
            [placeholder_a, placeholder_b, result_c],
            device,
            name="myfmod",
        )

        # Launch the kernel on random inputs drawn from [0, 256).
        value_n = 1024
        a_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_a.dtype)
        b_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_b.dtype)

        # "Fix" the values in a and b to avoid the result being too small:
        # divisors below 2 are raised by 2, and any dividend whose remainder
        # has magnitude below 1 is bumped by 1.
        b_np += (b_np < 2.0) * 2
        a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1

        buff_a = tvm.nd.array(a_np, dev)
        buff_b = tvm.nd.array(b_np, dev)
        buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev)

        # The kernel is invoked through the time evaluator; the timing result
        # itself is not used.
        ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1)
        ftimer(buff_a, buff_b, buff_c)

        # Compare against np.fmod, the same operation used when preparing
        # the inputs above, rather than np.mod (floored modulo). The two only
        # coincide because every operand here is positive.
        np.testing.assert_allclose(
            buff_c.numpy(),
            np.fmod(buff_a.numpy(), buff_b.numpy()),
            rtol=1e-5,
        )

    check_device("cuda")
    check_device("opencl -device=intel_graphics")
    check_device("metal")
def run(dtype):
    """Build an element-wise ``te.fmod`` kernel and check it on GPU targets.

    Declares ``C[i] = fmod(A[i], B[i])`` over a symbolic 1-D shape, creates a
    single schedule for it, and then builds, runs and verifies that schedule
    for each target string passed to ``check_device``.

    Parameters
    ----------
    dtype : str
        Element type used for the ``A`` and ``B`` placeholders.
    """
    n = te.size_var("n")
    A = te.placeholder((n,), name="A", dtype=dtype)
    B = te.placeholder((n,), name="B", dtype=dtype)
    C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name="C")
    s = te.create_schedule(C.op)

    # Split the only axis so the two resulting axes can be bound to the
    # block / thread tags below.
    num_thread = 8
    bx, tx = s[C].split(C.op.axis[0], factor=num_thread)

    def check_device(device):
        """Build and verify the kernel for ``device``; skip if not enabled."""
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        # tvm.device is the current spelling of the legacy tvm.context.
        dev = tvm.device(device, 0)

        target = tvm.target.Target(device)
        if "cpu" not in target.keys:
            s[C].bind(bx, te.thread_axis("blockIdx.x"))
            s[C].bind(tx, te.thread_axis("threadIdx.x"))
        fmod = tvm.build(s, [A, B, C], device, name="myfmod")

        # Launch the kernel on random inputs drawn from [0, 256). The concrete
        # length gets its own name so it does not shadow the symbolic ``n``.
        size = 1024
        a_np = (np.random.uniform(size=size) * 256).astype(A.dtype)
        b_np = (np.random.uniform(size=size) * 256).astype(B.dtype)

        # "Fix" the values in a and b to avoid the result being too small:
        # divisors below 2 are raised by 2, and any dividend whose remainder
        # has magnitude below 1 is bumped by 1.
        b_np += (b_np < 2.0) * 2
        a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1

        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev)

        # The kernel is invoked through the time evaluator; the timing result
        # itself is not used.
        ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1)
        ftimer(a, b, c)

        # Compare against np.fmod, the same operation used when preparing
        # the inputs above, rather than np.mod (floored modulo). The two only
        # coincide because every operand here is positive.
        np.testing.assert_allclose(
            c.numpy(),
            np.fmod(a.numpy(), b.numpy()),
            rtol=1e-5,
        )

    check_device("cuda")
    check_device("opencl -device=intel_graphics")
    check_device("metal")