# Shared imports for the snippets below (legacy TVM API, where topi ships as a
# standalone module).
import numpy as np
import tvm
import topi


def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B, C], device,
                        name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy,
                                   rtol=1E-4, atol=1E-4)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
# Variant of verify_broadcast_binary_ele that probes device availability via
# the returned context (ctx.exist) instead of tvm.module.enabled, and also
# exercises the Vulkan backend.
def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)
        foo = tvm.build(s, [A, B, C], device,
                        name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy,
                                   rtol=1E-4, atol=1E-4)

    check_device("vulkan")
    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
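# A minimal driver for the checker above. The shapes and op choices are
# illustrative (not taken from the original test suite); the only requirement
# is that rhs_shape broadcasts against lhs_shape under numpy rules. Backends
# that are not enabled in the local build are skipped inside check_device.
if __name__ == "__main__":
    verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add")
    verify_broadcast_binary_ele((5, 64, 128), (1, 64, 1), typ="mul")
    verify_broadcast_binary_ele((2, 3, 32), (32,), typ="div")
    verify_broadcast_binary_ele((3, 4, 5), (4, 5), typ="maximum")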
# Recipe-style variant that builds directly for CUDA and tags the run through
# the module-level TASK label. Note: this version targets the newer tvm.te
# namespace, where topi is imported as `from tvm import topi` and tvm.testing
# needs an explicit `import tvm.testing`, rather than the standalone topi
# import used above.
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = "bcast_binary_" + typ + "_lhs" +\
        "_".join([str(ele) for ele in lhs_shape]) +\
        "rhs" + "_".join([str(ele) for ele in rhs_shape])
    A = tvm.te.placeholder(shape=lhs_shape, name="A")
    B = tvm.te.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda",
                      name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)

    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
# Older-API version of the same recipe test: tvm.placeholder instead of
# tvm.te.placeholder, and numpy's assert_allclose for the final check.
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = "bcast_binary_" + typ + "_lhs" +\
        "_".join([str(ele) for ele in lhs_shape]) +\
        "rhs" + "_".join([str(ele) for ele in rhs_shape])
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda",
                      name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)

    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
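# A minimal sketch of how the recipe test above can be driven. TASK must exist
# at module scope because the function assigns to it via `global`; the full
# recipe scripts presumably use it to label their output, but here it is only
# initialized so the assignment is valid. Shapes are illustrative, and running
# this requires a CUDA-capable GPU since the kernel is built for "cuda".
TASK = "bcast_binary"

if __name__ == "__main__":
    test_broadcast_binary_op((5, 2, 3), (2, 1), typ="add")
    test_broadcast_binary_op((1024, 128), (128,), typ="sub")
    test_broadcast_binary_op((4, 256, 64), (1, 256, 1), typ="maximum")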