def test_llvm_persist_parallel(): n = 128 A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=8) xo1, xo2 = s[C].split(xo, nparts=1) s[B].compute_at(s[C], xo1) s[B].parallel(s[B].op.axis[0]) s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish") s[C].parallel(xi) s[C].pragma(xo1, "parallel_launch_point") s[C].pragma(xi, "parallel_stride_pattern") def check_llvm(): if not tvm.module.enabled("llvm"): return # BUILD and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), np.sqrt(a.asnumpy() + 1) * 2 + 2, rtol=1e-5) check_llvm()
def test_llvm_persist_parallel(): n = 128 A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=8) xo1, xo2 = s[C].split(xo, nparts=1) s[B].compute_at(s[C], xo1) s[B].parallel(s[B].op.axis[0]) s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish") s[C].parallel(xi) s[C].pragma(xo1, "parallel_launch_point") s[C].pragma(xi, "parallel_stride_pattern") def check_llvm(): if not tvm.module.enabled("llvm"): return # BUILD and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), np.sqrt(a.asnumpy() + 1) * 2 + 2, rtol=1e-5) check_llvm()
def sqrt(x): """Take square root of input x. Parameters ---------- x : tvm.Tensor Input argument. Returns ------- y : tvm.Tensor The result. """ return tvm.compute(x.shape, lambda *i: tvm.sqrt(x(*i)))
def sqrt(x): """Take square root of input x. Parameters ---------- x : tvm.Tensor Input argument. Returns ------- y : tvm.Tensor The result. """ return tvm.compute(x.shape, lambda *i: tvm.sqrt(x(*i)))
def sort_ir(data, index, output, axis, is_descend): """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. Parameters ---------- data: Buffer 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. index : Buffer Buffer of number of valid number of boxes. output : Buffer Output buffer of indicies of sorted tensor. axis : int The axis used for sorting. is_descend : bool If the sorted data is in descending order. Returns ------- stmt : Stmt The result IR statement. """ max_threads = int( tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib = tvm.ir_builder.create() p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) p_out = ib.buffer_ptr(output) ndim = len(data.shape) assert data.dtype == "float32", "Currently only supports input dtype to be float32" assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim axis_mul_before = 1 axis_mul_after = 1 if axis < 0: axis = ndim + axis for i in range(0, ndim): if i < axis: axis_mul_before *= data.shape[i] elif i > axis: axis_mul_after *= data.shape[i] dshape = 0 for i in range(0, len(index.shape)): dshape += index.shape[i] dshape = tvm.select(dshape > axis_mul_before * axis_mul_after, dshape, axis_mul_before * axis_mul_after) sizes_temp = ib.allocate("int32", dshape, name="sizes_temp", scope="global") sizes = ib.allocate("int32", dshape, name="sizes", scope="global") temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local") temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local") data_new = ib.allocate("float32", dshape, name="data_new", scope="global") index_new = ib.allocate("int32", dshape, name="index_new", scope="global") nthread_tx = max_threads nthread_bx = dshape // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx with ib.if_scope(tid < axis_mul_before * axis_mul_after): sizes[tid] = p_index[tid] sizes_temp[tid] = p_index[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \ .astype("float32"))) + 1, name="k") as k: with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0): with ib.if_scope(k % 2 == 0): sizes[tid] += sizes_temp[tid - (tvm.const(1, "int32") << k)] sizes_temp[tid] = sizes[tid] with ib.else_scope(): sizes_temp[tid] += sizes[tid - (tvm.const(1, "int32") << k)] sizes[tid] = sizes_temp[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): i = tid / axis_mul_after j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j with ib.for_range(0, current_sort_num, name="k") as k: full_idx = base_idx + k * axis_mul_after with ib.if_scope(tid == 0): start = 0 with ib.else_scope(): start = sizes[tid - 1] index_new[start + k] = k data_new[start + k] = p_data[full_idx] with ib.if_scope(tid < axis_mul_before * axis_mul_after): with ib.if_scope(tid == 0): start = 0 with ib.else_scope(): start = sizes[tid - 1] # OddEvenTransposeSort with ib.for_range(0, p_index[tid], name="k") as k: with ib.for_range(0, p_index[tid] - 1, name="i") as i: with ib.if_scope(i % 2 == (k & 1)): with ib.if_scope( ((data_new[i + start] < data_new[i + start + 1]) ^ is_descend) == False): temp_data[tid] = data_new[i + start] data_new[i + start] = data_new[i + start + 1] data_new[i + start + 1] = temp_data[tid] temp_index[tid] = index_new[i + start] index_new[i + start] = index_new[i + start + 1] index_new[i + start + 1] = temp_index[tid] with ib.if_scope(tid < axis_mul_before * axis_mul_after): i = tid / axis_mul_after j = tid % axis_mul_after current_sort_num = p_index[tid] base_idx = i * data.shape[axis] * axis_mul_after + j with ib.for_range(0, data.shape[axis], name="k") as k: with ib.if_scope(tid == 0): start = 0 with ib.else_scope(): start = sizes[tid - 1] p_out[base_idx + k * axis_mul_after] = tvm.select( k < current_sort_num, index_new[k + start], k) body = ib.get() return body
def BatchNorm(device="llvm", lib_path="./", ndim=None, dtype=None, optype=False, axis=None): ''' batchnorm Args: device: lib_path: ndim: dtype: optype: axis: Returns: ''' if axis >= ndim: return shape = [tvm.var("n" + str(i)) for i in range(ndim)] channel = shape[axis] eps = tvm.var("epsilon", dtype="float32") opname = optype + ("_ndim%d_%s_axis%d" % (ndim, dtype, axis)) print(opname) # define compute in_tensor = tvm.placeholder(shape, dtype=dtype, name='in_tensor') mean = tvm.placeholder((channel, ), dtype=dtype, name='mean') variance = tvm.placeholder((channel, ), dtype=dtype, name='var') scale = tvm.placeholder((channel, ), dtype=dtype, name='scale') offset = tvm.placeholder((channel, ), dtype=dtype, name='offset') variance_sqrt = tvm.compute( (channel, ), lambda i: tvm.sqrt(variance[i] + eps.astype(dtype))) if optype == "TFBatchNorm": out_tensor = tvm.compute(shape, lambda *idx: ((in_tensor[idx] - mean[idx[axis]]) / variance_sqrt[idx[axis]]) *\ scale[idx[axis]] + offset[idx[axis]]) tensor_list = [ eps, in_tensor, scale, offset, mean, variance, out_tensor ] elif optype == "CaffeBatchNorm": out_tensor = tvm.compute( shape, lambda *idx: (in_tensor[idx] - mean[idx[axis]]) / variance_sqrt[idx[axis]]) tensor_list = [eps, in_tensor, mean, variance, out_tensor] elif optype == "CaffeScale": out_tensor = tvm.compute( shape, lambda *idx: in_tensor[idx] * scale[idx[axis]] + offset[idx[axis]]) tensor_list = [in_tensor, scale, offset, out_tensor] elif optype == "TFBiasAdd": out_tensor = tvm.compute( shape, lambda *idx: in_tensor[idx] + offset[idx[axis]]) tensor_list = [in_tensor, offset, out_tensor] else: raise RuntimeError("no support for {}".format(optype)) # define schedule & generate lib s = tvm.create_schedule(out_tensor.op) Genlib(s, tensor_list, device, opname, lib_path)