def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else((x * 4 + y) >= 466036,
                           tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528),
                                            (((((x*4) + y) - 466036) % 24528) - 24512) % 16,
                                            x), y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)),
                         (((x*4) + y) - 4) % 16,
                         x), y)
    ck.verify(res, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)

def get_valid_counts_scan(data, partial_in, partial):
    """Low level IR to do scan.

    Parameters
    ----------
    data : Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    partial_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    partial_in = ib.buffer_ptr(partial_in)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    var = tvm.make.node("FloatImm", dtype="float32", value=2)
    new_range = num_anchors // elem_per_thread + 1
    iteration = log(cast(new_range, "float32")) // math.log(2)
    # Scan: Kogge-Stone adder
    with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
        with ib.for_range(0, iteration) as k:
            with ib.if_scope(k == 0):
                with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] = \
                        partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
                with ib.else_scope():
                    partial[bx * new_range] = partial_in[bx * new_range]
            with ib.else_scope():
                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"),
                                         tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] += \
                        partial[bx * new_range + tx - cast(power(var, k), "int32")]
    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                          tvm.convert(['shared']),
                          tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()

def test_basic():
    a = tvm.var("a")
    b = tvm.var("b")
    c = tvm.var("c")
    m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6, a - 1 > 0), [a])
    assert tvm.ir_pass.Simplify(m[1] - (b * 6 - 1)).value == 0
    assert m[0].value == 2
    m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6, a - 1 > 0), [a, b])
    assert len(m) == 0
    m = tvm.arith.DetectClipBound(tvm.all(a + 10 * c <= 20, b - 1 > 0), [a, b])
    assert tvm.ir_pass.Simplify(m[1] - (20 - 10 * c)).value == 0
    assert tvm.ir_pass.Simplify(m[2] - 2).value == 0

def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute((m + 2,), lambda i:
                       tvm.select(tvm.all(i >= 1, i <= m), A[i - 1], 0.0), "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)
        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)

def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
    """Non-maximum suppression.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box
        should be removed.

    nms_threshold : float
        Non-maximum suppression threshold.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes."""
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0)
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0)
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0) - i
        return i / u

    batch, num_bbox = get_const_tuple(out_buf.shape)
    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(sorted_bbox_buf)
    p_out = ib.buffer_ptr(out_buf)
    nthread_tx = max_threads
    nthread_bx = num_bbox // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    i = bx * max_threads + tx
    with ib.for_range(0, batch, for_type="unroll", name="n") as b:
        base_idx = b * num_bbox
        with ib.if_scope(i < num_bbox):
            p_out[base_idx + i] = False
        with ib.for_range(0, num_bbox - 1) as l:
            with ib.if_scope(tvm.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
                iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
                with ib.if_scope(iou > nms_threshold):
                    p_out[base_idx + i] = True
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()

def select_array(i, j):
    now = tvm.const(0.0, dtype)
    for ii in range(row):
        for jj in range(col):
            now = tvm.expr.Select(tvm.all(i % row == ii, j % col == jj),
                                  tvm.const(matrix[ii][jj], dtype),
                                  now)
    return now

def get_valid_counts_upsweep(data, idx_in, idx, partial):
    """Low level IR of the first step of scan: upsweep.

    Parameters
    ----------
    data : Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    idx_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    data = ib.buffer_ptr(data)
    idx_in = ib.buffer_ptr(idx_in)
    idx = ib.buffer_ptr(idx)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    new_range = num_anchors // elem_per_thread + 1
    # Scan: Upsweep
    with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)):
        with ib.for_range(0, elem_per_thread) as i:
            with ib.if_scope(bx * num_anchors +
                             tx * elem_per_thread + i < batch_size * num_anchors):
                with ib.if_scope(i == 0):
                    partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread]
                    idx[bx * num_anchors + tx * elem_per_thread] = \
                        idx_in[bx * num_anchors + tx * elem_per_thread]
                with ib.else_scope():
                    partial[bx * new_range + tx] += \
                        idx_in[bx * num_anchors + tx * elem_per_thread + i]
                    idx[bx * num_anchors + tx * elem_per_thread + i] = \
                        idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \
                        idx_in[bx * num_anchors + tx * elem_per_thread + i]
    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                          tvm.convert(['shared']),
                          tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()

def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Copy output after applying nms to contiguous memory.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box
        should be removed.

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in
        format of [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
    nthread_tx = batch
    tx = tvm.thread_axis("threadIdx.x")
    ib = tvm.ir_builder.create()
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    i = ib.allocate('int32', (1,), 'i', scope='local')
    i[0] = 0
    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
    p_remove = ib.buffer_ptr(remove_mask_buf)
    p_out = ib.buffer_ptr(out_buf)
    b = tx

    nkeep = ib.allocate('int32', (1,), 'nkeep', scope='local')
    nkeep[0] = 0  # number of bbox after nms

    with ib.for_range(0, num_bbox) as j:
        with ib.if_scope(p_remove[b * num_bbox + j] == False):
            nkeep[0] += 1
    with ib.if_scope(nkeep[0] > 0):
        with ib.for_range(0, tvm.ceil(
                tvm.const(rpn_post_nms_top_n, 'float32') / nkeep[0]).astype('int32')):
            with ib.for_range(0, num_bbox) as j:
                offset_j = (b * num_bbox + j) * 5
                offset_i = (b * rpn_post_nms_top_n + i[0]) * 5
                with ib.if_scope(tvm.all(i[0] < rpn_post_nms_top_n,
                                         p_remove[(b * num_bbox + j)] == False)):
                    p_out[offset_i] = tvm.expr.Cast('float32', b)
                    with ib.for_range(0, 4, for_type='unroll') as k:
                        p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                    i[0] = i[0] + 1

    body = ib.get()
    return body

def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,), lambda i: tvm.select(
        tvm.all(i > 0, i < 65), A[i - 1], tvm.const(0.)), name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert (isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse))

def test_all():
    x = tvm.var('x')
    y = tvm.var('y')
    z = tvm.var('z')
    try:
        t = x and x
        assert False
    except ValueError:
        pass
    try:
        tvm.all()
        assert False
    except ValueError:
        pass
    assert str(tvm.all(x < y)) == '(%s < %s)' % (x.name, y.name)
    assert str(tvm.all(x < y, x > z)) == '((%s < %s) && (%s > %s))' % (
        x.name, y.name, x.name, z.name)
    assert str(tvm.all(x < y, y > z + 1, x < z * 2)) == \
        '(((%s < %s) && (%s > (%s + 1))) && (%s < (%s*2)))' % (
            x.name, y.name, y.name, z.name, x.name, z.name)

def _dilate(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if not equal_const_int(strides[i], 1):
            index_tuple.append(indices[i] // strides[i])
            not_zero.append((indices[i] % strides[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
    return data(*index_tuple)

def argsort_ir(data_buf, out_index_buf):
    """Batched odd-even transposition sort.

    Parameters
    ----------
    data_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]

    out_index_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Indices of data in sorted order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox = get_const_tuple(data_buf.shape)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data_buf)
    index_out = ib.buffer_ptr(out_index_buf)
    nthread_tx = max_threads
    nthread_bx = (num_bbox + 1) // 2 // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("vthread")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "virtual_thread", nthread_bx)
    tid = bx * nthread_tx + tx
    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")

    with ib.for_range(0, batch, for_type="unroll") as b:
        start = b * num_bbox
        for i in range(2):
            bbox_id = tid * 2 + i
            with ib.if_scope(bbox_id < num_bbox):
                index_out[start + bbox_id] = bbox_id
        with ib.for_range(0, num_bbox) as k:
            offset = start + 2 * tid + (k % 2)
            with ib.if_scope(
                    tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])):
                temp_data[0] = p_data[offset]
                p_data[offset] = p_data[offset + 1]
                p_data[offset + 1] = temp_data[0]
                temp_index[0] = index_out[offset]
                index_out[offset] = index_out[offset + 1]
                index_out[offset + 1] = temp_index[0]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()

def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
    """declare the Im2Col method for conv2d"""
    _, CI, IH, IW = [x.value for x in data.shape]
    CO, _, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    OH = (IH + 2*HPAD - KH) // HSTR + 1
    OW = (IW + 2*WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 and WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    ALIGN = 16
    def upround(x, align):
        return (x + align - 1) // align * align

    # A [CO, CI * KH * KW]
    reduce_len = upround(CI * KH * KW, ALIGN)
    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')

    # B [CI * KH * KW, N * OH * OW]
    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:
                    tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
                               data_pad[j // (OH*OW)][i // (KH*KW)]
                               [j // OW % OH*HSTR + i // KW % KH][j % OW*WSTR + i % KW],
                               tvm.const(0, data_pad.dtype)), name='B')

    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]

    # C [CO, N * OH * OW]
    k = tvm.reduce_axis((0, gemm_l), name='k')
    C = tvm.compute((gemm_n, gemm_m), lambda i, j:
                    tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    # output
    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
    # otherwise the alignment above will be eliminated by bound inference
    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:
                         C[co][n * OW * OW + h * OW + w] +
                         tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
                         name='output', tag='im2col_conv_output')

    return output

def compute_temp(k, p, eps, nu):
    temp_expr = {}
    for j in range(4):
        t0 = M[0][j][k][p] + M[1][j][k][p]
        t1 = M[1][j][k][p] - M[2][j][k][p]
        temp_expr[(0, j)] = t0 + M[2][j][k][p]
        temp_expr[(1, j)] = t1 - M[3][j][k][p]
    now = tvm.const(0.0, "float32")
    for ii in range(2):
        for jj in range(4):
            now = tvm.select(tvm.all(eps == ii, nu == jj),
                             temp_expr[(ii, jj)],
                             now)
    return now

def _pad(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
            index_tuple.append(indices[i])
        else:
            index_tuple.append(indices[i] - pad_before[i])
            not_zero.append(indices[i] >= pad_before[i])
            not_zero.append(indices[i] < data.shape[i] + pad_before[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
    return data(*index_tuple)

def test_cmp_load_store():
    n = 32
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C')
    D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), A(*i) > 1), name="D")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        s = tvm.create_schedule(D.op)
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        xo1, xo2 = s[C].split(xo, factor=13)
        s[C].parallel(xo2)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B, D], "llvm")
        ctx = tvm.cpu(0)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(),
            np.logical_and(a.asnumpy() > b.asnumpy(), a.asnumpy() > 1))

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        s = tvm.create_schedule(D.op)
        for stage in [C, D]:
            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
            s[stage].bind(xo, tvm.thread_axis("blockIdx.x"))
            s[stage].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B, D], device)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(),
            np.logical_and(a.asnumpy() > b.asnumpy(), a.asnumpy() > 1))

    check_llvm()
    for device in ["vulkan", "opencl", "cuda", "rocm", "metal"]:
        check_device(device)

def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    tdiv = tvm.truncdiv
    tmod = tvm.truncmod
    # simplification that takes condition into account.
    res = tvm.if_then_else(
        (x * 4 + y) >= 466036,
        tvm.if_then_else(24512 <= tmod(((x * 4) + y) - 466036, 24528),
                         tmod(tmod(((x * 4) + y) - 466036, 24528) - 24512, 16),
                         x),
        y)
    res2 = tvm.if_then_else(
        (x * 4) >= 466036 - y,
        tvm.if_then_else(24512 <= tmod(((x * 4) + y) - 466036, 24528),
                         tmod(tmod(((x * 4) + y) - 466036, 24528) - 24512, 16),
                         x),
        y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, tmod(((x * 4) + y) - 4, 24528)),
                         tmod(((x * 4) + y) - 4, 16),
                         x),
        y)
    ck.verify(res, expected)
    ck.verify(res2, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 100, 3), tmod(x + 100, 3))
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 1, 3), tmod(x + 100, 3))
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(tdiv(x, 3) > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(tdiv(x, 3) < 2, x, 0), 0)
    ck.verify(res, 0)

def dropout2d_compute(input, data, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype
    batch, species = input.shape
    output_data = lambda on, os: tvm.max(
        tvm.expr.Select(
            tvm.all(data[on, os] > 0.5),
            input[on, os].astype(out_dtype), 0.0),
        #(input[on, os].astype(out_dtype), relay.const(0.0)),
        axis=[])
    return tvm.compute((batch, species), output_data, tag="dropout2d")

def _dilate(*indices):
    not_zero = []
    index_tuple = []
    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod
    for i in range(n):
        if not util.equal_const_int(strides[i], 1):
            index_tuple.append(idxdiv(indices[i], strides[i]))
            not_zero.append(idxmod(indices[i], strides[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
    return data(*index_tuple)

def lrn_sqr_nchw(data, size, axis):
    out_dtype = data.dtype
    radius = size // 2
    batch, in_channel, in_height, in_width = data.shape
    ls = tvm.reduce_axis((0, size), name='ls')
    sqr_out = lambda on, oc, oh, ow: tvm.sum(
        tvm.expr.Select(
            tvm.all(oc >= radius, oc < (in_channel + radius)),
            data[on, oc - radius + ls, oh, ow].astype(out_dtype) *
            data[on, oc - radius + ls, oh, ow].astype(out_dtype),
            0.0),
        axis=[ls])
    return tvm.compute((batch, in_channel, in_height, in_width), sqr_out,
                       tag="lrn_sqrt_op")

def test_const_fold3():
    def check_throws(f):
        try:
            f()
        except tvm.TVMError:
            pass
        else:
            raise AssertionError("Should have raised an exception but didn't.")

    # Test that using ints with logic operations is forbidden
    x = tvm.var("x")
    for val in [0, 1]:
        for func in [tvm.all, tvm.any]:
            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
            check_throws(lambda: func(x, tvm.const(val, 'uint1')))

    # Test const folding when both arguments are const
    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b),
                              (tvm.any, lambda a, b: a or b)]:
        for v1 in [0, 1]:
            for v2 in [0, 1]:
                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
                                         tvm.const(py_func(v1, v2), 'uint1'))

    x = tvm.var("x", 'uint1')
    true = tvm.const(1, 'uint1')
    false = tvm.const(0, 'uint1')

    assert tvm.all(x, true).same_as(x)
    assert tvm.all(true, x).same_as(x)
    assert tvm.any(x, false).same_as(x)
    assert tvm.any(false, x).same_as(x)

    assert tvm.all(x, false).same_as(false)
    assert tvm.all(false, x).same_as(false)
    assert tvm.any(x, true).same_as(true)
    assert tvm.any(true, x).same_as(true)

def relu4d_compute(input, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype
    batch, in_channel, in_height, in_width = input.shape
    output_data = lambda on, oc, oh, ow: tvm.max(
        tvm.expr.Select(tvm.all(input[on, oc, oh, ow] > 0),
                        input[on, oc, oh, ow].astype(out_dtype), 0.0),
        #(input[on, oc, oh, ow].astype(out_dtype), relay.const(0.0)),
        axis=[])
    return tvm.compute((batch, in_channel, in_height, in_width), output_data,
                       tag="relu4D")

def dilate_kernel(*indices):
    # This function is the same as topi.nn.dilate, but inlined
    not_zero = []
    index_tuple = []
    for i in range(len(dilate_args)):
        if not topi.util.equal_const_int(dilate_args[i], 1):
            index_tuple.append(indices[i] // dilate_args[i])
            not_zero.append((indices[i] % dilate_args[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.select(not_zero, kernel(*index_tuple), tvm.const(0.0, data.dtype))
    return kernel(*index_tuple)

def gaussian_blur2d(M, N, k, dtype="float32"):
    A = tvm.placeholder((M, N), dtype=dtype, name="A")
    pad = k // 2
    number = k * k
    Apad = tvm.compute(
        (M + 2 * pad, N + 2 * pad),
        lambda i, j: tvm.if_then_else(
            tvm.all(i >= pad, i < M + pad, j >= pad, j < N + pad),
            A[i - pad, j - pad], 0.0),
        name="Apad")
    rx = tvm.reduce_axis((0, k), name="rx")
    ry = tvm.reduce_axis((0, k), name="ry")
    B = tvm.compute(
        (M, N),
        lambda i, j: tvm.sum(Apad[i + rx, j + ry] / number, axis=[rx, ry]),
        name="B")
    return B.op, [A, B]

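# Hedged usage sketch for gaussian_blur2d above (not part of the original source).
# It assumes the same legacy TVM API used throughout these snippets
# (tvm.create_schedule / tvm.lower) and only prints the lowered IR of the
# padded box-blur compute; _demo_gaussian_blur2d is a hypothetical helper name.
def _demo_gaussian_blur2d():
    op, bufs = gaussian_blur2d(256, 256, 3)
    s = tvm.create_schedule(op)
    print(tvm.lower(s, bufs, simple_mode=True))
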
def poolingb(Image, Index, POutput):
    """
    reverse 2*2 max pooling, revised

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Index : tvm.tensor.Tensor
        specifies where Output[i,j,k,l] is from; this follows the convention of
        Numpy and PyTorch. You will need this tensor to compute the gradient.
        ------------------------------------------
        For example, if Image is of shape [1, 4, 4, 1] (batch 1 and channel 1),
        then the slice Image[0, :, :, 0] is
            [[0.7243, 0.3236, 0.0124, 0.4314],
             [0.4104, 0.3997, 0.4534, 0.1791],
             [0.0973, 0.2673, 0.6907, 0.9207],
             [0.9268, 0.6590, 0.0312, 0.2364]]
        and Index is of shape [1, 2, 2, 1] and the slice Index[0, :, :, 0] is
            [[ 0,  6],
             [12, 11]]
        because
            0  = 0 * 4 + 0   (0, 0)
            6  = 1 * 4 + 2   (1, 2)
            12 = 3 * 4 + 0   (3, 0)
            11 = 2 * 4 + 3   (2, 3)
        --------------------------------------------
        4-D with shape [batch_size, out_height, out_width, in_channels]
    POutput : tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, in_channels]

    Returns
    -------
    PImage : tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    """
    _, _, W, _ = Image.shape
    PImage = tvm.compute(
        Image.shape,
        lambda n, i, j, c: tvm.if_then_else(
            tvm.all(i == Index[n, i // 2, j // 2, c] // W,
                    j == Index[n, i // 2, j // 2, c] % W),
            POutput[n, i // 2, j // 2, c], 0.0))
    return PImage

def max_pool2d_nchw(input, pool_size, stride, padding, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(pool_size, int) or len(pool_size) == 2
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(padding, int) or len(padding) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride
    if isinstance(padding, int):
        pad_h = pad_w = padding
    else:
        pad_h, pad_w = padding
    if isinstance(pool_size, int):
        kernel_h = kernel_w = pool_size
    else:
        kernel_h, kernel_w = pool_size
    batch, in_channel, in_height, in_width = input.shape
    out_channel = in_channel
    # In Caffe, when the pooling extent is not evenly divisible, ceil is adopted,
    # while the convolution operator uses floor.
    #out_height = math.ceil((in_height+2*pad_h-kernel_h)/stride_h+1)
    #out_width = math.ceil((in_width+2*pad_w-kernel_w)/stride_w+1)
    out_height = simplify((in_height + 2 * pad_h - kernel_h) // stride_h + 1)
    out_width = simplify((in_width + 2 * pad_w - kernel_w) // stride_w + 1)
    kh = tvm.reduce_axis((0, kernel_h), name='kh')
    kw = tvm.reduce_axis((0, kernel_w), name='kw')
    output_data = lambda on, oc, oh, ow: tvm.max(tvm.expr.Select(
        tvm.all((oh * stride_h + kh) >= pad_h,
                (oh * stride_h + kh) < (in_height + pad_h),
                (ow * stride_w + kw >= pad_w),
                (ow * stride_w + kw < in_width + pad_w)),
        input[on, oc, oh * stride_h + kh - pad_h,
              ow * stride_w + kw - pad_w].astype(out_dtype),
        0.0), axis=[kh, kw])
    return tvm.compute((batch, out_channel, out_height, out_width), output_data,
                       tag="max_pool2d_nchw")

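# Hedged usage sketch for max_pool2d_nchw above (not part of the original source).
# It assumes `simplify` comes from topi.util and the legacy tvm.placeholder /
# tvm.create_schedule API; it simply lowers a 2x2, stride-2 pooling over a NCHW
# placeholder so the generated IR can be inspected. _demo_max_pool2d_nchw is a
# hypothetical helper name.
def _demo_max_pool2d_nchw():
    data = tvm.placeholder((1, 16, 32, 32), name="data")
    pool = max_pool2d_nchw(data, pool_size=2, stride=2, padding=0)
    s = tvm.create_schedule(pool.op)
    print(tvm.lower(s, [data, pool], simple_mode=True))
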
def compute_output(n, k, h, w):
    b = n * nH * nW + (h // m) * nW + w // m
    eps = h % m
    nu = w % m
    output_expr = {}
    for i in range(2):
        t0 = temp[k][b][i][0] + temp[k][b][i][1]
        t1 = temp[k][b][i][1] - temp[k][b][i][2]
        output_expr[(i, 0)] = t0 + temp[k][b][i][2]
        output_expr[(i, 1)] = t1 - temp[k][b][i][3]
    now = tvm.const(0.0, "float32")
    for ii in range(2):
        for jj in range(2):
            now = tvm.select(tvm.all(eps == ii, nu == jj),
                             output_expr[(ii, jj)],
                             now)
    return now

def make_conv2d_unoptimized(shapeX, shapeF, tgt, tgt_host, func_name,
                            dtype="float32"):
    in_size, in_size, in_channel, batch = shapeX
    kernel, kernel, in_channel, out_channel = shapeF
    pad = 1
    stride = 1

    A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
    W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
    out_size = (in_size - kernel + 2 * pad) // stride + 1
    # Pad input
    Apad = tvm.compute(
        (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
        lambda yy, xx, cc, nn: tvm.select(
            tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
            A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
        name='Apad')
    # Create reduction variables
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel), name='ry')
    rx = tvm.reduce_axis((0, kernel), name='rx')
    # Compute the convolution
    B = tvm.compute(
        (out_size, out_size, out_channel, batch),
        lambda yy, xx, ff, nn: tvm.sum(
            Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
            axis=[ry, rx, rc]),
        name='B')

    s = tvm.create_schedule(B.op)
    s[Apad].bind(Apad.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[Apad].bind(Apad.op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(B.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[B].bind(B.op.axis[1], tvm.thread_axis("threadIdx.x"))
    f = tvm.build(s, [A, W, B], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)

def conv2db(Image, Filter, POutput):
    """
    convolution with NHWC layout, backward

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Filter : tvm.tensor.Tensor
        4-D with shape [out_channels, in_channels, kernel_height, kernel_width]
    POutput : tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, out_channels]

    Returns
    -------
    PImage : tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    PFilter : tvm.tensor.Tensor, gradient of Filter
        4-D with shape (Filter.shape)
    """
    N, H, W, C = Image.shape
    K, _, Hk, Wk = Filter.shape
    rx = tvm.reduce_axis((0, H - (Hk - 1)), name='rx')
    ry = tvm.reduce_axis((0, W - (Wk - 1)), name='ry')
    rn = tvm.reduce_axis((0, N), name='rn')
    PFilter = tvm.compute(
        Filter.shape,
        lambda o, c, h, w: tvm.sum(
            Image[rn, h + rx, w + ry, c] * POutput[rn, rx, ry, o],
            axis=[rn, rx, ry]))
    rx_k = tvm.reduce_axis((0, Hk), name='rx_k')
    ry_k = tvm.reduce_axis((0, Wk), name='ry_k')
    ro = tvm.reduce_axis((0, K), name='ro')
    PImage = tvm.compute(
        Image.shape,
        lambda n, h, w, c: tvm.sum(
            Filter[ro, c, Hk - rx_k - 1, Wk - ry_k - 1] * tvm.if_then_else(
                tvm.all(h + rx_k >= Hk - 1, h + rx_k < H,
                        w + ry_k >= Wk - 1, w + ry_k < W),
                POutput[n, h + rx_k - (Hk - 1), w + ry_k - (Wk - 1), ro], 0.0),
            axis=[rx_k, ry_k, ro]))
    return (PImage, PFilter)

def argsort_ir(data_buf, out_index_buf):
    """Batched odd-even transposition sort.

    Parameters
    ----------
    data_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]

    out_index_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Indices of data in sorted order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox = get_const_tuple(data_buf.shape)
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data_buf)
    index_out = ib.buffer_ptr(out_index_buf)
    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")
    idxm = tvm.indexmod

    with ib.for_range(0, batch, for_type="unroll") as b:
        start = b * num_bbox
        for i in range(2):
            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
                bbox_id = tid * 2 + i
                with ib.if_scope(bbox_id < num_bbox):
                    index_out[start + bbox_id] = bbox_id
        with ib.for_range(0, num_bbox) as k:
            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
                offset = start + 2 * tid + idxm(k, 2)
                with ib.if_scope(
                        tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])):
                    temp_data[0] = p_data[offset]
                    p_data[offset] = p_data[offset + 1]
                    p_data[offset + 1] = temp_data[0]
                    temp_index[0] = index_out[offset]
                    index_out[offset] = index_out[offset + 1]
                    index_out[offset + 1] = temp_index[0]
    return ib.get()

def _sample(i, c, ph, pw):
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    roi_start_w = roi[1] * spatial_scale
    roi_start_h = roi[2] * spatial_scale
    roi_end_w = roi[3] * spatial_scale
    roi_end_h = roi[4] * spatial_scale

    roi_h = roi_end_h - roi_start_h
    roi_w = roi_end_w - roi_start_w
    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    hstart = ph * bin_h
    wstart = pw * bin_w
    hend = (ph + 1) * bin_h
    wend = (pw + 1) * bin_w
    hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height - 1)
    wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width - 1)
    hend = tvm.min(tvm.max(hend + roi_start_h, 0), height - 1)
    wend = tvm.min(tvm.max(wend + roi_start_w, 0), width - 1)
    non_empty = tvm.all(hstart < hend, wstart < wend)

    def min_value(dtype):
        return tvm.expr.Select(non_empty, tvm.min_value(dtype), tvm.const(0.0, dtype))

    stride_h = (hend - hstart) / 3.0
    stride_w = (wend - wstart) / 3.0
    hstart += stride_h
    wstart += stride_w
    stride_h = tvm.max(0.01, stride_h)
    stride_w = tvm.max(0.01, stride_w)
    _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y), min_value, name='max')
    rh = tvm.reduce_axis((0, tvm.expr.Select(non_empty, 2, 0)), 'rh')
    rw = tvm.reduce_axis((0, tvm.expr.Select(non_empty, 2, 0)), 'rw')
    return _max(_bilinear(batch_index, c, hstart + rh * stride_h, wstart + rw * stride_w),
                axis=[rh, rw])

def conv2d_batch(B, N, M, K, L, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((B, N, M), dtype=dtype, name="A")
    W = tvm.placeholder((K, L), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding),
        lambda b, i, j: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M + padding),
            A[b, i - padding, j - padding], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry")
    Output = tvm.compute(
        (B, N_out, M_out),
        lambda b, i, j: tvm.sum(
            Apad[b, i * stride + rx, j * stride + ry] * W[rx, ry],
            axis=[rx, ry]),
        name="Output")
    return Output.op, [A, W, Output]

def lrn_nchw(data, size, axis, alpha, beta, bias):
    # default: size=5, axis=1, alpha=0.0001, beta=0.75, bias=1
    #sqrt_out = lrn_sqrt()
    #pow_out = lrn_pow()
    #div_out = lrn_div()
    #return div_out
    out_dtype = data.dtype
    radius = size // 2
    batch, in_channel, in_height, in_width = data.shape
    ls = tvm.reduce_axis((0, size), name='ls')
    # pad and sqrt op:
    output_data1 = lambda on, oc, oh, ow: tvm.sum(tvm.expr.Select(
        tvm.all(oc >= radius, oc < (in_channel + radius)),
        data[on, oc - radius + ls, oh, ow].astype(out_dtype) *
        data[on, oc - radius + ls, oh, ow].astype(out_dtype),
        0.0), axis=[ls])
    sqr_out = tvm.compute((batch, in_channel, in_height, in_width), output_data1,
                          tag="lrn_sqrt_op")
    #return sqr_out
    # pow op:
    output_data2 = lambda on, oc, oh, ow: tvm.power(
        (1 + (alpha / size * sqr_out[on, oc, oh, ow].astype(out_dtype))), beta)
    pow_out = tvm.compute((batch, in_channel, in_height, in_width), output_data2,
                          tag="lrn_pow_op")
    #return pow_out
    # div op:
    output_data3 = lambda on, oc, oh, ow: tvm.expr.Div(
        data[on, oc, oh, ow].astype(out_dtype),
        pow_out[on, oc, oh, ow].astype(out_dtype))
    div_out = tvm.compute((batch, in_channel, in_height, in_width), output_data3,
                          tag="lrn_div_op")
    return div_out

def compute_X_dot_A(k, b, eps, nu, kk, bb):
    temp_expr = {}
    for i in range(m):
        m1_add_m2 = A_T_dot_M[k][b][i][1][kk][bb] + A_T_dot_M[k][b][i][2][kk][bb]
        m1_sub_m2 = A_T_dot_M[k][b][i][1][kk][bb] - A_T_dot_M[k][b][i][2][kk][bb]
        m3_add_m4 = A_T_dot_M[k][b][i][3][kk][bb] + A_T_dot_M[k][b][i][4][kk][bb]
        m3_sub_m4 = A_T_dot_M[k][b][i][3][kk][bb] - A_T_dot_M[k][b][i][4][kk][bb]
        m5_add_m6 = A_T_dot_M[k][b][i][5][kk][bb] + A_T_dot_M[k][b][i][6][kk][bb]
        m5_sub_m6 = A_T_dot_M[k][b][i][5][kk][bb] - A_T_dot_M[k][b][i][6][kk][bb]
        s0 = A_T_dot_M[k][b][i][0][kk][bb] + m1_add_m2
        s5 = A_T_dot_M[k][b][i][7][kk][bb] + m1_sub_m2
        s1 = m1_sub_m2 + m5_sub_m6 * 16
        s4 = m1_add_m2 + m3_add_m4 * 16
        s2 = m1_add_m2 + 8 * m5_add_m6
        s3 = m1_sub_m2 + 8 * m3_sub_m4
        s0 = s0 + m5_add_m6 * 32
        s5 = s5 + m3_sub_m4 * 32
        s1 = s1 + m3_sub_m4 * 2
        s4 = s4 + m5_add_m6 * 2
        s0 = s0 + m3_add_m4
        s5 = s5 + m5_sub_m6
        s2 = s2 + m3_add_m4 * 4
        s3 = s3 + m5_sub_m6 * 4
        temp_expr[(i, 0)] = s0
        temp_expr[(i, 1)] = s1
        temp_expr[(i, 2)] = s2
        temp_expr[(i, 3)] = s3
        temp_expr[(i, 4)] = s4
        temp_expr[(i, 5)] = s5
    now = tvm.const(0.0, "float32")
    for ii in range(m):
        for jj in range(m):
            now = tvm.select(tvm.all(eps == ii, nu == jj),
                             temp_expr[(ii, jj)],
                             now)
    return now

def compute_X_dot_B(b, eps, nu, c, bb):
    temp_expr = {}
    for i in range(alpha):
        wd0 = B_T_dot_X[b][c][i][0][bb] - B_T_dot_X[b][c][i][6][bb]
        d4_sub_d2 = B_T_dot_X[b][c][i][4][bb] - B_T_dot_X[b][c][i][2][bb]
        wd7 = B_T_dot_X[b][c][i][7][bb] - B_T_dot_X[b][c][i][1][bb]
        d3_sub_d5 = B_T_dot_X[b][c][i][3][bb] - B_T_dot_X[b][c][i][5][bb]
        wd1 = B_T_dot_X[b][c][i][2][bb] + B_T_dot_X[b][c][i][6][bb]
        wd2 = B_T_dot_X[b][c][i][1][bb] + B_T_dot_X[b][c][i][5][bb]
        wd4 = B_T_dot_X[b][c][i][5][bb] + B_T_dot_X[b][c][i][1][bb] * 0.25
        wd5 = B_T_dot_X[b][c][i][6][bb] - B_T_dot_X[b][c][i][4][bb] * 5
        wd3 = B_T_dot_X[b][c][i][6][bb] + B_T_dot_X[b][c][i][2][bb] * 0.25
        wd6 = B_T_dot_X[b][c][i][1][bb] + B_T_dot_X[b][c][i][5][bb] * 0.25
        wd0 = wd0 + d4_sub_d2 * 5.25
        wd7 = wd7 + d3_sub_d5 * 5.25
        wd1 = wd1 - B_T_dot_X[b][c][i][4][bb] * 4.25
        wd2 = wd2 - B_T_dot_X[b][c][i][3][bb] * 4.25
        wd3 = wd3 - B_T_dot_X[b][c][i][4][bb] * 1.25
        wd5 = wd5 + B_T_dot_X[b][c][i][2][bb] * 4
        wd4 = wd4 - B_T_dot_X[b][c][i][3][bb] * 1.25
        wd6 = wd6 - B_T_dot_X[b][c][i][3][bb] * 1.25
        temp_expr[(i, 0)] = wd0
        temp_expr[(i, 1)] = wd1 + wd2
        temp_expr[(i, 2)] = wd1 - wd2
        temp_expr[(i, 3)] = wd3 + wd4 * 2
        temp_expr[(i, 4)] = wd3 - wd4 * 2
        temp_expr[(i, 5)] = wd5 + wd6 * 2
        temp_expr[(i, 6)] = wd5 - wd6 * 2
        temp_expr[(i, 7)] = wd7
    now = tvm.const(0.0, "float32")
    for ii in range(alpha):
        for jj in range(alpha):
            now = tvm.select(tvm.all(eps == ii, nu == jj),
                             temp_expr[(ii, jj)],
                             now)
    return now

def test_lower_floormod():
    data = get_ref_data()
    for dtype in ["int32", "int64", "int16"]:
        x = tvm.var("x", dtype=dtype)
        y = tvm.var("y", dtype=dtype)
        zero = tvm.const(0, dtype)
        # no constraints
        res = lower_intrin(tvm.floormod(x, y))
        check_value(res, x, y, data, lambda a, b: a % b)
        # rhs >= 0
        res = lower_intrin(tvm.expr.Select(y >= 0, tvm.floormod(x, y), zero))
        check_value(res, x, y, data, lambda a, b: a % b if b > 0 else 0)
        # lhs >= 0
        res = lower_intrin(
            tvm.expr.Select(tvm.all(y >= 0, x >= 0), tvm.floormod(x, y), zero))
        check_value(res, x, y, data, lambda a, b: a % b if b > 0 and a >= 0 else 0)
        # const power of two
        res = lower_intrin(tvm.floormod(x, tvm.const(8, dtype=dtype)))
        check_value(res, x, y, [(a, b) for a, b in data if b == 8], lambda a, b: a % b)

def conv2d_channel(N, M, C, K, L, O, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((N, M, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (N + 2 * padding, M + 2 * padding, C),
        lambda i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M + padding),
            A[i - padding, j - padding, k], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry")
    rc = tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (N_out, M_out, O),
        lambda i, j, k: tvm.sum(
            Apad[i * stride + rx, j * stride + ry, rc] * W[rx, ry, rc, k],
            axis=[rx, ry, rc]),
        name="Output")
    return Output.op, [A, W, Output]

def compute_B_T_dot_X(b, c, eps, nu, bb):
    temp_expr = {}
    for j in range(alpha):
        wd0 = input_tile[b][c][0][j][bb] - input_tile[b][c][6][j][bb]
        d4_sub_d2 = input_tile[b][c][4][j][bb] - input_tile[b][c][2][j][bb]
        wd7 = input_tile[b][c][7][j][bb] - input_tile[b][c][1][j][bb]
        d3_sub_d5 = input_tile[b][c][3][j][bb] - input_tile[b][c][5][j][bb]
        wd1 = input_tile[b][c][2][j][bb] + input_tile[b][c][6][j][bb]
        wd2 = input_tile[b][c][1][j][bb] + input_tile[b][c][5][j][bb]
        wd4 = input_tile[b][c][5][j][bb] + input_tile[b][c][1][j][bb] * 0.25
        wd5 = input_tile[b][c][6][j][bb] - input_tile[b][c][4][j][bb] * 5
        wd3 = input_tile[b][c][6][j][bb] + input_tile[b][c][2][j][bb] * 0.25
        wd6 = input_tile[b][c][1][j][bb] + input_tile[b][c][5][j][bb] * 0.25
        wd0 = wd0 + d4_sub_d2 * 5.25
        wd7 = wd7 + d3_sub_d5 * 5.25
        wd1 = wd1 - input_tile[b][c][4][j][bb] * 4.25
        wd2 = wd2 - input_tile[b][c][3][j][bb] * 4.25
        wd3 = wd3 - input_tile[b][c][4][j][bb] * 1.25
        wd5 = wd5 + input_tile[b][c][2][j][bb] * 4
        wd4 = wd4 - input_tile[b][c][3][j][bb] * 1.25
        wd6 = wd6 - input_tile[b][c][3][j][bb] * 1.25
        temp_expr[(0, j)] = wd0
        temp_expr[(1, j)] = wd1 + wd2
        temp_expr[(2, j)] = wd1 - wd2
        temp_expr[(3, j)] = wd3 + wd4 * 2
        temp_expr[(4, j)] = wd3 - wd4 * 2
        temp_expr[(5, j)] = wd5 + wd6 * 2
        temp_expr[(6, j)] = wd5 - wd6 * 2
        temp_expr[(7, j)] = wd7
    now = tvm.const(0.0, "float32")
    for ii in range(alpha):
        for jj in range(alpha):
            now = tvm.select(tvm.all(eps == ii, nu == jj),
                             temp_expr[(ii, jj)],
                             now)
    return now

def _pool(i, c, ph, pw):
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
    roi_start_h = tvm.round(roi_start_h * spatial_scale).astype('int32')
    roi_start_w = tvm.round(roi_start_w * spatial_scale).astype('int32')
    roi_end_h = tvm.round(roi_end_h * spatial_scale).astype('int32')
    roi_end_w = tvm.round(roi_end_w * spatial_scale).astype('int32')

    # force malformed ROIs to be 1x1
    roi_h = tvm.max(roi_end_h - roi_start_h + 1, tvm.const(1, 'int32'))
    roi_w = tvm.max(roi_end_w - roi_start_w + 1, tvm.const(1, 'int32'))

    bin_h = roi_h.astype(dtype) / pooled_size_h
    bin_w = roi_w.astype(dtype) / pooled_size_w

    # use epsilon to prevent floating point precision loss in floor/ceil
    epsilon = tvm.const(0.00001, dtype)
    hstart = tvm.floor(ph * bin_h + epsilon).astype('int32')
    wstart = tvm.floor(pw * bin_w + epsilon).astype('int32')
    hend = tvm.ceil((ph + 1) * bin_h - epsilon).astype('int32')
    wend = tvm.ceil((pw + 1) * bin_w - epsilon).astype('int32')
    hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height)
    wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width)
    hend = tvm.min(tvm.max(hend + roi_start_h, 0), height)
    wend = tvm.min(tvm.max(wend + roi_start_w, 0), width)
    non_empty = tvm.all(hstart < hend, wstart < wend)

    min_value = lambda dtype: tvm.if_then_else(
        non_empty, tvm.min_value(dtype), tvm.const(0.0, dtype))
    # pylint: disable=unnecessary-lambda
    _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y), min_value, name='max')
    rh = tvm.reduce_axis((0, hend - hstart), 'rh')
    rw = tvm.reduce_axis((0, wend - wstart), 'rw')
    return _max(data[batch_index, c, hstart + rh, wstart + rw], axis=[rh, rw])

def conv3d(N, M, P, K, L, Q, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((N, M, P), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, Q), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    P_out = max(0, (P + padding * 2 - Q) // stride) + 1
    Apad = tvm.compute(
        (N + 2 * padding, M + 2 * padding, P + 2 * padding),
        lambda i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, k >= padding,
                    i < N + padding, j < M + padding, k < P + padding),
            A[i - padding, j - padding, k - padding], 0.0),
        name="Apad")
    rx, ry, rz = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis(
        (0, L), name="ry"), tvm.reduce_axis((0, Q), name="rz")
    Output = tvm.compute(
        (N_out, M_out, P_out),
        lambda i, j, k: tvm.sum(
            Apad[i * stride + rx, j * stride + ry, k * stride + rz] * W[rx, ry, rz],
            axis=[rx, ry, rz]),
        name="Output")
    return Output.op, [A, W, Output]

def zero_pad2d(inputs, padding=0):
    """Zero padding for 2d tensor

    Args:
    -----------------------------
    inputs : tvm.tensor.Tensor
        shape [batch, channel, height, width]
    padding: (optional:0) int or tuple
        expected: (h_pad_up, h_pad_down, w_pad_up, w_pad_down)
    -----------------------------

    Returns:
    -----------------------------
    tvm.tensor.Tensor
        shape [batch, channel, padded_height, padded_width]
    -----------------------------
    """
    padding = (padding, padding, padding, padding) if isinstance(
        padding, (int, tvm.expr.IntImm)) else padding
    assert_print(isinstance(padding, tuple),
                 "type(padding)={}".format(type(padding)))
    if len(padding) == 2:
        padding = (padding[0], padding[0], padding[1], padding[1])
    assert_print(len(padding) == 4)

    padding_zero = 0.0 if "float" in inputs.dtype else 0

    batch_size, in_channel, height, width = inputs.shape
    return tvm.compute(
        (batch_size, in_channel, height + padding[0] + padding[1],
         width + padding[2] + padding[3]),
        lambda b, c, h, w: tvm.if_then_else(
            tvm.all(h >= padding[0], h < height + padding[0],
                    w >= padding[2], w < width + padding[2]),
            inputs[b, c, h - padding[0], w - padding[2]],
            padding_zero))

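# Hedged usage sketch for zero_pad2d above (not part of the original source).
# It assumes assert_print is available in the same module and the legacy TVM
# API used by these snippets. Padding by 1 on every side turns a (1, 3, 32, 32)
# input into (1, 3, 34, 34); _demo_zero_pad2d is a hypothetical helper name.
def _demo_zero_pad2d():
    inputs = tvm.placeholder((1, 3, 32, 32), name="inputs")
    padded = zero_pad2d(inputs, padding=1)
    s = tvm.create_schedule(padded.op)
    print(tvm.lower(s, [inputs, padded], simple_mode=True))
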
def test_copy_pad():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m + 2, l), lambda i, j:
                    tvm.select(tvm.all(i >= 1, i < m + 1),
                               A[i - 1, j], 1.0), name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert pad_before[0].value == 1
        assert pad_before[1].value == 0
        assert pad_after[0].value == 1
        assert pad_after[1].value == 0
        assert pad_value.value == 1.0
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)

def test_cmp_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")
    # const int bound
    ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool"))
    ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool"))

    # canonicalization
    ck.verify((x - 10).equal(0), x.equal(10))
    ck.verify((10 - x).equal(0), x.equal(10))
    ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0)))

    # cmp bound
    ck.verify(x + y < x + z, y < z)
    ck.verify(x + y < z + x, y < z)
    ck.verify(y + x < x + z, y < z)
    ck.verify(y + x < z + x, y < z)
    ck.verify(y - x < z - x, y < z)
    ck.verify(x - y < x - z, z < y)

    ck.verify(x < z + x, tvm.expr.LT(0, z))
    ck.verify(x < x + z, tvm.expr.LT(0, z))

    ck.verify(100 < x + 1, tvm.expr.LT(99, x))
    ck.verify(1 < 100 - x, tvm.expr.LT(x, 99))
    ck.verify(x * 3 < y * 3, x < y)
    ck.verify(x * (-3) < y * (-3), y < x)
    ck.verify(x * 3 >= y * 3, y <= x)

    ck.verify(x * 4 >= 2, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 50, tvm.expr.LE(25, x))
    ck.verify(x / 2 < 3, x < 6)
    ck.verify(x * 4 <= 2, x <= 0)
    ck.verify(3 < x / 2, tvm.expr.LT(7, x))

    ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4))
    ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0))

    ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y))
    ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4))

    ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2))
    ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2))
    ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y))

    ck.verify(tvm.min(x, 11) < 10, x < 10)
    ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool"))
    ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x))
    ck.verify(x + 1 < tvm.max(8, x), x < 7)

    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True)
    ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True)
    ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True)

    ck.verify(x < 11, tvm.const(1, "bool"))
    ck.verify(x <= 10, tvm.const(1, "bool"))
    ck.verify(z <= 5, tvm.const(1, "bool"))
    ck.verify(x + y <= 10, tvm.const(1, "bool"))
    ck.verify(x + y >= -10, tvm.const(1, "bool"))
    ck.verify(z - 5 <= y + 10, tvm.const(1, "bool"))
    ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool"))
    ck.verify(x * y <= 0, tvm.const(1, "bool"))
    ck.verify((x + 1) * (y - 1) < 0, tvm.const(1, "bool"))
    ck.verify(y * y >= 0, tvm.const(1, "bool"))

def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp_score, threshold):
    """Low level IR routine for transform location data preparation.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    temp_valid_count : Buffer
        Output intermediate result buffer

    temp_cls_id : Buffer
        Output intermediate result buffer

    temp_score : Buffer
        Output buffer

    threshold : float
        Threshold to be a positive prediction.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]

    ib = tvm.ir_builder.create()

    cls_prob = ib.buffer_ptr(cls_prob)
    cls_id = ib.buffer_ptr(temp_cls_id)
    valid_count = ib.buffer_ptr(valid_count)
    temp_valid_count = ib.buffer_ptr(temp_valid_count)
    score = ib.buffer_ptr(temp_score)

    threshold = tvm.make.node("FloatImm", dtype="float32", value=threshold)

    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = (batch_size * num_anchors) // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    with ib.if_scope(tid < batch_size * num_anchors):
        i = tid / num_anchors
        j = tid % num_anchors
        valid_count[i] = 0
        score[tid] = -1.0
        cls_id[tid] = 0
        with ib.for_range(0, num_classes - 1) as k:
            temp = cls_prob[i * num_classes * num_anchors + (k + 1) * num_anchors + j]
            cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid])
            score[tid] = tvm.max(temp, score[tid])
        with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)):
            cls_id[tid] = 0
        with ib.if_scope(cls_id[tid] > 0):
            temp_valid_count[tid] = 1
        with ib.else_scope():
            temp_valid_count[tid] = 0

        with ib.if_scope(tid < batch_size):
            with ib.for_range(0, num_anchors) as k:
                with ib.if_scope(k > 0):
                    temp_valid_count[tid * num_anchors + k] += \
                        temp_valid_count[tid * num_anchors + k - 1]
            valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors - 1]

    return ib.get()

def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before,
                axis_mul_after, axis, is_descend):
    """Low level IR routine (subfunction 3/4) for Odd-Even-Transposition sorting.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    index : Buffer
        Buffer of number of valid output boxes.

    new_data : Buffer
        Buffer of flattened segmented data.

    new_index : Buffer
        Buffer of flattened segmented indices.

    loc : Buffer
        Buffer of start locations of each sorting segment.

    out_index : Buffer
        Output buffer of output box indexes sorted by score in a flattened
        segmented format.

    axis_mul_before : int
        The multiplication result of axis dimensions before axis.

    axis_mul_after : int
        The multiplication result of axis dimensions after axis.

    axis : int
        The axis used for sorting.

    is_descend : bool
        If the sorted data is in descending order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    dshape = loc.shape
    fshape = data.shape[axis] * dshape[0]
    temp_data = ib.allocate(
        "float32", dshape, name="temp_data", scope="local")
    p_data = ib.buffer_ptr(data)
    p_index = ib.buffer_ptr(index)
    data_new = ib.buffer_ptr(new_data)
    index_new = ib.buffer_ptr(new_index)
    index_out = ib.buffer_ptr(out_index)
    sizes = ib.buffer_ptr(loc)
    nthread_tx = max_threads
    nthread_bx = fshape // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
            with ib.if_scope(tid == 0):
                start = 0
            with ib.else_scope():
                start = sizes[tid - 1]
            # OddEvenTransposeSort
            with ib.for_range(0, p_index[tid], name="k") as k:
                with ib.for_range(0, p_index[tid] - 1, name="i") as i:
                    with ib.if_scope(i % 2 == k % 2):
                        with ib.if_scope(((data_new[i + start] < data_new[i + start + 1]) ==
                                          is_descend)):
                            temp_data[tid] = data_new[i + start]
                            data_new[i + start] = data_new[i + start + 1]
                            data_new[i + start + 1] = temp_data[tid]
                            index_out[tid] = index_new[i + start]
                            index_new[i + start] = index_new[i + start + 1]
                            index_new[i + start + 1] = index_out[tid]
        with ib.if_scope(tid < 1):
            with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i:
                index_out[i] = index_new[i]
    with ib.else_scope():
        with ib.for_range(0, fshape, name="k", for_type="unroll") as k:
            with ib.if_scope(tvm.all(k % 2 == tid % 2, tid < fshape)):
                with ib.if_scope(k % 2 == 0):
                    with ib.if_scope(tvm.all(tid + 1 < fshape,
                                             (p_data[tid] < p_data[tid + 1]) == is_descend)):
                        data_new[tid] = p_data[tid + 1]
                        index_out[tid] = index_new[tid + 1]
                    with ib.else_scope():
                        data_new[tid] = p_data[tid]
                        index_out[tid] = index_new[tid]
                with ib.else_scope():
                    with ib.if_scope(tvm.all(tid + 1 < fshape,
                                             (data_new[tid] < data_new[tid + 1]) == is_descend)):
                        p_data[tid] = data_new[tid + 1]
                        index_new[tid] = index_out[tid + 1]
                    with ib.else_scope():
                        p_data[tid] = data_new[tid]
                        index_new[tid] = index_out[tid]
            with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)):
                with ib.if_scope(k % 2 == 0):
                    with ib.if_scope(tvm.all(tid > 0,
                                             (p_data[tid - 1] < p_data[tid]) == is_descend)):
                        data_new[tid] = p_data[tid - 1]
                        index_out[tid] = index_new[tid - 1]
                    with ib.else_scope():
                        data_new[tid] = p_data[tid]
                        index_out[tid] = index_new[tid]
                with ib.else_scope():
                    with ib.if_scope(tvm.all(tid > 0,
                                             (data_new[tid - 1] < data_new[tid]) == is_descend)):
                        p_data[tid] = data_new[tid - 1]
                        index_new[tid] = index_out[tid - 1]
                    with ib.else_scope():
                        p_data[tid] = data_new[tid]
                        index_new[tid] = index_out[tid]
        with ib.if_scope(fshape % 2 == 1):
            with ib.if_scope(tid < 1):
                with ib.for_range(0, fshape, name="k") as k:
                    index_out[tid] = index_new[tid]

    body = ib.get()
    return body

def sort_ir(data, output, axis, is_ascend):
    """Low level IR to do nms sorting on the GPU, same usage as
    tvm.contrib.sort.argsort on the CPU.

    Parameters
    ----------
    data : Buffer
        Buffer of input data.

    output : Buffer
        Output buffer of indices of sorted tensor with same shape as data.

    axis : Int
        Axis along which to sort the input tensor.

    is_ascend : Boolean
        Whether to sort in ascending or descending order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    size = 1
    axis_mul_before = 1
    axis_mul_after = 1
    shape = data.shape
    if axis < 0:
        axis = len(shape) + axis
    for i, value in enumerate(shape, 0):
        size *= value
        if i < axis:
            axis_mul_before *= value
        elif i > axis:
            axis_mul_after *= value
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    data = ib.buffer_ptr(data)
    output = ib.buffer_ptr(output)
    nthread_tx = max_threads
    nthread_bx = size // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("vthread")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "virtual_thread", nthread_bx)
    tid = bx * nthread_tx + tx
    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
    temp_index = ib.allocate("float32", (1,), name="temp_index", scope="local")
    is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend)

    with ib.for_range(0, axis_mul_before) as i:
        with ib.for_range(0, axis_mul_after) as j:
            current_sort_num = shape[axis]
            base_idx = i * shape[axis] * axis_mul_after + j
            with ib.if_scope(tid < shape[axis]):
                output[base_idx + tid * axis_mul_after] = tid.astype("float32")
            # OddEvenTransposeSort
            with ib.for_range(0, current_sort_num) as k:
                with ib.if_scope(tid < (current_sort_num + 1) // 2):
                    offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after
                    with ib.if_scope(tvm.all(is_ascend == 1,
                                             2 * tid + (k % 2) + 1 < current_sort_num,
                                             data[offset] > data[offset + axis_mul_after])):
                        temp_data[0] = data[offset]
                        data[offset] = data[offset + axis_mul_after]
                        data[offset + axis_mul_after] = temp_data[0]
                        temp_index[0] = output[offset]
                        output[offset] = output[offset + axis_mul_after]
                        output[offset + axis_mul_after] = temp_index[0]
                    with ib.if_scope(tvm.all(is_ascend == 0,
                                             2 * tid + (k % 2) + 1 < current_sort_num,
                                             data[offset] < data[offset + axis_mul_after])):
                        temp_data[0] = data[offset]
                        data[offset] = data[offset + axis_mul_after]
                        data[offset + axis_mul_after] = temp_data[0]
                        temp_index[0] = output[offset]
                        output[offset] = output[offset + axis_mul_after]
                        output[offset + axis_mul_after] = temp_index[0]
                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                      tvm.convert(['shared']),
                                      tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()

def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
    """Low level IR routine for non-maximum suppression in the multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.
    sort_result : Buffer
        Buffer of output box indexes sorted by score.
    valid_count : Buffer
        Buffer of number of valid output boxes.
    out : Buffer
        Output buffer.
    nms_threshold : float
        Non-maximum suppression threshold.
    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.
    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    ty = tvm.thread_axis("threadIdx.y")
    bx = tvm.thread_axis("blockIdx.x")
    by = tvm.thread_axis("blockIdx.y")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    nthread_ty = max_threads
    nthread_by = 6 // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(ty, "thread_extent", nthread_ty)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)
    i = bx * max_threads + tx
    j = by * max_threads + ty
    nms_threshold_node = tvm.make.node(
        "FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node(
        "IntImm", dtype="int32", value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll", name="n") as n:
        with ib.if_scope(
                tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                        p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(
                tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                nms_topk, p_valid_count[n])
            with ib.if_scope(i < nkeep):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + i] * 6 + j)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.if_scope(i < p_valid_count[n] - nkeep):
                    with ib.if_scope(j < 6):
                        p_out[(n * num_anchors * 6 + (i + nkeep) * 6 + j)] = \
                            p_data[(n * num_anchors * 6 + (i + nkeep) * 6 + j)]
            # Apply nms
            with ib.if_scope(i < p_valid_count[n]):
                offset_i = i * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0):
                    with ib.if_scope(j < p_valid_count[n]):
                        offset_j = j * 6
                        with ib.if_scope(tvm.all(j > i, p_out[n * num_anchors * 6
                                                              + offset_j] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_i] ==
                                                     p_out[n * num_anchors * 6 + offset_j])):
                                # Suppress when force_suppress == True or the class ids are equal
                                iou = calculate_overlap(
                                    p_out, n * num_anchors * 6 + offset_i + 2,
                                    n * num_anchors * 6 + offset_j + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_j] = -1.0
        with ib.else_scope():
            with ib.if_scope(i < p_valid_count[n]):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[n * num_anchors * 6 + i * 6 + j]
        # Set invalid entry to be -1
        with ib.if_scope(i < num_anchors - p_valid_count[n]):
            with ib.if_scope(j < 6):
                p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0

    body = ib.get()
    return body
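# Illustrative only: a plain-Python reference (not part of the kernel above)
# for the IoU formula in calculate_overlap; boxes are [xmin, ymin, xmax, ymax].
def iou(box_a, box_b):
    w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = w * h
    union = ((box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
             + (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) - inter)
    return 0.0 if union <= 0.0 else inter / union

assert abs(iou([0, 0, 1, 1], [0, 0, 1, 1]) - 1.0) < 1e-6   # identical boxes
assert iou([0, 0, 1, 1], [2, 2, 3, 3]) == 0.0              # disjoint boxes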
def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold):
    """Low level IR routine for transform location data preparation.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.
    valid_count : Buffer
        Buffer of number of valid output boxes.
    temp_flag : Buffer
        Output intermediate result buffer.
    temp_id : Buffer
        Output intermediate result buffer.
    temp_score_out : Buffer
        Output buffer.
    threshold : float
        Threshold to be a positive prediction.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    score = ib.buffer_ptr(temp_score_out)
    cls_id = ib.buffer_ptr(temp_id)
    flag = ib.buffer_ptr(temp_flag)
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    nthread_tx = max_threads
    nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx
    p_cls_prob = ib.buffer_ptr(cls_prob)
    p_valid_count = ib.buffer_ptr(valid_count)

    with ib.if_scope(tid < batch_size * num_anchors):
        n = tid / num_anchors  # batch index
        i = tid % num_anchors  # anchor index
        score[i] = -1.0
        cls_id[i] = 0
        p_valid_count[n] = 0
        with ib.for_range(0, num_classes - 1, name="k") as k:
            temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i]
            with ib.if_scope(temp > score[i]):
                cls_id[i] = k + 1
                score[i] = temp
        with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)):
            cls_id[i] = 0
        with ib.if_scope(cls_id[i] > 0):
            flag[i] = 1
        with ib.else_scope():
            flag[i] = 0

    with ib.if_scope(tid < batch_size):
        with ib.for_range(0, num_anchors, name="k") as k:
            with ib.if_scope(k > 0):
                flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1]
        # the last prefix-sum entry of batch `tid` is that batch's valid count
        p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1]

    body = ib.get()
    return body
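# Illustrative only: a NumPy sketch (not the operator itself) of what
# transform_loc_pre computes for a single batch: per-anchor argmax over the
# non-background classes, score thresholding, and a running sum of the valid
# flags whose last entry is the valid count.  `valid_count_reference` is a
# hypothetical helper name.
import numpy as np

def valid_count_reference(cls_prob, threshold):
    """cls_prob: (num_classes, num_anchors) with class 0 as background."""
    scores = cls_prob[1:].max(axis=0)               # best foreground score
    cls_id = cls_prob[1:].argmax(axis=0) + 1        # restore original class id
    flag = (scores >= threshold).astype(np.int32)   # anchors kept as positives
    cls_id = np.where(flag > 0, cls_id, 0)
    return int(flag.cumsum()[-1]), cls_id, scores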
def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances):
    """Low level IR routine for transform location in multibox_detection operator.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.
    loc_pred : Buffer
        Buffer of location regression predictions.
    anchor : Buffer
        Buffer of prior anchor boxes.
    valid_count : Buffer
        Buffer of number of valid output boxes.
    out : Buffer
        Output buffer.
    clip : boolean
        Whether to clip out-of-boundary boxes.
    threshold : float
        Threshold to be a positive prediction.
    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
        """Transform prior anchor box to output box through location predictions.
        """
        al = anchor[anchor_base_idx]
        at = anchor[anchor_base_idx + 1]
        ar = anchor[anchor_base_idx + 2]
        ab = anchor[anchor_base_idx + 3]
        aw = ar - al
        ah = ab - at
        ax = (al + ar) / 2.0
        ay = (at + ab) / 2.0
        px = loc[loc_base_idx]
        py = loc[loc_base_idx + 1]
        pw = loc[loc_base_idx + 2]
        ph = loc[loc_base_idx + 3]
        ox = px * vx * aw + ax
        oy = py * vy * ah + ay
        ow = tvm.exp(pw * vw) * aw / 2.0
        oh = tvm.exp(ph * vh) * ah / 2.0
        return tvm.select(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
               tvm.select(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
               tvm.select(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
               tvm.select(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)

    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]
    ib = tvm.ir_builder.create()
    p_cls_prob = ib.buffer_ptr(cls_prob)
    p_loc_pred = ib.buffer_ptr(loc_pred)
    p_anchor = ib.buffer_ptr(anchor)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)

    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        p_valid_count[n] = 0
        with ib.for_range(0, num_anchors, name="i") as i:
            # Find the predicted class id and probability
            score = ib.allocate('float32', (1,), name="score", scope="local")
            cls_id = ib.allocate('int32', (1,), name="id", scope="local")
            score[0] = -1.0
            cls_id[0] = 0
            with ib.for_range(0, num_classes, name="j") as j:
                with ib.if_scope(j > 0):
                    temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i]
                    cls_id[0] = tvm.select(temp > score[0], j, cls_id[0])
                    score[0] = tvm.max(temp, score[0])
            with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)):
                cls_id[0] = 0
            # [id, prob, xmin, ymin, xmax, ymax]
            # Remove background, restore original id
            with ib.if_scope(cls_id[0] > 0):
                out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6
                p_out[out_base_idx] = cls_id[0] - 1.0
                p_out[out_base_idx + 1] = score[0]
                offset = i * 4
                p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \
                    p_out[out_base_idx + 5] = transform_loc(p_loc_pred,
                                                            n * num_anchors * 4 + offset,
                                                            p_anchor, offset, clip,
                                                            variances[0], variances[1],
                                                            variances[2], variances[3])
                p_valid_count[n] += 1

    return ib.get()
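# Illustrative only: the anchor decoding performed by transform_loc above,
# written out in NumPy.  The helper name `decode_box` is made up.
import numpy as np

def decode_box(anchor, loc, variances=(0.1, 0.1, 0.2, 0.2), clip=True):
    """anchor: [xmin, ymin, xmax, ymax]; loc: [dx, dy, dw, dh]."""
    al, at, ar, ab = anchor
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2.0, (at + ab) / 2.0
    px, py, pw, ph = loc
    vx, vy, vw, vh = variances
    ox = px * vx * aw + ax                  # decoded center
    oy = py * vy * ah + ay
    ow = np.exp(pw * vw) * aw / 2.0         # decoded half width / height
    oh = np.exp(ph * vh) * ah / 2.0
    box = np.array([ox - ow, oy - oh, ox + ow, oy + oh])
    return np.clip(box, 0.0, 1.0) if clip else box

# With zero offsets the anchor is reproduced unchanged:
# decode_box([0.2, 0.2, 0.6, 0.6], [0, 0, 0, 0]) -> [0.2, 0.2, 0.6, 0.6]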
def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
    """Low level IR routine for non-maximum suppression in the multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.
    sort_result : Buffer
        Buffer of output box indexes sorted by score.
    valid_count : Buffer
        Buffer of number of valid output boxes.
    out : Buffer
        Output buffer.
    nms_threshold : float
        Non-maximum suppression threshold.
    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.
    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.for_range(0, nkeep, name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + l] * 6 + m)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l:
                    with ib.for_range(0, 6, name="m") as m:
                        p_out[(n * num_anchors * 6 + (l + nkeep) * 6 + m)] = \
                            p_data[(n * num_anchors * 6 + (l + nkeep) * 6 + m)]
            # Apply nms
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                offset_l = l * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0):
                    with ib.for_range(0, p_valid_count[n], name="m") as m:
                        offset_m = m * 6
                        with ib.if_scope(tvm.all(m > l, p_out[n * num_anchors * 6
                                                              + offset_m] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_l] ==
                                                     p_out[n * num_anchors * 6 + offset_m])):
                                # Suppress when force_suppress == True or the class ids are equal
                                iou = calculate_overlap(p_out,
                                                        n * num_anchors * 6 + offset_l + 2,
                                                        n * num_anchors * 6 + offset_m + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_m] = -1.0
        with ib.else_scope():
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[n * num_anchors * 6 + l * 6 + m]
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l:
            with ib.for_range(0, 6, name="m") as m:
                p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0

    return ib.get()
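# Illustrative only: a sequential plain-Python sketch of the suppression loop
# above.  Rows are [class_id, score, xmin, ymin, xmax, ymax] already sorted by
# score; `iou` is the reference helper from the earlier sketch and
# `nms_reference` is a made-up name, not the operator.
def nms_reference(boxes, nms_threshold, force_suppress=False):
    for l, box_l in enumerate(boxes):
        if box_l[0] < 0:                    # already suppressed
            continue
        for m in range(l + 1, len(boxes)):
            box_m = boxes[m]
            same_class = box_l[0] == box_m[0]
            if box_m[0] >= 0 and (force_suppress or same_class):
                if iou(box_l[2:], box_m[2:]) >= nms_threshold:
                    box_m[0] = -1.0         # mark as suppressed
    return boxes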
def nms_ir(data, sorted_index, valid_count, out, box_indices,
           max_output_size, iou_threshold, force_suppress,
           top_k, coord_start, id_index):
    """Low level IR routine for non-maximum suppression.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.
    sorted_index : Buffer
        Buffer of output box indexes sorted by score.
    valid_count : Buffer
        Buffer of number of valid output boxes.
    out : Buffer
        Output buffer.
    box_indices : Buffer
        Output buffer of the indices of the kept boxes.
    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.
    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.
    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.
    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.
    coord_start : int
        Start index of the consecutive 4 coordinates.
    id_index : int
        Index of the class categories, -1 to disable.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.expr.Select(u <= 0.0, 0.0, i / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    k = bx * max_threads + tx

    iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
    top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
    coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as i:
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.all(top_k > 0, top_k < valid_count[i]),
                top_k, valid_count[i])
            with ib.for_range(0, nkeep) as j:
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + j * box_data_length + k)] = \
                        data[(base_idx + sorted_index[i * num_anchors + j]
                              * box_data_length + k)]
                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
                with ib.for_range(0, valid_count[i] - nkeep) as j:
                    with ib.if_scope(k < box_data_length):
                        out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
            # Apply nms
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(k < valid_count[i]):
                        offset_k = k * box_data_length
                        with ib.if_scope(tvm.all(k > j, out[base_idx + offset_k] >= 0,
                                                 tvm.any(force_suppress > 0, id_index < 0,
                                                         out[base_idx + offset_j] ==
                                                         out[base_idx + offset_k]))):
                            iou = calculate_overlap(out, base_idx + offset_k + coord_start,
                                                    base_idx + offset_j + coord_start)
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_k] = -1.0
                                box_indices[i * num_anchors + k] = -1
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
        with ib.else_scope():
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                box_indices[i * num_anchors + j] = j
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - valid_count[i]) as j:
            with ib.if_scope(k < box_data_length):
                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.if_scope(k < box_data_length):
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()
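# Illustrative only: the effect of max_output_size in the kernel above, in
# plain Python.  Once max_output_size surviving boxes have been counted,
# every later surviving box is invalidated too.  `cap_valid_boxes` is a
# made-up helper name.
def cap_valid_boxes(scores, max_output_size):
    kept = 0
    for j, s in enumerate(scores):
        if s >= 0:
            if kept == max_output_size:
                scores[j] = -1.0
            else:
                kept += 1
    return scores

# cap_valid_boxes([0.9, 0.8, -1.0, 0.7, 0.6], 2) -> [0.9, 0.8, -1.0, -1.0, -1.0]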
in_channel = 256
out_channel = 512
in_size = 14
kernel = 3
pad = 1
stride = 1

# Algorithm
A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2*pad) // stride + 1
# Pad input
Apad = tvm.compute(
    (in_size + 2*pad, in_size + 2*pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.select(
        tvm.all(yy >= pad, yy - pad < in_size,
                xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
    name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
ry = tvm.reduce_axis((0, kernel), name='ry')
rx = tvm.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = tvm.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: tvm.sum(
        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
        axis=[ry, rx, rc]),
    name='B')
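# Illustrative only: a slow NumPy reference for the convolution defined above,
# handy for checking a compiled schedule.  Layouts follow the placeholders:
# A is HWCN, W is HWIO, and the result is HWCN.  `conv_reference` is a
# made-up helper name.
import numpy as np

def conv_reference(a_np, w_np, pad, stride):
    in_size, _, in_channel, batch = a_np.shape
    kernel, _, _, out_channel = w_np.shape
    out_size = (in_size - kernel + 2 * pad) // stride + 1
    apad = np.zeros((in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
                    dtype=a_np.dtype)
    apad[pad:pad + in_size, pad:pad + in_size] = a_np
    b_np = np.zeros((out_size, out_size, out_channel, batch), dtype=a_np.dtype)
    for yy in range(out_size):
        for xx in range(out_size):
            patch = apad[yy * stride:yy * stride + kernel,
                         xx * stride:xx * stride + kernel]   # (kernel, kernel, c, n)
            # reduce over kernel height, kernel width and input channels
            b_np[yy, xx] = np.einsum('ijcn,ijcf->fn', patch, w_np)
    return b_np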