# Copy Buffers
# ~~~~~~~~~~~~
# One specificity of hardware accelerators is that on-chip memory has to be
# explicitly managed.
# This means that we'll need to describe intermediate tensors :code:`A_buf`
# and :code:`B_buf` that can have a different memory scope than the original
# placeholder tensors :code:`A` and :code:`B`.
#
# Later in the scheduling phase, we can tell the compiler that :code:`A_buf`
# and :code:`B_buf` will live in the VTA's on-chip buffers (SRAM), while
# :code:`A` and :code:`B` will live in main memory (DRAM).
# We describe A_buf and B_buf as the result of a compute
# operation that is the identity function.
# This can later be interpreted by the compiler as a cached read operation.

# A copy buffer
A_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A(*i), "A_buf")
# B copy buffer
B_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf")

######################################################################
# Vector Addition
# ~~~~~~~~~~~~~~~
# Now we're ready to describe the vector addition result tensor :code:`C`,
# with another compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.

# Describe the in-VTA vector addition
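# The declaration below is a sketch of the elided tutorial code, following the
# pattern used by the VTA tests later in this file: the addition is performed in
# the accumulator data type, then cast back to :code:`env.inp_dtype` before being
# copied back to DRAM. The names :code:`C_buf` and :code:`C` are illustrative
# assumptions rather than a verbatim copy of the original tutorial.
C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype),
    name="C_buf",
)
# Cast to the narrow output type for the copy back to main memory (DRAM)
C = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: C_buf(*i).astype(env.inp_dtype),
    name="C",
)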
def test_buffer_broadcast_expr():
    n0, m0, x = te.size_var('n0'), te.size_var('m0'), te.size_var('x')
    n1, m1 = te.size_var('n1'), te.size_var('m1')
    o0, o1 = te.size_var('o0'), te.size_var('o1')
    A = te.placeholder((m0, n0), name='A')
    B = te.placeholder((m1, n1), name='B')
    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name='C')
    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check_stride():
        if not tvm.runtime.enabled("llvm"):
            return
        fadd = tvm.build(s, [A, B, C, o1, x], target='llvm', name='bcast_add',
                         binds={A: Ab, B: Bb, C: Cc})
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_no_stride():
        if not tvm.runtime.enabled("llvm"):
            return
        fadd = tvm.build(s, [A, B, C, o1, x], target='llvm', name='bcast_add',
                         binds={A: Ab, B: Bb, C: Cc})
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_auto_bind():
        if not tvm.runtime.enabled("llvm"):
            return
        # Let build bind buffers
        fadd = tvm.build(s, [A, B, C, o1, x], target='llvm', name='bcast_add')
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_stride()
    check_no_stride()
    check_auto_bind()
def gemm_int8(n, m, l): A = te.placeholder((n, l), name="A", dtype="int8") B = te.placeholder((m, l), name="B", dtype="int8") k = te.reduce_axis((0, l), name="k") C = te.compute( (n, m), lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), name="C", ) cfg = autotvm.get_config() s = te.create_schedule(C.op) y, x = C.op.axis AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") k = CC.op.reduce_axis[0] cfg.define_split( "tile_k", cfg.axis(k), num_outputs=3, filter=lambda entity: entity.size[2] == 4 and entity.size[0] * 2 >= entity.size[1], ) ko, kt, ki = cfg["tile_k"].apply(s, CC, k) s[CC].tensorize(ki, intrin_dp4a) block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") def block_size_filter(entity): return (entity.size[0] * 2 >= entity.size[1] * 2 and entity.size[1] <= 16 and entity.size[3] <= 4) cfg.define_split("tile_y", cfg.axis(y), num_outputs=4, filter=block_size_filter) cfg.define_split("tile_x", cfg.axis(x), num_outputs=4, filter=block_size_filter) by, tyz, ty, yi = cfg["tile_y"].apply(s, C, y) bx, txz, tx, xi = cfg["tile_x"].apply(s, C, x) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].bind(tyz, te.thread_axis("vthread")) s[C].bind(txz, te.thread_axis("vthread")) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis s[CC].reorder(ko, kt, yo, xo, ki) s[CC].unroll(kt) for stage in [AL, BL]: s[stage].compute_at(s[CC], kt) _, xi = s[stage].split(stage.op.axis[1], factor=4) s[stage].vectorize(xi) s[stage].double_buffer() cfg.define_knob("storage_align", [16, 48]) for stage in [AA, BB]: s[stage].storage_align(s[stage].op.axis[0], cfg["storage_align"].val, 0) s[stage].compute_at(s[CC], ko) fused = s[stage].fuse(*s[stage].op.axis) ty, tx = s[stage].split(fused, nparts=cfg["tile_y"].size[2]) tx, xi = s[stage].split(tx, nparts=cfg["tile_x"].size[2]) _, xi = s[stage].split(xi, factor=16) s[stage].bind(ty, thread_y) s[stage].bind(tx, thread_x) s[stage].vectorize(xi) cfg.define_knob("auto_unroll_max_step", [512, 1500]) s[C].pragma(by, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[C].pragma(by, "unroll_explicit", False) cfg.add_flop(n * m * l * 2) return s, [A, B, C]
def schedule_hwnc_tensorcore_cuda(cfg, s, Conv): """Schedule tensorcore template""" packed_data, packed_kernel = s[Conv].op.input_tensors ic, kh, kw, ii = s[Conv].op.reduce_axis pad_data = s[packed_data].op.input_tensors[0] block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") # Designate the memory hierarchy AS = s.cache_read(packed_data, "shared", [Conv]) WS = s.cache_read(packed_kernel, "shared", [Conv]) AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) ConvF = s.cache_write(Conv, "wmma.accumulator") if Conv.op in s.outputs: output = Conv ConvS = s.cache_read(ConvF, "shared", [Conv]) OL = ConvS else: output = s.outputs[0].output(0) s[Conv].set_scope("shared") OL = Conv out_dtype = Conv.dtype if isinstance( packed_kernel.op, te.tensor.ComputeOp) and packed_kernel.name == "packed_kernel": if autotvm.GLOBAL_SCOPE.in_tuning: s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") else: with Target("cuda"): schedule_injective_from_existing(s, packed_kernel) if isinstance(pad_data.op, te.tensor.ComputeOp) and "pad" in pad_data.op.tag: s[pad_data].compute_inline() data = pad_data.op.input_tensors[0] if autotvm.GLOBAL_SCOPE.in_tuning: # skip this part during tuning to make recrods accurate # this part will be pre-computed during NNVM's pre-compute optimization pass s[pad_data].pragma(s[pad_data].op.axis[0], "debug_skip_region") else: data = pad_data s[data].compute_inline() data_dtype = data.dtype kernel_dtype = packed_kernel.dtype # Schedule for autotvm cfg.define_knob("block_row_warps", [1, 2, 4]) cfg.define_knob("block_col_warps", [1, 2, 4]) cfg.define_knob("warp_row_tiles", [1, 2, 4, 8, 16]) cfg.define_knob("warp_col_tiles", [1, 2, 4, 8, 16]) cfg.define_knob("chunk", [1, 2, 4, 8]) cfg.define_knob("fuse_pack", [0, 1]) cfg.define_knob("split_block_k_nums", [1, 2, 4, 8, 16, 32]) cfg.define_knob("vector_ws", [1, 8]) cfg.define_knob("vector_as", [1, 8, 16]) block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val warp_col_tiles = cfg["warp_col_tiles"].val chunk = cfg["chunk"].val vector_as = cfg["vector_as"].val vector_ws = cfg["vector_ws"].val split_block_k_nums = cfg["split_block_k_nums"].val fuse_pack = cfg["fuse_pack"].val if not fuse_pack: s[packed_data].compute_inline() else: with Target("cuda"): schedule_injective_from_existing(s, packed_data) if data_dtype in ["int4", "uint4"]: wmma_m = wmma_n = 8 wmma_k = 32 else: wmma_m = 8 wmma_n = 32 wmma_k = 16 warp_size = 32 # Schedule for output if len(s[output].op.axis) == 4: ( hc, wc, nc, oc, ) = output.op.axis nc, nnc = s[output].split(nc, factor=wmma_m) oc, ooc = s[output].split(oc, factor=wmma_n) else: hc, wc, nc, oc, nnc, ooc = output.op.axis kernel_scope, hc = s[output].split(hc, nparts=1) block_k = s[output].fuse(hc, wc) block_k, split_block_k = s[output].split(block_k, factor=split_block_k_nums) nc, nci = s[output].split(nc, factor=warp_row_tiles) block_i, nc = s[output].split(nc, factor=block_row_warps) oc, oci = s[output].split(oc, factor=warp_col_tiles) block_j, oc = s[output].split(oc, factor=block_col_warps) s[output].reorder(block_k, split_block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc) t = s[output].fuse(nnc, ooc) _, tx = s[output].split(t, factor=warp_size) s[output].bind(block_k, block_z) 
s[output].bind(block_i, block_x) s[output].bind(block_j, block_y) s[output].bind(tx, thread_x) s[output].bind(nc, thread_y) s[output].bind(oc, thread_z) # Schedule wmma store s[OL].compute_at(s[output], block_j) hc, wc, nc, oc, nnc, ooc = OL.op.axis oc, oci = s[OL].split(oc, factor=warp_col_tiles) _, oc = s[OL].split(oc, factor=block_col_warps) nc, nci = s[OL].split(nc, factor=warp_row_tiles) _, nc = s[OL].split(nc, factor=block_row_warps) s[OL].reorder(nc, oc, nci, oci, nnc, ooc) s[OL].bind(nc, thread_y) s[OL].bind(oc, thread_z) # Schedule local computation s[ConvF].compute_at(s[OL], oc) _, _, n, o, nnf, oof = ConvF.op.axis ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) cfg.define_reorder("reorder_inner", [ko, kh], policy="all") cfg["reorder_inner"].apply(s, ConvF, [ko, kh]) cfg["reorder_inner"].apply(s, ConvF, [ki, kw]) cfg.define_knob("compute_at_AS", [0, 1, 2, 3]) cfg.define_knob("compute_at_WS", [0, 1, 2, 3]) compute_at_AS = cfg["compute_at_AS"].val compute_at_WS = cfg["compute_at_WS"].val # Move intermediate computation into each output compute tile s[AF].compute_at(s[ConvF], kw) s[WF].compute_at(s[ConvF], kw) # Schedule for A's share memory if compute_at_AS == 0: s[AS].compute_at(s[ConvF], ki) elif compute_at_AS == 1: s[AS].compute_at(s[ConvF], kw) elif compute_at_AS == 2: s[AS].compute_at(s[ConvF], ko) else: s[AS].compute_at(s[ConvF], kh) _, _, n, _, nn, ii = AS.op.axis tx, xo = s[AS].split(n, nparts=block_row_warps) ty, _ = s[AS].split(xo, nparts=block_col_warps) t = s[AS].fuse(nn, ii) to, ti = s[AS].split(t, nparts=warp_size) ti, _t = s[AS].split(ti, factor=vector_as) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(to, thread_x) s[AS].vectorize(_t) # Schedule for W's share memory if compute_at_WS == 0: s[WS].compute_at(s[ConvF], ki) elif compute_at_WS == 1: s[WS].compute_at(s[ConvF], kw) elif compute_at_WS == 2: s[WS].compute_at(s[ConvF], ko) else: s[WS].compute_at(s[ConvF], kh) s[WS].compute_at(s[ConvF], kw) kh, kw, ic, o, ii, oo = WS.op.axis tx, xo = s[WS].split(o, nparts=block_row_warps) ty, _ = s[WS].split(xo, nparts=block_col_warps) t = s[WS].fuse(ii, oo) to, ti = s[WS].split(t, nparts=warp_size) ti, _t = s[WS].split(ti, factor=vector_ws) s[WS].bind(tx, thread_y) s[WS].bind(ty, thread_z) s[WS].bind(to, thread_x) s[WS].vectorize(ti) # double buffer cfg.define_knob("AS_double_buffer", [0, 1]) cfg.define_knob("WS_double_buffer", [0, 1]) if cfg["AS_double_buffer"].val: s[AS].double_buffer() if cfg["WS_double_buffer"].val: s[WS].double_buffer() # unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[output].pragma(kernel_scope, "unroll_explicit", False) shape = (wmma_m, wmma_n, wmma_k) AS_shape = (wmma_m, wmma_k) AL_shape = (wmma_m, wmma_k) WS_shape = (wmma_n, wmma_k) WL_shape = (wmma_n, wmma_k) CL_shape = (wmma_m, wmma_n) CS_shape = (wmma_m, wmma_n) AL_gemm = te.placeholder(AL_shape, name="A", dtype=data_dtype) WL_gemm = te.placeholder(WL_shape, name="B", dtype=kernel_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k") CL_compute = te.compute( CL_shape, lambda ii, jj: te.sum((AL_gemm[ii, k_gemm].astype("int32") * WL_gemm[ jj, k_gemm].astype("int32")), axis=k_gemm), name="C", ) AL_strides = [wmma_k, 1] AS_strides = [wmma_k, 1] WL_strides = [wmma_k, 1] WS_strides = [wmma_k, 1] CL_strides = [wmma_n, 1] CS_strides = [wmma_n, 1] s[AF].tensorize( AF.op.axis[-2], intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape, 
"row_major", AS_shape, AL_shape, data_dtype), ) s[WF].tensorize( WF.op.axis[-2], intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape, "col_major", WS_shape, WL_shape, kernel_dtype), ) s[OL].tensorize( nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)) s[ConvF].tensorize( nnf, intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape), ) return s
def _intrin_popcount(m, k_i, w_b, x_b, unipolar): pack_dtype = "uint8" w = te.placeholder((w_b, m, k_i), dtype=pack_dtype, name="w") x = te.placeholder( ( x_b, k_i, ), dtype=pack_dtype, name="x", ) k = te.reduce_axis((0, k_i), name="k") bw = te.reduce_axis((0, w_b), name="bw") bx = te.reduce_axis((0, x_b), name="bx") if unipolar: dtype = "int16" z = te.compute( (m, ), lambda i: te.sum( (tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype( dtype)) - tvm.tir.popcount(~w[bw, i, k].astype(dtype) & x[ bx, k].astype(dtype))) << (bw + bx).astype(dtype), axis=[bw, bx, k], ), name="z", ) else: dtype = "uint16" z = te.compute( (m, ), lambda i: te.sum( tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype( dtype)) << (bw + bx).astype(dtype), axis=[bw, bx, k], ), name="z", ) Wb = tvm.tir.decl_buffer(w.shape, w.dtype, name="W", offset_factor=k_i, strides=[te.var("ldw"), te.var("ldw"), 1]) # stride can be inferred Xb = tvm.tir.decl_buffer(x.shape, x.dtype, name="X", offset_factor=k_i, strides=[te.var("ldw"), 1]) Zb = tvm.tir.decl_buffer(z.shape, z.dtype, name="Z", offset_factor=1, strides=[1]) def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] args_2 = tvm.tir.const(2, "uint32") if unipolar: vpadd = "llvm.arm.neon.vpadd.v8i8" vpadalu = "llvm.arm.neon.vpadals.v16i8.v8i16" full_dtype = "int8x16" half_dtype = "int8x8" return_dtype = "int16x8" else: vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" full_dtype = "uint8x16" half_dtype = "uint8x8" return_dtype = "uint16x8" def _instr(index): irb = tvm.tir.ir_builder.create() if index == 1: # reduce reset irb.emit(zz.vstore(0, tvm.tir.const(0, return_dtype))) return irb.get() # body and reduce update cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): w_ = ww.vload([bw, i, 0], "uint8x16").astype(full_dtype) x_ = xx.vload([bx, 0], "uint8x16").astype(full_dtype) if unipolar: cnts = tvm.tir.popcount( w_ & x_) - tvm.tir.popcount(~w_ & x_) else: cnts = tvm.tir.popcount(w_ & x_) upper_half = tvm.tir.call_intrin( half_dtype, "tir.vectorhigh", cnts) lower_half = tvm.tir.call_intrin( half_dtype, "tir.vectorlow", cnts) cnts8[i] = upper_half + lower_half for i in range(m // 2): cnts4[i] = tvm.tir.call_llvm_pure_intrin( half_dtype, vpadd, args_2, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.tir.call_llvm_pure_intrin( half_dtype, vpadd, args_2, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.tir.call_intrin(full_dtype, "tir.vectorcombine", cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.tir.const( bw + bx, pack_dtype) out = tvm.tir.call_llvm_pure_intrin( return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts) else: # ki == 8 for i in range(m): w_ = ww.vload([bw, i, 0], "uint8x8").astype(half_dtype) x_ = xx.vload([bx, 0], "uint8x8").astype(half_dtype) if unipolar: cnts8[i] = tvm.tir.popcount( w_ & x_) - tvm.tir.popcount(~w_ & x_) else: cnts8[i] = tvm.tir.popcount(w_ & x_) for i in range(m // 2): cnts4[i] = tvm.tir.call_llvm_pure_intrin( half_dtype, vpadd, args_2, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.tir.call_llvm_pure_intrin( half_dtype, vpadd, args_2, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.tir.call_intrin(full_dtype, "tir.vectorcombine", cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.tir.const( bw + bx, pack_dtype) out = tvm.tir.call_llvm_pure_intrin( return_dtype, vpadalu, args_2, zz.vload(0, return_dtype), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # 
body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin(z.op, _intrin_func, binds={ w: Wb, x: Xb, z: Zb }, default_buffer_params=buffer_params)
def test_gemm(): # graph nn = 2048 n = te.var("n") n = tvm.runtime.convert(nn) m, l = n, n A = te.placeholder((l, n), name="A") B = te.placeholder((l, m), name="B") k = te.reduce_axis((0, l), name="k") C = te.compute((m, n), lambda ii, jj: te.sum(A[k, jj] * B[k, ii], axis=k), name="C") # schedule s = te.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = te.thread_axis("blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") block_y = te.thread_axis("blockIdx.y") thread_y = te.thread_axis((0, num_thread), "threadIdx.y") thread_xz = te.thread_axis((0, 2), "vthread", name="vx") thread_yz = te.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_thread) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis ko, ki = s[CC].split(k, factor=8) kt, ki = s[CC].split(ki, factor=1) s[CC].reorder(ko, kt, ki, yo, xo) s[AA].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko) s[CC].unroll(kt) s[AL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt) # Schedule for A's shared memory load ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].vectorize(xi) # Schedule for B' shared memory load ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].vectorize(xi) s[AA].double_buffer() s[BB].double_buffer() # correctness def check_device(device): dev = tvm.device(device, 0) if not dev.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) f = tvm.build(s, [A, B, C], device) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) for i in range(2): f(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS)) for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]: with tvm.transform.PassContext( config={ "tir.UnrollLoop": { "auto_max_step": 128, "explicit_unroll": device != "cuda" } }): check_device(device)
def te_element_wise():
    A = te.placeholder((128, 128), name="A")
    B = te.compute((128, 128), lambda x, y: A[x, y] * 2, name="B")
    C = te.compute((128, 128), lambda x, y: B[x, y] + 1, name="C")
    return [A, C]
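# A usage sketch (not part of the original source): TE workloads written in this
# style are typically lowered to TensorIR with te.create_prim_func, which takes
# the argument list returned above. The helper name below is hypothetical, and
# this assumes a TVM build that ships te.create_prim_func and tir.Schedule.
def _show_element_wise_tir():
    import tvm
    from tvm import te

    prim_func = te.create_prim_func(te_element_wise())
    sch = tvm.tir.Schedule(prim_func)
    print(sch.mod.script())  # inspect the generated TensorIR module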
def check_padded_load(pad_before, pad_after, test_name=None):
    # declare
    n = 3
    m = 5
    x = te.placeholder((n, m, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype)
    x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
    # insert no-op that won't be optimized away
    y_buf = te.compute(
        (
            n + pad_before[0] + pad_after[0],
            m + pad_before[1] + pad_after[1],
            env.BATCH,
            env.BLOCK_OUT,
        ),
        lambda *i: x_buf(*i) >> 0,
        "y_buf",
    )
    y = te.compute(
        (
            n + pad_before[0] + pad_after[0],
            m + pad_before[1] + pad_after[1],
            env.BATCH,
            env.BLOCK_OUT,
        ),
        lambda *i: y_buf(*i).astype(env.inp_dtype),
        "y",
    )

    # schedule
    s = te.create_schedule(y.op)
    s[x_buf].set_scope(env.acc_scope)
    s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
    s[y_buf].set_scope(env.acc_scope)
    s[y_buf].pragma(y_buf.op.axis[0], env.alu)
    s[y].pragma(y.op.axis[0], env.dma_copy)

    # build
    with vta.build_config():
        mod = vta.build(s, [x, y], tvm.target.Target("ext_dev", host=env.target_host))
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("padded_load.o"))
    remote.upload(temp.relpath("padded_load.o"))
    f = remote.load_module("padded_load.o")

    # verify
    dev = remote.ext_dev(0)
    x_np = np.random.randint(0, 10, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
    y_np = np.zeros(
        (
            n + pad_before[0] + pad_after[0],
            m + pad_before[1] + pad_after[1],
            env.BATCH,
            env.BLOCK_OUT,
        )
    ).astype(y.dtype)
    y_np[pad_before[0]:pad_before[0] + n, pad_before[1]:pad_before[1] + m, :] = x_np
    x_nd = tvm.nd.array(x_np, dev)
    y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Padded {} load execution statistics:".format(test_name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr, data_layout, weight_layout, output_layout): if data_layout == 'hwc': (m, k) = get_const_tuple(data.shape) elif data_layout == 'chw': (k, m) = get_const_tuple(data.shape) if weight_layout == 'oi': (nnz, bs_o, bs_i) = get_const_tuple(weight_data.shape) elif weight_layout == 'io': (nnz, bs_i, bs_o) = get_const_tuple(weight_data.shape) (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape) num_blocks = num_blocks_plus_1 - 1 def _compute_block_hwc(i, nb_j, j): row_start = weight_indptr[nb_j] row_end = weight_indptr[nb_j + 1] row_elems = row_end - row_start elem_idx = te.reduce_axis((0, row_elems), name="elem_idx") block_offset = row_start + elem_idx c = te.reduce_axis((0, bs_i), name="c") block_j = weight_indices[block_offset] if weight_layout == 'oi': block_ij_val = weight_data[block_offset][j][c] elif weight_layout == 'io': block_ij_val = weight_data[block_offset][c][j] if data_layout == 'hwc': x_val = data[i, bs_i * block_j + c] elif data_layout == 'chw': x_val = data[bs_i * block_j + c, i] return te.sum(block_ij_val * x_val, axis=[elem_idx, c]) def _compute_block_chw(nb_j, j, i): row_start = weight_indptr[nb_j] row_end = weight_indptr[nb_j + 1] row_elems = row_end - row_start elem_idx = te.reduce_axis((0, row_elems), name="elem_idx") block_offset = row_start + elem_idx c = te.reduce_axis((0, bs_i), name="c") block_j = weight_indices[block_offset] if weight_layout == 'oi': block_ij_val = weight_data[block_offset][j][c] elif weight_layout == 'io': block_ij_val = weight_data[block_offset][c][j] if data_layout == 'hwc': x_val = data[i, bs_i * block_j + c] elif data_layout == 'chw': x_val = data[bs_i * block_j + c, i] return te.sum(block_ij_val * x_val, axis=[elem_idx, c]) idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod if output_layout == 'hwc': bsrmm_block = te.compute( (m, num_blocks, bs_o), _compute_block_hwc, tag="sparse_dense_v2_block_hwc", attrs={"FLOP": 2 * m * nnz * bs_o * bs_i}, ) return te.compute( (m, num_blocks * bs_o), lambda m, n: bsrmm_block[m, idxd(n, bs_o), idxm(n, bs_o)], tag="sparse_dense_v2_hwc", ) elif output_layout == 'chw': bsrmm_block = te.compute( (num_blocks, bs_o, m), _compute_block_chw, tag="sparse_dense_v2_block_chw", attrs={"FLOP": 2 * m * nnz * bs_o * bs_i}, ) return te.compute( (num_blocks * bs_o, m), lambda n, m: bsrmm_block[idxd(n, bs_o), idxm(n, bs_o), m], tag="sparse_dense_v2_chw", )
def _run(env, remote):
    m = 8
    n = 10
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    max_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf"
    )  # relu
    min_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
        "max_buf",
    )  # relu
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: min_buf(*i).astype(env.inp_dtype),
        "min_buf",
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[max_buf].set_scope(env.acc_scope)  # SRAM
    s[min_buf].set_scope(env.acc_scope)  # SRAM
    s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
    s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM

    # build
    with vta.build_config():
        mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host))
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Relu execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _run(env, remote):
    m = 2
    n = 8
    imm_shift = np.random.randint(0, 8)
    imm_scale = np.random.randint(1, 5)
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    res_shift = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i) + imm_shift, "res_shift"
    )  # compute
    res_scale = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i) >> imm_scale, "res_scale"
    )  # compute
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res"
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[res_shift].set_scope(env.acc_scope)  # SRAM
    s[res_scale].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
    s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM

    # build
    mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host))
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.right_shift((a_np + imm_shift), imm_scale)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Shift and scale execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
    """Test ALU"""
    m = 8
    n = 8
    imm = np.random.randint(1, 5)
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    if use_imm:
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf"
        )  # compute
    else:
        b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
        b_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf")  # DRAM->SRAM
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
            "res_buf",
        )  # compute
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: res_buf(*i).astype(env.inp_dtype),
        "res",
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_buf].set_scope(env.acc_scope)  # SRAM
    s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    if not use_imm:
        s[b_buf].set_scope(env.acc_scope)  # SRAM
        s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

    if not remote:
        return

    # build
    with vta.build_config():
        if use_imm:
            mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host))
        else:
            mod = vta.build(s, [a, b, res], tvm.target.Target("ext_dev", host=env.target_host))
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    if use_imm:
        res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
    else:
        b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
        res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    if use_imm:
        f(a_nd, res_nd)
    else:
        b_nd = tvm.nd.array(b_np, dev)
        f(a_nd, b_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("ALU {} execution statistics:".format(test_name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))
def _run(env, remote): # declare o = 4 n = 1 m = 4 x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") ko = te.reduce_axis((0, n), name="ko") ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") y_gem = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda bo, co, bi, ci: te.sum( x_buf[bo, ko, bi, ki].astype(env.acc_dtype) * w_buf[ co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki], ), name="y_gem", ) y_shf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i) >> 8, name="y_shf") y_max = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(y_shf(*i), 0), "y_max") # relu y_min = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), "y_min", ) # relu y = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y") if not remote: return def verify(s, name=None): # Build with the CSE pass disabled as otherwise it would complicate the test with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): mod = vta.build( s, [x, w, y], tvm.target.Target("ext_dev", host=env.target_host)) temp = utils.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify dev = remote.ext_dev(0) x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype) w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype) y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) x_nd = tvm.nd.array(x_np, dev) w_nd = tvm.nd.array(w_np, dev) y_nd = tvm.nd.array(y_np, dev) y_np = y_np.astype(env.acc_dtype) for b in range(o): for i in range(m): for j in range(n): y_np[b, i, :] += np.dot( x_np[b, j, :].astype(env.acc_dtype), w_np[i, j].T.astype(env.acc_dtype)) y_np = np.right_shift(y_np, 8) y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() f(x_nd, w_nd, y_nd) np.testing.assert_equal(y_np, y_nd.numpy()) if env.TARGET in ["sim", "tsim"]: sim_stats = simulator.stats() print("GEMM schedule:{} execution statistics:".format(name)) for k, v in sim_stats.items(): print("\t{:<16}: {:>16}".format(k, v)) def test_schedule1(): # default schedule with no smt s = te.create_schedule(y.op) # set the scope of the SRAM buffers s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) # set pragmas for DMA transfer and ALU ops s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[y].pragma(s[y].op.axis[0], env.dma_copy) # tensorization s[y_gem].reorder( ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki, ) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) verify(s, name="default") def test_smt(): # test smt schedule s = te.create_schedule(y.op) s[x_buf].set_scope(env.inp_scope) 
s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) abo, aco, abi, aci = s[y].op.axis abo1, abo2 = s[y].split(abo, nparts=2) s[y].bind(abo1, te.thread_axis("cthread")) s[y_gem].compute_at(s[y], abo1) s[y_shf].compute_at(s[y], abo1) s[y_max].compute_at(s[y], abo1) s[y_min].compute_at(s[y], abo1) s[y_gem].reorder( ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki, ) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y].pragma(abo2, env.dma_copy) verify(s, name="smt") test_schedule1() test_smt()
def _schedule_dense_tensorcore(cfg, s, C): """Schedule dense operator using Tensorcore""" A, B = s[C].op.input_tensors batch, out_dim = get_const_tuple(C.shape) out_dtype = C.dtype s[A].compute_inline() s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) BS = s.cache_read(B, "shared", [C]) AF = s.cache_read(AS, "wmma.matrix_a", [C]) BF = s.cache_read(BS, "wmma.matrix_b", [C]) CF = s.cache_write(C, "wmma.accumulator") CS = s.cache_read(CF, "shared", [C]) # fallback support target = tvm.target.Target.current() if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log(target.kind.name, target.model, "dense_tensorcore.cuda") cfg.fallback_with_reference_log(ref_log) # Deal with op fusion, such as bias and relu if C.op not in s.outputs: s[C].compute_inline() C = s.outputs[0].output(0) # create tuning space cfg.define_knob("block_row_warps", [1, 2, 4]) cfg.define_knob("block_col_warps", [1, 2, 4]) cfg.define_knob("warp_row_tiles", [1, 2, 4]) cfg.define_knob("warp_col_tiles", [1, 2, 4]) cfg.define_knob("chunk", [1, 2, 4, 8]) cfg.define_knob("offset", [0, 8]) cfg.define_knob("offsetCS", [0, 8]) cfg.define_knob("vec", [1, 2, 4, 8]) # Ensure that the default parameters are applicable when autotvm is not in use if batch % 32 == 0 and out_dim % 8 == 0: cfg.define_knob("wmma_m", [32, 16, 8]) elif batch % 16 == 0 and out_dim % 16 == 0: cfg.define_knob("wmma_m", [16, 8, 32]) elif batch % 8 == 0 and out_dim % 32 == 0: cfg.define_knob("wmma_m", [8, 16, 32]) warp_size = 32 wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val warp_col_tiles = cfg["warp_col_tiles"].val chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val if wmma_m == 16: wmma_n = 16 elif wmma_m == 8: wmma_n = 32 elif wmma_m == 32: wmma_n = 8 # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS AS_stride = [AS_align, 1] BS_stride = [BS_align, 1] AF_stride = [wmma_k, 1] BF_stride = [wmma_k, 1] CF_stride = [warp_col_tiles * wmma_n, 1] CS_stride = [CS_align, 1] block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") # Schedule for dense computation block_factor_b = wmma_m * warp_row_tiles * block_row_warps block_factor_o = wmma_n * warp_col_tiles * block_col_warps b, o = C.op.axis block_i, bc = s[C].split(b, factor=block_factor_b) block_j, oc = s[C].split(o, factor=block_factor_o) s[C].reorder(block_i, block_j, bc, oc) t = s[C].fuse(bc, oc) t, vi = s[C].split(t, factor=vec) t, tx = s[C].split(t, factor=warp_size) t, ty = s[C].split(t, factor=block_row_warps) t, tz = s[C].split(t, factor=block_col_warps) s[C].bind(block_i, block_x) s[C].bind(block_j, block_y) s[C].bind(tz, thread_z) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].vectorize(vi) # Schedule for wmma store s[CS].compute_at(s[C], block_j) bb, oo = CS.op.axis s[CS].storage_align(bb, CS_align - 1, CS_align) bb, bbi = s[CS].split(bb, factor=wmma_m) oo, ooi = s[CS].split(oo, factor=wmma_n) bb, bbii = s[CS].split(bb, factor=warp_row_tiles) oo, ooii = s[CS].split(oo, factor=warp_col_tiles) s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi) # Schedule for wmma computation s[CF].compute_at(s[CS], oo) warp_i, warp_j = CF.op.axis 
warp_i, _ii = s[CF].split(warp_i, factor=wmma_m) warp_j, _jj = s[CF].split(warp_j, factor=wmma_n) (k, ) = CF.op.reduce_axis k, _k = s[CF].split(k, factor=wmma_k) ko, ki = s[CF].split(k, factor=chunk) s[CF].reorder(ko, ki, warp_i, warp_j, _ii, _jj, _k) # Schedule for wmma_matrix_a load s[AF].compute_at(s[CF], ki) b, i = AF.op.axis b, b_ii = s[AF].split(b, factor=wmma_m) i, i_jj = s[AF].split(i, factor=wmma_k) s[AF].reorder(b, i, b_ii, i_jj) # Schedule for wmma_matrix_b load s[BF].compute_at(s[CF], ki) o, i = BF.op.axis o, o_ii = s[BF].split(o, factor=wmma_n) i, i_ii = s[BF].split(i, factor=wmma_k) s[BF].reorder(o, i, o_ii, i_ii) # Schedule for A's(B's) shared memory load def shared_shedule(stage, strides): s[stage].compute_at(s[CF], ko) xo, yo = stage.op.axis s[stage].storage_align(xo, strides - 1, strides) t = s[stage].fuse(xo, yo) t, vi = s[stage].split(t, factor=vec) t, tx = s[stage].split(t, factor=warp_size) t, ty = s[stage].split(t, factor=block_row_warps) _, tz = s[stage].split(t, factor=block_col_warps) s[stage].bind(ty, thread_y) s[stage].bind(tz, thread_z) s[stage].bind(tx, thread_x) s[stage].vectorize(vi) shared_shedule(AS, AS_align) shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) in_dtype = "float16" AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), lambda ii, jj: te.sum( AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype( out_dtype), axis=k_gemm, ), name="CL_compute", ) # lower the computation loops down to TensorCore hardware intrinsics # by mapping the dense tensorcore to tensor intrinsics s[AF].tensorize( b_ii, intrin_wmma_load_matrix_A(AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), "float16"), ) s[BF].tensorize( o_ii, intrin_wmma_load_matrix_W(BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), "float16"), ) s[CF].tensorize( _ii, intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape)) s[CS].tensorize( bbi, intrin_wmma_store_matrix(CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)), )
def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
    """ROI align operator in NCHW layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        4-D with shape [batch, channel, height, width]

    rois : tvm.te.Tensor
        2-D with shape [num_roi, 5]. The last dimension should be in format of
        [batch_index, w_start, h_start, w_end, h_end]

    pooled_size : int or list/tuple of two ints
        output size, or [out_height, out_width]

    spatial_scale : float
        Ratio of input feature map height (or width) to raw image height (or width).
        Equals the reciprocal of the total stride in the convolutional layers, and
        should be in the range (0.0, 1.0]

    sample_ratio : int
        Optional sampling ratio of ROI align, using adaptive size by default.

    Returns
    -------
    output : tvm.te.Tensor
        4-D with shape [num_roi, channel, pooled_size, pooled_size]
    """
    dtype = rois.dtype
    _, channel, height, width = get_const_tuple(data.shape)
    num_roi, _ = get_const_tuple(rois.shape)

    if isinstance(pooled_size, int):
        pooled_size_h = pooled_size_w = pooled_size
    else:
        pooled_size_h, pooled_size_w = pooled_size

    def _bilinear(i, c, y, x):
        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
        y = tvm.te.max(y, 0.0)
        x = tvm.te.max(x, 0.0)
        val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
        return tvm.tir.if_then_else(outside, 0.0, val)

    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype("int32")
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
        roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
        else:
            roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
            roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = te.reduce_axis((0, roi_bin_grid_h))
        rw = te.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        return te.sum(
            _bilinear(
                batch_index,
                c,
                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
            )
            / count,
            axis=[rh, rw],
        )

    return te.compute(
        (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw"
    )
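# A usage sketch (not part of the original source): build and run the ROI align
# compute above on CPU with a default schedule. The helper name, shapes, and ROI
# values are illustrative assumptions; get_const_tuple and bilinear_sample_nchw
# are assumed to be in scope, as in the module this operator comes from.
def _example_roi_align_nchw():
    import numpy as np
    import tvm
    from tvm import te

    data = te.placeholder((1, 4, 16, 16), name="data")
    rois = te.placeholder((2, 5), name="rois")
    out = roi_align_nchw(data, rois, pooled_size=7, spatial_scale=1.0, sample_ratio=2)
    s = te.create_schedule(out.op)
    f = tvm.build(s, [data, rois, out], target="llvm")

    dev = tvm.cpu(0)
    data_np = np.random.uniform(size=(1, 4, 16, 16)).astype("float32")
    # each ROI is [batch_index, w_start, h_start, w_end, h_end]
    rois_np = np.array([[0, 0, 0, 8, 8], [0, 4, 4, 15, 15]], dtype="float32")
    out_nd = tvm.nd.empty((2, 4, 7, 7), device=dev)
    f(tvm.nd.array(data_np, dev), tvm.nd.array(rois_np, dev), out_nd)
    return out_nd.numpy()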
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(N, tvm.tir.Any): N = tvm.te.size_var("n") if not isinstance(IH, int) or not isinstance(IW, int): raise RuntimeError("ARM winograd conv2d doesn't support dynamic input height or width.") if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) K = CO C = CI H = (IH + pt + pb - 3) // HSTR + 1 W = (IW + pl + pr - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW # TODO(@kevinthesun): Support tuning/optimization for dynamic shape. tile_p = P if isinstance(N, int) else nH * nW cfg.define_split("tile_p", cfg.axis(tile_p), num_outputs=2, filter=lambda x: x.size[-1] <= 16) cfg.define_split("tile_k", cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16) VP = cfg["tile_p"].size[-1] VK = cfg["tile_k"].size[-1] # pack input tile input_tile = te.compute( (C, idxd(P, VP), alpha, alpha, VP), lambda c, b, eps, nu, bb: data_pad[ idxd(b * VP + bb, nH * nW), c, idxm(idxd(b * VP + bb, nW), nH) * m + eps, idxm(b * VP + bb, nW) * m + nu, ], name="d", ) if autotvm.GLOBAL_SCOPE.in_tuning: VC = cfg["tile_k"].size[-1] kvshape = (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC) U = tvm.te.placeholder(kvshape, kernel.dtype, name="U") else: # transform kernel if pre_computed: U = kernel else: r_kh = te.reduce_axis((0, KH), "r_kh") r_kw = te.reduce_axis((0, KW), "r_kw") U = te.compute( (alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk: te.sum( kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw], ), name="U", ) # transform image r_eps = te.reduce_axis((0, alpha), "r_eps") r_nu = te.reduce_axis((0, alpha), "r_nu") V = te.compute( (alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb: te.sum( input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu], ), name="V", ) # batch gemm c = te.reduce_axis((0, C), name="c") M = te.compute( (alpha, alpha, K, P), lambda eps, nu, k, b: te.sum( U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c, ), name="M", ) # inverse transform r_eps = te.reduce_axis((0, alpha), "r_eps") r_nu = te.reduce_axis((0, alpha), "r_nu") Y = te.compute( (K, P, m, m), lambda k, b, vh, vw: te.sum( M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu] ), name="Y", ) # unpack output output = te.compute( (N, K, H, W), lambda n, k, h, w: Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), idxm(h, m), idxm(w, m)], name="output", tag="winograd_conv2d_output", ) # we have to manually assign effective GFLOP for winograd if 
isinstance(N, int): cfg.add_flop(2 * N * K * H * W * KH * KW * C) return output
def conv2d_compute( ifm: te.Tensor, weight: te.Tensor, scale_bias: te.Tensor, lut: te.Tensor, ifm_scale: float, ifm_zero_point: int, weight_zero_point: int, ofm_scale: float, ofm_zero_point: int, strides: Tuple[int, int], padding: Tuple[int, int, int, int], dilation: Union[Tuple[int, int], List[int]], activation: str, clip_min: int, clip_max: int, rounding_mode: str, upscale: str, ifm_layout: str, ofm_layout: str, ) -> te.Tensor: """A compute operator representing the capabilities of a 2D convolution for the NPU. Parameters ---------- ifm : te.Tensor The Input Feature Map tensor (IFM). weight : te.Tensor The weight tensor. scale_bias : te.Tensor The packed per-channel weight scale and bias tensor. lut : te.Tensor The look-up table of values to use if activation = "LUT". ifm_scale : float The quantization scale for the Input Feature Map tensor. ifm_zero_point : int The quantization zero point for the Input Feature Map tensor. weight_zero_point : int The quantization zero point for the weight tensor. ofm_scale : float The quantization scale for the Output Feature Map tensor. ofm_zero_point : int The quantization zero point for the Output Feature Map tensor. strides : tuple The 2 dimensional strides as (stride_height, stride_width). padding : tuple The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). dilation : Union[Tuple[int, int], List[int]] The 2 dimensional dilation as (dilation_height, dilation_width). activation : str The activation function to use. "NONE" - no activation function. "CLIP" - clip the output between clip_min and clip_max. "TANH" - tanh activation function. "SIGMOID" - sigmoid activation function. "LUT" - use a look-up table to perform the activation function. clip_min : int The minimum clipping value if activation = "CLIP". clip_max : int The maximum clipping value if activation = "CLIP". rounding_mode : str The rounding mode to apply to the Output Feature Map tensor. "TFL" - Tensorflow Lite rounding scheme. "TRUNCATE" - Truncate towards zero. "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity. upscale : str The 2x2 upscaling mode to apply to the Input Feature Map tensor. "NONE" - no upscaling. "NEAREST" - upscale using nearest neighbour. "ZEROS" - upscale using zeros. "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity. ifm_layout : str The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16". ofm_layout : str The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16". Returns ------- te.Tensor The OFM tensor. 
""" assert ifm.shape[0] == 1 assert ifm_layout in {"NHWC", "NHCWB16"} assert ofm_layout in {"NHWC", "NHCWB16"} stride_h, stride_w = strides dilation_h, dilation_w = dilation ofm_channels, kernel_h, kernel_w, ifm_channels = weight.shape # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale, weight.shape[3], padding) # 2D Convolution compute operation dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 ofm_height = (dmaed_ifm.shape[1] - dilated_kernel_h) // stride_h + 1 ofm_width = (dmaed_ifm.shape[2] - dilated_kernel_w) // stride_w + 1 rc = te.reduce_axis((0, ifm_channels), name="rc") rh = te.reduce_axis((0, kernel_h), name="ry") rw = te.reduce_axis((0, kernel_w), name="rx") conv2d_attrs = { "op": "ethosu_conv2d", "weight_zero_point": weight_zero_point, "activation": activation, "upscale": upscale, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, "stride_h": stride_h, "stride_w": stride_w, "dilation_h": dilation_h, "dilation_w": dilation_w, } # This is a trick to insert the LUT tensor into the TE graph if LUT is present lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if activation in ("TANH", "LUT") else 0 # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT if activation in ("TANH", "LUT"): conv2d_attrs["lut"] = lut conv = te.compute( (1, ofm_height, ofm_width, ofm_channels), lambda nn, hh, ww, cc: te.sum( dmaed_ifm(nn, hh * stride_h + rh * dilation_h, ww * stride_w + rw * dilation_w, rc).astype(ifm.dtype) * weight[ cc, rh, rw, rc].astype(ifm.dtype) # This is a trick to load 10 elements of the scale_bias at once, not accurate maths + (scale_bias[cc, 0] * scale_bias[cc, 9] + lut_expr).astype(ifm. dtype), axis=[rh, rw, rc], ), name="ethosu_conv2d", attrs=conv2d_attrs, ) # Compute operation for the OFM DMA pipeline return dma_ofm_compute(conv, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels)
def _conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype assert isinstance(dilation, int) or len(dilation) == 3 if isinstance(dilation, int): dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) else: dilation_d, dilation_h, dilation_w = dilation DSTR, HSTR, WSTR = strides batch_size, in_channel, in_depth, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_depth, kernel_height, kernel_width = get_const_tuple(kernel.shape) dilated_kernel_d = (kernel_depth - 1) * dilation_d + 1 dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d( padding, (dilated_kernel_d, dilated_kernel_h, dilated_kernel_w) ) pad_d = pad_front + pad_back pad_h = pad_top + pad_down pad_w = pad_left + pad_right pad_depth = in_depth + pad_d pad_height = in_height + pad_h pad_width = in_width + pad_w out_depth = simplify((in_depth + pad_d - dilated_kernel_d) // DSTR + 1) out_height = simplify((in_height + pad_h - dilated_kernel_h) // HSTR + 1) out_width = simplify((in_width + pad_w - dilated_kernel_w) // WSTR + 1) # pack data DOPAD = pad_d != 0 or pad_h != 0 or pad_w != 0 if DOPAD: data_pad = pad( data, (0, 0, pad_front, pad_top, pad_left), (0, 0, pad_back, pad_down, pad_right), name="data_pad", ) else: data_pad = data # fetch schedule ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] shape = (batch_size, in_channel // ic_bn, pad_depth, pad_height, ic_bn, pad_width) data_vec = te.compute( shape, lambda n, C, d, h, c, w: data_pad[n, C * ic_bn + c, d, h, w], name="data_vec" ) # pack kernel shape = ( num_filter // oc_bn, in_channel // ic_bn, kernel_depth, kernel_height, kernel_width, ic_bn, oc_bn, ) kernel_vec = te.compute( shape, lambda CO, CI, d, h, w, ci, co: kernel[CO * oc_bn + co, CI * ic_bn + ci, d, h, w], name="kernel_vec", ) # convolution oshape = (batch_size, num_filter // oc_bn, out_depth, out_height, out_width, oc_bn) unpack_shape = (batch_size, num_filter, out_depth, out_height, out_width) ic = te.reduce_axis((0, in_channel), name="ic") kh = te.reduce_axis((0, kernel_height), name="kh") kw = te.reduce_axis((0, kernel_width), name="kw") kd = te.reduce_axis((0, kernel_depth), name="kd") idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute( oshape, lambda n, oc_chunk, od, oh, ow, oc_block: te.sum( data_vec[ n, idxdiv(ic, ic_bn), od * DSTR + kd * dilation_d, oh * HSTR + kh * dilation_h, idxmod(ic, ic_bn), ow * WSTR + kw * dilation_w, ].astype(out_dtype) * kernel_vec[ oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, idxmod(ic, ic_bn), oc_block ].astype(out_dtype), axis=[ic, kd, kh, kw], ), name="conv", ) conv_unpacked = te.compute( unpack_shape, lambda n, c, d, h, w: conv[n, idxdiv(c, oc_bn), d, h, w, idxmod(c, oc_bn)].astype( out_dtype ), name="output_unpack", tag="conv3d_ncdhw", ) return conv_unpacked
def te_func():
    a = te.placeholder((), name="a", dtype="int32")
    b = te.placeholder((), name="b", dtype="int32")
    c = te.compute(a.shape, lambda *i: a(*i) + b(*i), name="c")
    return [a, b, c]
def conv1d_ncw(data, kernel, strides=1, padding='VALID', dilation=1, out_dtype=None):
    """1D convolution forward operator for NCW layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        3-D with shape [batch, in_channel, in_width]

    kernel : tvm.te.Tensor
        3-D with shape [num_filter, in_channel, filter_size]

    strides : int or tuple
        The spatial stride along width

    padding : int, tuple, or str
        Padding size can be an integer for equal padding,
        a tuple of (left, right) or a string in ['VALID', 'SAME'].

    dilation : int or tuple
        Dilation rate if convolution should be dilated.

    out_dtype : str
        The output data type. If None then output is same type as input.
    """
    if out_dtype is None:
        out_dtype = data.dtype
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch, in_channels, data_width = data.shape
    out_channels, _, kernel_size = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
    out_channels = simplify(out_channels)
    out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)

    # Apply padding
    pad_before = [0, 0, pad_left]
    pad_after = [0, 0, pad_right]
    temp = pad(data, pad_before, pad_after, name='pad_temp')

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name='rc')
    rw = te.reduce_axis((0, kernel_size), name='rw')

    return te.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: te.sum(
            temp[b, rc, w * strides + rw * dilation].astype(out_dtype)
            * kernel[c, rc, rw].astype(out_dtype),
            axis=[rc, rw],
        ),
        tag="conv1d_ncw",
    )
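# A usage sketch (not part of the original source): lower the conv1d compute above
# with a default schedule and check it against a naive NumPy loop. The helper name
# and sizes are illustrative; get_pad_tuple1d, simplify, and pad are assumed to be
# in scope, as in the module this operator comes from.
def _example_conv1d_ncw():
    import numpy as np
    import tvm
    from tvm import te

    batch, cin, width, cout, ksize = 1, 3, 16, 4, 3
    data = te.placeholder((batch, cin, width), name="data")
    kernel = te.placeholder((cout, cin, ksize), name="kernel")
    out = conv1d_ncw(data, kernel, strides=1, padding="VALID", dilation=1)
    s = te.create_schedule(out.op)
    f = tvm.build(s, [data, kernel, out], target="llvm")

    dev = tvm.cpu(0)
    d_np = np.random.uniform(size=(batch, cin, width)).astype("float32")
    k_np = np.random.uniform(size=(cout, cin, ksize)).astype("float32")
    # reference result: plain sliding-window dot product over channels and taps
    ref = np.zeros((batch, cout, width - ksize + 1), dtype="float32")
    for c in range(cout):
        for w in range(width - ksize + 1):
            ref[0, c, w] = np.sum(d_np[0, :, w:w + ksize] * k_np[c])

    out_nd = tvm.nd.empty(ref.shape, device=dev)
    f(tvm.nd.array(d_np, dev), tvm.nd.array(k_np, dev), out_nd)
    np.testing.assert_allclose(out_nd.numpy(), ref, rtol=1e-5)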
def get_template_op(**kwargs): if 'COMPUTE_V1' not in os.environ: raise Exception("Environment variable `COMPUTE_V1` is not set") program = os.environ['COMPUTE_V1'].strip() assert program.startswith('- '), "The computing expression doesn't start with proper prefix: - ..." global placeholders, output_saver placeholders, output_saver = {}, {"outputs": []} program = program[2:].strip() if program: exec('import tvm; from tvm import topi; ' + program, globals()) inputs = sorted(list(placeholders.values()), key=lambda x: x.name) outputs = sorted(output_saver["outputs"], key=lambda x: x.op.name) anno, options = program.find('## @'), [] if anno >= 0: program, options = program[:anno].strip(), program[program.index(':', anno) + 1:].strip().split('|') if len(outputs) > 1: def to_list(shape): return [int(d) for d in shape] for i in range(1, len(outputs)): assert to_list(outputs[0].shape) == to_list(outputs[i].shape), "Shape sizes for multiple outputs should be equal: %s v.s. %s" % (to_list(outputs[0].shape), to_list(outputs[i].shape)) outputs = te.compute(outputs[0].shape, lambda *X: [v[X] for v in outputs], name=intermediate_output) sch = te.create_schedule([outputs[i].op for i in range(len(outputs))]) def get_device_props(): props = tvm.runtime.ndarray.gpu(0) with open('%s/device_properties.cfg' % os.environ['ANTARES_DRIVER_PATH'], 'r') as fp: mem_bandwith = [] while True: line = fp.readline() if not line: break key, val = line.split(': ') if key in ('GlobalMemoryBusWidth', 'MemoryClockRate'): mem_bandwith.append(float(val)) mem_bandwith = 'inf' if not mem_bandwith else np.product(mem_bandwith) * 2.5e-7 props.mem_bandwith = float(mem_bandwith) return props if not hasattr(AntaresGlobal, 'auto_config'): AntaresGlobal.auto_config = AutoConfig() def _callback(explicit_ops): attrs = Mock() attrs.device_props = get_device_props() attrs.inputs = list(inputs) attrs.outputs = list(outputs) attrs.explicit_ops = explicit_ops attrs.scheduler = sch attrs.auto_config = AntaresGlobal.auto_config attrs.backend = backend attrs.ir = program attrs.options = options attrs.blend = '' attrs.get_extent = lambda axis: int(axis.dom.extent) def get_lower(): return str(tvm.lower(sch, attrs.inputs + attrs.outputs, simple_mode=True)).split('#[metadata]')[0] attrs.get_lower = get_lower AntaresGlobal.attrs = attrs do_native_scheduling(attrs) traverse_inline(sch, outputs[0].op, _callback) return sch, AntaresGlobal.attrs.inputs + AntaresGlobal.attrs.outputs
def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"): """ Compute group conv2d with NCHW layout, using GSPC algorithm. https://arxiv.org/abs/2006.09791 """ assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation else: dilation_h, dilation_w = dilation assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding elif len(padding) == 2: hpad, wpad = padding pad_top, pad_bottom = hpad, hpad pad_left, pad_right = wpad, wpad else: pad_top, pad_left, pad_bottom, pad_right = padding hpad = pad_top + pad_bottom wpad = pad_left + pad_right assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): stride_h, stride_w = strides, strides else: stride_h, stride_w = strides batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, kernel_depth, k_height, k_width = get_const_tuple( kernel.shape) pad_height = in_height + pad_top + pad_bottom pad_width = in_width + pad_left + pad_right dilated_kernel_h = (k_height - 1) * dilation_h + 1 dilated_kernel_w = (k_width - 1) * dilation_w + 1 out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 kernels_per_group = out_channel // groups cfg.define_split("tile_ic", in_channel, num_outputs=2) cfg.define_split("tile_oc", out_channel, num_outputs=2) cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. if cfg.is_fallback: _get_default_config( cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), te.placeholder( (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype), strides, padding, groups, out_dtype, ) oc_bn = cfg["tile_oc"].size[-1] ic_bn = cfg["tile_ic"].size[-1] # pack data DOPAD = hpad != 0 or wpad != 0 if DOPAD: data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad") else: data_pad = data shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) data_vec = te.compute( shape, lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], name="data_vec", ) # pack kernel shape = ( groups, kernels_per_group // oc_bn, kernel_depth // ic_bn, k_height, k_width, ic_bn, oc_bn, ) kernel_vec = te.compute( shape, lambda g, out_channel, in_channel, h, w, ci, co: kernel[ (out_channel * oc_bn + co + g * kernels_per_group ), in_channel * ic_bn + ci, h, w], name="kernel_vec", ) # convolution oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_channel, out_height, out_width) ic = te.reduce_axis((0, (kernel_depth)), name="ic") kh = te.reduce_axis((0, k_height), name="kh") kw = te.reduce_axis((0, k_width), name="kw") idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute( oshape, lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( data_vec[g, n, idxdiv(ic, ic_bn), oh * stride_h + kh * dilation_h, idxmod(ic, ic_bn), ow * stride_w + kw * dilation_w, ]. 
astype(out_dtype) * kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype( out_dtype), axis=[ic, kh, kw], ), name="conv", ) unpack = te.compute( unpack_shape, lambda n, c, h, w: conv[ idxdiv(c, kernels_per_group), n, idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), h, w, idxmod(idxmod(c, oc_bn), kernels_per_group), ].astype(out_dtype), name="output_unpack", tag="group_conv2d_nchw", ) return unpack
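# A quick, plain-Python sanity check (no TVM needed) that the packed channel index
# used by data_vec above -- g * kernel_depth + C * ic_bn + c -- visits every input
# channel exactly once. The sample sizes are arbitrary but keep in_channel divisible
# by groups and by groups * ic_bn, as the template assumes.
groups, in_channel, ic_bn = 4, 32, 4
kernel_depth = in_channel // groups  # channels per group
seen = sorted(
    g * kernel_depth + C * ic_bn + c
    for g in range(groups)
    for C in range(kernel_depth // ic_bn)
    for c in range(ic_bn)
)
assert seen == list(range(in_channel))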
def hwnc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtype="int32"): """Compute declaration for tensorcore""" assert isinstance(stride, int) or len(stride) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation in_dtype = Input.dtype if in_dtype in ["int4", "uint4"]: wmma_n = wmma_m = 8 wmma_k = 32 else: wmma_m = 8 wmma_n = 32 wmma_k = 16 pre_computed = len(Filter.shape) == 6 in_height, in_width, batch, in_channels = get_const_tuple(Input.shape) if pre_computed: kernel_h, kernel_w, oc_chunk, _, oc_block_factor, _ = get_const_tuple( Filter.shape) num_filter = oc_block_factor * oc_chunk else: kernel_h, kernel_w, num_filter, _ = get_const_tuple(Filter.shape) if in_dtype in ["int4", "uint4"]: assert batch % 8 == 0 and in_channels % 32 == 0 and num_filter % 8 == 0 else: assert batch % 8 == 0 and in_channels % 16 == 0 and num_filter % 32 == 0, ( "The shape of (batch, in_channels, num_filter) " "must be multiple of (8, 16, 32) for int8, " "and (8, 32, 8) for int4") # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_channels = num_filter out_height = simplify( (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) cfg.add_flop(2 * batch * out_height * out_width * out_channels * in_channels * kernel_h * kernel_w) # Input feature map: (H, W, N, IC, n, ic) data_shape = (in_height, in_width, batch // wmma_m, in_channels // wmma_k, wmma_m, wmma_k) # Kernel: (H, W, OC, IC, oc, ic) kernel_shape = ( kernel_h, kernel_w, out_channels // wmma_n, in_channels // wmma_k, wmma_n, wmma_k, ) # Reduction axes kh = te.reduce_axis((0, kernel_h), name="kh") kw = te.reduce_axis((0, kernel_w), name="kw") ic = te.reduce_axis((0, in_channels // wmma_k), name="ic") ii = te.reduce_axis((0, wmma_k), name="ii") if pre_computed: packed_kernel = Filter else: packed_kernel = te.compute( kernel_shape, lambda kh, kw, o, i, oo, ii: Filter[kh, kw, o * wmma_n + oo, i * wmma_k + ii], name="packed_kernel", ) packed_data = te.compute( data_shape, lambda h, w, n, i, nn, ii: Input[h, w, n * wmma_m + nn, i * wmma_k + ii]) pad_before = [pad_top, pad_left, 0, 0, 0, 0] pad_after = [pad_down, pad_right, 0, 0, 0, 0] pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") Conv = te.compute( (out_height, out_width, batch // wmma_m, out_channels // wmma_n, wmma_m, wmma_n), lambda h, w, n, o, nn, oo: te.sum( (pad_data[h * stride_h + kh, w * stride_w + kw, n, ic, nn, ii]. astype("int32") * packed_kernel[kh, kw, o, ic, oo, ii].astype( "int32")), axis=[ic, kh, kw, ii], ), name="Conv", tag="conv2d_HWNCnc_tensorcore", ) return Conv
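# A small, plain-arithmetic illustration of the HWNC packing above for the int8 path,
# with illustrative sizes: wmma (m, n, k) = (8, 32, 16), a (H, W, N, C) = (56, 56, 8, 64)
# feature map, and an HWOI kernel of shape (3, 3, 64, 64).
wmma_m, wmma_n, wmma_k = 8, 32, 16
in_height, in_width, batch, in_channels = 56, 56, 8, 64
out_channels, kernel_h, kernel_w = 64, 3, 3
data_shape = (in_height, in_width, batch // wmma_m, in_channels // wmma_k, wmma_m, wmma_k)
kernel_shape = (kernel_h, kernel_w, out_channels // wmma_n, in_channels // wmma_k, wmma_n, wmma_k)
assert data_shape == (56, 56, 1, 4, 8, 16)
assert kernel_shape == (3, 3, 2, 4, 32, 16)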
def conv2d_winograd_nhwc_auto_scheduler_test(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1): tile_size = 4 inputs = te.placeholder((N, H, W, CI), name="inputs") N, H, W, CI = get_const_tuple(inputs.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" KH = KW = kernel_size HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride assert HSTR == 1 and WSTR == 1 and KH == KW data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, "float32") H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW r_kh = te.reduce_axis((0, KH), name="r_kh") r_kw = te.reduce_axis((0, KW), name="r_kw") kshape = (alpha, alpha, CI, CO) kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") idxdiv = te.indexdiv idxmod = te.indexmod # pack input tile input_tile = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod( idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu][ci], name="input_tile", ) # transform data r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") data_pack = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name="data_pack", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"] }, ) # do batch gemm ci = te.reduce_axis((0, CI), name="ci") bgemm = te.compute( (alpha, alpha, P, CO), lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[ eps][nu][ci][co], axis=[ci]), name="bgemm", ) # inverse transform r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") inverse = te.compute( (m, m, P, CO), lambda vh, vw, p, co: te.sum( bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name="inverse", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"] }, ) # output output = te.compute( (N, H, W, CO), lambda n, h, w, co: inverse[idxmod(h, m), idxmod(w, m), n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), co], name="conv2d_winograd", ) return [inputs, kernel_pack, output]
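# A hedged sketch of how a workload function like the one above is typically consumed
# by the auto-scheduler: register it, then build a search task from concrete argument
# values. The shapes are illustrative, and this assumes the standard
# tvm.auto_scheduler API (register_workload / SearchTask) plus the module-level
# imports the workload relies on (topi, winograd_transform_matrices).
from tvm import auto_scheduler

auto_scheduler.register_workload(conv2d_winograd_nhwc_auto_scheduler_test)
task = auto_scheduler.SearchTask(
    func=conv2d_winograd_nhwc_auto_scheduler_test,
    args=(1, 28, 28, 128, 128),
    target="llvm",
)
print(task.compute_dag)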
def bitserial_conv2d_nhwc( cfg, data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype, out_dtype, unipolar, ): """ Compute convolution with pack on spatial axes. """ assert data.shape[ 0].value == 1, "spatial pack convolution only support batch size=1" assert pack_dtype == "uint8", "only support packing into uint8 bits" assert out_dtype == "int16", "only support output type of int16" N, H, W, CI = get_const_tuple(data.shape) if len(kernel.shape) == 4: KH, KW, _, CO = get_const_tuple(kernel.shape) CI_packed = CI // 8 else: KH, KW, KB, CI_packed, CO = get_const_tuple(kernel.shape) if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) else: TPAD, LPAD, DPAD, RPAD = padding if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride HCAT, WCAT = KH - 1, KW - 1 PAD_H = H + (TPAD + DPAD) PAD_W = W + (LPAD + RPAD) OH = (PAD_H - KH) // HSTR + 1 OW = (PAD_W - KW) // WSTR + 1 oshape = (1, OH, OW, CO) idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod # Pad input channels of weights and data when it is not a multiple of 8 if CI_packed % 8 != 0: CI_PAD = CI_packed % 8 CI_packed += CI_PAD else: CI_PAD = 0 # ==================== define configuration space ==================== n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) ci, kh, kw = cfg.reduce_axis(CI_packed), cfg.reduce_axis( KH), cfg.reduce_axis(KW) ib, kb = cfg.reduce_axis(activation_bits), cfg.reduce_axis(weight_bits) co, vc = cfg.define_split("tile_co", co, num_outputs=2, filter=lambda x: x.size[-1] == 8) oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2, filter=lambda x: x.size[-1] >= 2) ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda x: x.size[-1] >= 2) ci_o, ci_i = cfg.define_split( "tile_ci", ci, num_outputs=2, filter=lambda x: x.size[-1] == 8 or x.size[-1] == 16) re_axes = cfg.define_reorder( "reorder_0", [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i], policy="candidate", candidate=[ [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i], [n, oh, ow, co, vh, vw, kw, kh, ci_o, kb, ib, vc, ci_i], ], ) # binary ops cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) # ==================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type="uint8") kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC, len(kernel.shape) == 4) idxm = tvm.tir.indexmod if idxm(kernel_vec.shape[-1], 8) != 0 and CI_PAD != 0: kernel_vec = pad(kernel_vec, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, CI_PAD]) N, H, W, IB, CI = data_q.shape OCO, KH, KW, KB, VC, CI = kernel_vec.shape dvshape = ( N, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT, VW * WSTR + WCAT, IB, CI, ) ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) if TPAD != 0 and RPAD != 0: data_pad = pad(data_q, (0, TPAD, LPAD, 0, 0), (0, DPAD, RPAD, 0, CI_PAD), name="data_pad") elif CI_PAD != 0: data_pad = pad(data_q, (0, 0, 0, 0, 0), (0, 0, 0, 0, CI_PAD), name="data_pad") else: data_pad = data_q data_vec = te.compute( dvshape, lambda n, h, w, vh, vw, b, ci: data_pad[n][h * VH * HSTR + vh][ w * VW * WSTR + vw][b][ci], name="data_vec", ) ci = te.reduce_axis((0, CI), name="ci") dh = te.reduce_axis((0, KH), name="dh") dw = te.reduce_axis((0, KW), name="dw") ib = te.reduce_axis((0, IB), name="ib") kb = te.reduce_axis((0, KB), name="kb") def 
_bipolar_conv(n, h, w, co, vh, vw, vc): return te.sum( (tvm.tir.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype("uint16") & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci].astype("uint16")) << (kb + ib).astype("uint16")), axis=[dh, dw, kb, ib, ci], ) def _unipolar_conv(n, h, w, co, vh, vw, vc): return te.sum( ((tvm.tir.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype("int16") & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci].astype("int16")) - tvm. tir.popcount(~kernel_vec[co, dh, dw, kb, vc, ci].astype("int16") & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw, ib, ci]).astype("int16")) << (kb + ib).astype("int16")), axis=[dh, dw, kb, ib, ci], ) if unipolar: conv_vec = te.compute(ovshape, _unipolar_conv, name="conv_vec", tag="unipolar") else: conv_vec = te.compute(ovshape, _bipolar_conv, name="conv_vec", tag="bipolar") conv = te.compute( oshape, lambda n, h, w, co: conv_vec[n, idxd(h, VH), idxd(w, VW), idxd(co, VC), idxm(h, VH), idxm(w, VW), idxm(co, VC)].astype(out_dtype), name="conv", tag="spatial_bitserial_conv_nhwc", ) return conv
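# A plain-Python illustration of the two inner products used above, on a single bit
# plane of 8 packed channels (the values are arbitrary). "Bipolar" accumulates
# popcount(w & x); "unipolar" treats weight bits as +1/-1 by also subtracting
# popcount(~w & x). The per-plane shift by (kb + ib) in the compute above is omitted
# here because only one bit plane is shown.
w, x = 0b10110010, 0b11010110
popcount = lambda v: bin(v).count("1")
bipolar = popcount(w & x)
unipolar = popcount(w & x) - popcount(~w & x & 0xFF)
print(bipolar, unipolar)  # 3 1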
def test_rpc_remote_module(): if not tvm.runtime.enabled("rpc"): return # graph n = tvm.runtime.convert(102) A = te.placeholder((n, ), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") s = te.create_schedule(B.op) server0 = rpc.Server("localhost", key="x0") server1 = rpc.Server("localhost", key="x1") client = rpc.connect( server0.host, server0.port, key="x0", session_constructor_args=[ "rpc.Connect", server1.host, server1.port, "x1" ], ) def check_remote(remote): temp = util.tempdir() ctx = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Download the file from the remote path_tar = temp.relpath("dev_lib.tar") f.export_library(path_tar) remote.upload(path_tar) local_download_path = temp.relpath("dev_lib.download.so") with open(local_download_path, "wb") as fo: fo.write(remote.download_linked_module("dev_lib.tar")) fupdated = tvm.runtime.load_module(local_download_path) a = tvm.nd.array( np.random.uniform(size=102).astype(A.dtype), tvm.cpu(0)) b = tvm.nd.array(np.zeros(102, dtype=A.dtype), tvm.cpu(0)) fupdated(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_minrpc(): if tvm.get_global_func("rpc.CreatePipeClient", allow_missing=True) is None: return # export to minrpc temp = util.tempdir() f = tvm.build(s, [A, B], "llvm --system-lib", name="myadd") path_minrpc = temp.relpath("dev_lib.minrpc") f.export_library(path_minrpc, rpc.with_minrpc(cc.create_executable)) with pytest.raises(RuntimeError): rpc.PopenSession("filenotexist") # statrt the minrpc session. remote = tvm.rpc.PopenSession(path_minrpc) ctx = remote.cpu(0) f1 = remote.system_lib() a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1) cost = time_f(a, b).mean np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # change to not executable os.chmod(path_minrpc, stat.S_IRUSR) with pytest.raises(RuntimeError): rpc.PopenSession(path_minrpc) def check_remote_link_cl(remote): """Test function to run remote code such as cl This is not enabled because there is forking issue of TVM runtime when server launches after OpenCL runtime initializes. We leave it as an example on how to do rpc when we want to do linking on remote. 
""" if not tvm.testing.device_enabled("opencl"): print("Skip because opencl is not enabled") return temp = util.tempdir() ctx = remote.cl(0) s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[B].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd") # Option 1: save modules separately and rely on remote compiler path_o = temp.relpath("myadd.o") path_cl = temp.relpath("myadd.cl") path_json = temp.relpath("myadd.tvm_meta.json") f.save(path_o) f.imported_modules[0].save(path_cl) remote.upload(path_o) remote.upload(path_cl) # upload meta data remote.upload(path_json) fhost = remote.load_module("myadd.o") fdev = remote.load_module("myadd.cl") fhost.import_module(fdev) a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Option 2: export library as a tar ball then handled by remote compiler path_tar = temp.relpath("myadd.tar") f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) check_remote(rpc.LocalSession()) check_remote(client) check_minrpc()
def non_max_suppression( data, valid_count, indices, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False, ): """Non-maximum suppression operator for object detection. Parameters ---------- data : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. indices : tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors]. max_output_size : optional, int or tvm.te.Tensor Max number of output valid boxes for each instance. Return all valid boxes if the value of max_output_size is less than 0. iou_threshold : optional, float Non-maximum suppression threshold. force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. coord_start : required, int Start index of the consecutive 4 coordinates. score_index: optional, int Index of the scores/confidence of boxes. id_index : optional, int index of the class categories, -1 to disable. return_indices : optional, boolean Whether to return box indices in input data. invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns ------- out : tvm.te.Tensor or tuple of tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. Out is a tuple of tvm.te.Tensor if return_indices is True, the Tensor in the tuple is 2-D tensor with shape [batch_size, num_anchors] and shape [batch_size, num_valid_anchors] respectively. Example -------- .. code-block:: python # An example to use non_max_suppression dshape = (1, 5, 6) data = te.placeholder(dshape, name="data") valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True top_k = -1 out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) np_data = np.random.uniform(dshape) np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f(tvm_data, tvm_valid_count, tvm_out) """ batch_size = data.shape[0] num_anchors = data.shape[1] if isinstance(max_output_size, int): max_output_size = tvm.tir.const(max_output_size, dtype="int32") score_axis = score_index score_shape = (batch_size, num_anchors) score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis]) sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) out, box_indices = hybrid_nms( data, sort_tensor, valid_count, indices, batch_size, num_anchors, max_output_size, tvm.tir.const(iou_threshold, dtype=data.dtype), tvm.tir.const(force_suppress, dtype="bool"), tvm.tir.const(top_k, dtype="int32"), tvm.tir.const(coord_start, dtype="int32"), tvm.tir.const(score_index, dtype="int32"), tvm.tir.const(id_index, dtype="int32"), tvm.tir.const(return_indices, dtype="bool"), zero=tvm.tir.const(0, dtype=data.dtype), one=tvm.tir.const(1, dtype=data.dtype), ) if return_indices: return hybrid_rearrange_indices_out( box_indices, one=tvm.tir.const(1, dtype="int32"), batch_size=batch_size, num_anchors=num_anchors, ) if invalid_to_bottom: out = hybrid_rearrange_box_out( out, one=tvm.tir.const(1, 
dtype=data.dtype), batch_size=batch_size, num_anchors=num_anchors, ) return out
def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket): size = 128 outer_shape = (size, ) factor = 16 inner_shape = (factor, ) dtype = "int8" x = te.placeholder(shape=outer_shape, dtype=dtype, name="x") y = te.placeholder(shape=outer_shape, dtype=dtype, name="y") z = te.compute(outer_shape, lambda i: x[i] + y[i], name="z") s = te.create_schedule(z.op) x_global = s.cache_read(x, "global.vtcm", [z]) y_global = s.cache_read(y, "global.vtcm", [z]) z_global = s.cache_write(z, "global.vtcm") zouter, zinner = s[z_global].split(z_global.op.axis[0], factor=factor) s[x_global].compute_at(s[z_global], zouter) s[y_global].compute_at(s[z_global], zouter) mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") (cache_read_x, ) = s[x_global].op.axis s[x_global].tensorize(cache_read_x, mem_copy_read) (cache_read_y, ) = s[y_global].op.axis s[y_global].tensorize(cache_read_y, mem_copy_read) mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm") (cache_write_z, ) = s[z].op.axis s[z].tensorize(cache_write_z, mem_copy_write) print(tvm.lower(s, [x, y, z])) target_hexagon = tvm.target.hexagon("v68", link_params=True) func = tvm.build(s, [x, y, z], tvm.target.Target(target_hexagon, host=target_hexagon), name="dmacpy") temp = utils.tempdir() dso_binary = "test_binary.so" dso_binary_path = temp.relpath(dso_binary) func.save(dso_binary_path) if not android_serial_number: pytest.skip( "Skip hardware test since ANDROID_SERIAL_NUMBER is not set.") rpc_info = { "rpc_tracker_host": tvm_tracker_host, "rpc_tracker_port": tvm_tracker_port, "rpc_server_port": 7070, "adb_server_socket": adb_server_socket, } launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) launcher.upload(dso_binary_path, dso_binary) launcher.start_server() with launcher.start_session() as sess: mod = launcher.load_module(dso_binary, sess) xt = tvm.nd.array(np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device) yt = tvm.nd.array(np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device) zt = tvm.nd.array(np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device) mod["dmacpy"](xt, yt, zt) launcher.stop_server() ref = xt.numpy() + yt.numpy() np.testing.assert_equal(zt.numpy(), ref)
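# The same cache_read / cache_write / compute_at pattern as the Hexagon test above,
# sketched on the plain "global" scope so the schedule structure can be inspected
# without Hexagon hardware, VTCM, or the mem-copy intrinsic. Illustration only.
import tvm
from tvm import te

size, factor = 128, 16
x = te.placeholder((size,), dtype="int8", name="x")
y = te.placeholder((size,), dtype="int8", name="y")
z = te.compute((size,), lambda i: x[i] + y[i], name="z")

s = te.create_schedule(z.op)
x_cached = s.cache_read(x, "global", [z])
y_cached = s.cache_read(y, "global", [z])
z_cached = s.cache_write(z, "global")
zouter, _ = s[z_cached].split(z_cached.op.axis[0], factor=factor)
s[x_cached].compute_at(s[z_cached], zouter)
s[y_cached].compute_at(s[z_cached], zouter)
print(tvm.lower(s, [x, y, z], simple_mode=True))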
def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype, output_padding): """Transposed 2D convolution nchw forward operator. Parameters ---------- cfg: ConfigEntity The config for this template Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.te.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints The spatial stride along height and width padding : int or str Padding size, or ['VALID', 'SAME'] out_dtype: str The output type. This is used in mixed precision output_padding : tuple of two ints Used to disambiguate output shape. Returns ------- Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape) _, out_channels, kernel_height, kernel_width = get_const_tuple( kernel.shape) stride_height, stride_width = stride outpad_height, outpad_width = output_padding assert outpad_height < stride_height and outpad_width < stride_width cfg.stride = stride pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple( padding, (kernel_height, kernel_width)) out_width = (inp_width - 1) * stride_width + \ kernel_width - pad_left - pad_right + outpad_width pad_left = kernel_width - 1 - pad_left pad_right = kernel_width - 1 - pad_right dilated_width = stride_width * (inp_width - 1) + 1 out_height = (inp_height - 1) * stride_height + \ kernel_height - pad_top - pad_bottom + outpad_height pad_top = kernel_height - 1 - pad_top pad_bottom = kernel_height - 1 - pad_bottom dilated_height = stride_height * (inp_height - 1) + 1 # compute pad data = te.compute( (batch, inp_channels, pad_top + dilated_height + pad_bottom, pad_left + dilated_width + pad_right), lambda n, c, y, x: tvm.tir.if_then_else( tvm.tir.all(x >= pad_left, x < pad_left + dilated_width, tvm.tir.indexmod(x - pad_left, stride_width).equal(0), y >= pad_top, y < pad_top + dilated_height, tvm.tir.indexmod(y - pad_top, stride_height).equal(0)), data[n, c, tvm.tir.indexdiv(y - pad_top, stride_height), tvm.tir.indexdiv(x - pad_left, stride_width)], tvm.tir.const(0., "float32")), name='data_pad') # compute transposed conv dc = te.reduce_axis((0, inp_channels), name='dc') dh = te.reduce_axis((0, kernel_height), name='dh') dw = te.reduce_axis((0, kernel_width), name='dw') data_out = te.compute( (batch, out_channels, out_height, out_width), lambda b, c, h, w: te.sum(data[b, dc, h + dh, w + dw].astype( out_dtype) * kernel[dc, c, kernel_height - 1 - dh, kernel_width - 1 - dw].astype(out_dtype), axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") return data_out
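# A standalone recap of the transposed-convolution output arithmetic used above:
# out = (in - 1) * stride + kernel - pad_before - pad_after + output_padding.
# The helper and the concrete numbers are illustrative only.
def conv2d_transpose_out_size(in_size, kernel, stride, pad_before, pad_after, out_pad):
    return (in_size - 1) * stride + kernel - pad_before - pad_after + out_pad

# Upsampling a 14-wide axis by stride 2 with a 4-wide kernel and padding 1 gives 28.
assert conv2d_transpose_out_size(14, 4, 2, 1, 1, 0) == 28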
def test_rpc_module(): # graph n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") a_np = np.random.uniform(size=1024).astype(A.dtype) temp = utils.tempdir() # Establish remote connection with target hardware tracker = rpc.connect_tracker(tracker_host, tracker_port) remote = tracker.request(key, priority=0, session_timeout=60) # Compile the Graph for CPU target s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso_cpu = temp.relpath("cpu_lib.so") f.export_library(path_dso_cpu, ndk.create_shared) # Execute the portable graph on cpu target print("Run CPU test ...") dev = remote.cpu(0) remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) time_f = f2.time_evaluator(f2.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.numpy(), a.numpy() + 1) # Compile the Graph for OpenCL target if test_opencl: s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, te.thread_axis("threadIdx.x")) s[B].bind(xo, te.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") path_dso_cl = temp.relpath("dev_lib_cl.so") f.export_library(path_dso_cl, ndk.create_shared) print("Run GPU(OpenCL Flavor) test ...") dev = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.numpy(), a.numpy() + 1) # Compile the Graph for Vulkan target if test_vulkan: s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, te.thread_axis("threadIdx.x")) s[B].bind(xo, te.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") f.export_library(path_dso_vulkan, ndk.create_shared) print("Run GPU(Vulkan Flavor) test ...") dev = remote.vulkan(0) remote.upload(path_dso_vulkan) f1 = remote.load_module("dev_lib_vulkan.so") a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.numpy(), a.numpy() + 1)