def test_vectorize_if_then_else(): n = tvm.var('n') x = tvm.var('x') ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: A[i] = tvm.call_intrin("float32", "tvm_if_then_else", i > 0, A[i] + 1, A[i]) stmt = ib.get() stmt = tvm.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.stmt.For) ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as k: with ib.for_range(0, 4, for_type="vectorize") as i: A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else", k > 0, A[k * 4 + i], 0) stmt = ib.get() assert isinstance(stmt.body, tvm.stmt.For) stmt = tvm.ir_pass.VectorizeLoop(stmt) assert not isinstance(stmt.body, tvm.stmt.For) assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
def intrin_func(ins, outs): ib = tvm.ir_builder.create() if (add_on == None): int4copy = tvm.call_intrin('float32', 'int4_copy', yb.access_ptr("w"), 0, xb.access_ptr("r"), offset_x) ib.emit(int4copy) elif (add_on == "relu"): relu = tvm.call_intrin('float32', 'relu', xb.access_ptr("w"), 0) ib.emit(relu) int4copy = tvm.call_intrin('float32', 'int4_copy', yb.access_ptr("w"), 0, xb.access_ptr("r"), offset_x) ib.emit(int4copy) return ib.get()
def intrin_func(ins, outs): ib = tvm.ir_builder.create() BA = ins[0] BC = outs[0] ib.emit(tvm.call_intrin('handle', 'tvm_store_matrix_sync', BA.data, n, n, n, BA.elem_offset // 256, BC.access_ptr('w'), n, 'row_major')) return ib.get()
def update(): ib = tvm.ir_builder.create() ib.emit(tvm.call_intrin('handle', 'tvm_mma_sync', BC.data, BC.elem_offset // 256, BA.data, BA.elem_offset // 256, BB.data, BB.elem_offset // 256, BC.data, BC.elem_offset // 256)) return ib.get()
def intrin_func(ins, outs): ib = tvm.ir_builder.create() BA = ins[0] BC = outs[0] ib.emit( tvm.call_intrin('handle', 'tvm_load_matrix_sync', BC.data, n, m, l, BC.elem_offset // (row * col), BA.access_ptr('r'), col, 'row_major')) return ib.get()
def test_vectorize_if_then_else(): n = tvm.var('n') x = tvm.var('x') ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: A[i] = tvm.call_intrin("float32", "tvm_if_then_else", i > 0, A[i] + 1, A[i]) stmt = ib.get() stmt = tvm.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.stmt.For) ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as k: with ib.for_range(0, 4, for_type="vectorize") as i: A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else", k > 0, A[k * 4 + i], 0) stmt = ib.get() assert isinstance(stmt.body, tvm.stmt.For) stmt = tvm.ir_pass.VectorizeLoop(stmt) assert not isinstance(stmt.body, tvm.stmt.For) assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
def test_static_init(): dtype = 'int64' n = tvm.var('n') Ab = tvm.decl_buffer((n, ), dtype) i = tvm.var('i') ib = tvm.ir_builder.create() handle = tvm.call_intrin("handle", "tvm_static_handle") ib.emit(tvm.call_packed("test_static_callback", handle, Ab)) @tvm.register_func("test_static_callback") def test_cb(sh, A): assert isinstance(sh, ctypes.c_void_p) return sh stmt = ib.get() fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) f = tvm.codegen.build_module(fapi, "llvm") a = tvm.nd.array(np.zeros(10, dtype=dtype)) f(a)
def test_static_init(): dtype = 'int64' n = tvm.var('n') Ab = tvm.decl_buffer((n, ), dtype) i = tvm.var('i') ib = tvm.ir_builder.create() handle = tvm.call_intrin("handle", "tvm_static_handle") ib.emit( tvm.call_packed("test_static_callback", handle, Ab)) @tvm.register_func("test_static_callback") def test_cb(sh, A): assert isinstance(sh, ctypes.c_void_p) return sh stmt = ib.get() fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) f = tvm.codegen.build_module(fapi, "llvm") a = tvm.nd.array(np.zeros(10, dtype=dtype)) f(a)
def init(): ib = tvm.ir_builder.create() ib.emit( tvm.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, m, l, BC.elem_offset // (n * m), 0.0)) return ib.get()
def convolutionfp16(D, F, shmem): #ir builder for constructing the main body ib = tvm.ir_builder.create() #id of current warp and offset of shared memory when storing warpid = tidx / 32 warp_offset_output = warpid%block_row_warp*16*warp_row_tile\ +warpid/block_row_warp*warp_col_tile*block_row_warp*warp_row_tile*256 #include necessary head files include_file = tvm.call_intrin("float32", "include_cpp_head", dir_path + "/conv2d_HMMA.h") ib.emit(include_file) #declare the matrix fragment declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a", "half", "row_major", "a_frag", warp_col_tile) declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b", "half", "col_major", "b_frag", warp_row_tile) declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator", "half", "c_frag", warp_col_tile, warp_row_tile) ib.emit(declare_a) ib.emit(declare_b) ib.emit(declare_c) #define the shared memory for loading data and offset for loading the data offset_D_warp = offset_D_im2col + tidx / 2 * (16 + shieft) + tidx % 2 * 8 offset_F_warp = offset_F + tidx / 2 * (16 + shieft) + tidx % 2 * 8 #ir template for thread synchronization sync = tvm.call_extern("float32", "__syncthreads") #main for conducting the computation #set the pointer to first address of D Dp = D.access_ptr("r") Sp = shmem.access_ptr("r") Fp = F.access_ptr("r") #load the first data from global memory for the reuse of 9 times load_first_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\ output_shape[0],data_shape[1],data_shape[2],data_shape[3],0,dilation,0) ib.emit(load_first_data) #load the first filter from global memory: load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\ kernel_shape[3],data_shape[0],data_shape[1],data_shape[2],tidx%2*8,0,0) ib.emit(load_filter) #fill fragment c with 0 with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi: with ib.for_range(0, warp_row_tile, name="row_id_fi") as row_id_fi: fill_O_zero = tvm.call_intrin("float", "wmma_fill_fragment", "c_frag", col_id_fi, row_id_fi, "half", 0.) ib.emit(fill_O_zero) ib.emit(sync) #do im2col for the first data im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, 0, 0) ib.emit(im2col) ib.emit(sync) with ib.for_range(0, data_shape[3] / 16, name="c_id", for_type=fortype) as c_id: with ib.for_range(0, 9, name="ker_id", for_type=fortype) as ker_id: #now load matrix fragment with ib.for_range(0, warp_col_tile, name="col") as col: load_matrix_frag_F = tvm.call_intrin("float32","wmma_load_matrix_sync","a_frag",col,Sp,\ offset_D_im2col+tidx/(32*block_row_warp)*\ (16*warp_col_tile*(16+shieft))+col*(16*(16+shieft)),16+shieft) ib.emit(load_matrix_frag_F) with ib.for_range(0, warp_row_tile, name="row") as row: load_matrix_frag_D = tvm.call_intrin("float32","wmma_load_matrix_sync","b_frag",row,Sp,\ offset_F+tidx%(32*block_row_warp)/32*\ (16*warp_row_tile*(16+shieft))+row*(16*(16+shieft)),16+shieft) ib.emit(load_matrix_frag_D) ib.emit(sync) #now compute with ib.for_range(0, warp_col_tile, name="mma_col") as mma_col: with ib.for_range(0, warp_row_tile, name="mma_row") as mma_row: wmma_compute = tvm.call_intrin("float16", "wmma_mma_sync", "c_frag", "a_frag", "b_frag", "c_frag", mma_col, mma_row) ib.emit(wmma_compute) with ib.if_scope(ker_id < 8): #load filer of the next ieration load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],kernel_shape[3],\ data_shape[0],data_shape[1],data_shape[2],c_id*16+tidx%2*8,ker_id+1,0) ib.emit(load_filter) #load data for next iteration im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, ker_id + 1, 0) ib.emit(im2col) ib.emit(sync) with ib.if_scope(c_id < data_shape[3] / 16 - 1): #load the next 9 iteration data from global memory load_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\ output_shape[0],output_shape[1],output_shape[2],data_shape[3],c_id*16+16,dilation,0) ib.emit(load_data) #load filter for next cd iter load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\ data_shape[3],data_shape[0],data_shape[1],data_shape[2],c_id*16+16+tidx%2*8,0,0) ib.emit(load_filter) ib.emit(sync) #load the first data from shmem to im2col shmem im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, 0, 0) ib.emit(im2col) ib.emit(sync) #store fragment in shared memory first with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st: with ib.for_range(0, warp_row_tile, name="row_id_st") as row_id_st: store_O_fragment = tvm.call_intrin( "float32", "wmma_store_matrix_sync", Sp, warp_offset_output + col_id_st * (256 * warp_row_tile * block_row_warp) + row_id_st * 16, "c_frag", col_id_st, row_id_st, 64) ib.emit(store_O_fragment) ib.emit(sync) body = ib.get() return (body)
def _do_fold(op): if _match_pragma(op, "conv2d_transpose_gemm"): is_init = ".init" in str(op) tvm.ir_pass.PostOrderVisit(op, _find_basics) if is_init: # create inner most block irb = tvm.ir_builder.create() dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) irb.emit( tvm.call_extern("int32", "VTAUopPush", 0, 1, dout.access_ptr("rw", "int32"), 0, 0, 0, 0, 0)) inner = irb.get() args = op.body.body.args res_tensor = op.body.body.func.output(0) tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.make.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) return inner else: conv_call, data_call, kernel_call = calls[-3:] pad_data_tensor = data_call.func.output(0) kernel_tensor = kernel_call.func.output(0) res_tensor = conv_call.func.output(0) if selects: condition = selects[0].condition else: condition = tvm.const(1, 'int') # create inner most block irb = tvm.ir_builder.create() with irb.if_scope(condition): dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) irb.emit( tvm.call_extern("int32", "VTAUopPush", 0, 0, dout.access_ptr("rw", "int32"), dinp.access_ptr("r", "int32"), dwgt.access_ptr("r", "int32"), 0, 0, 0)) inner = irb.get() args = conv_call.args tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.make.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) args = kernel_call.args tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, env.BLOCK_OUT, 0, env.BLOCK_IN) inner = tvm.make.AttrStmt( [dwgt, kernel_tensor], 'buffer_bind_scope', tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) args = data_call.args tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_IN) inner = tvm.make.AttrStmt( [dinp, pad_data_tensor], 'buffer_bind_scope', tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) return inner return None
def convolutionf16(D, F, temp, O): ib = tvm.ir_builder.create() #define the computation architecture block_x = tvm.thread_axis('blockIdx.x') ib.scope_attr(block_x, 'thread_extent', block_num) thread_x = tvm.thread_axis('threadIdx.x') ib.scope_attr(thread_x, 'thread_extent', thread_num) bidx = block_x tidx = thread_x warpid = tidx / 32 warp_offset_output = warpid%block_row_warp*16*warp_row_tile\ +warpid/block_row_warp*warp_col_tile*block_row_warp*warp_row_tile*256 #include files include_file = tvm.call_intrin( "float32", "include_cpp_head", "/home/tusimple/Desktop/tvm_ir_test/conv2dv8.h") ib.emit(include_file) #define the double buffered shared memory declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a", "half", "row_major", "a_frag", warp_col_tile) declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b", "half", "col_major", "b_frag", warp_row_tile) declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator", "half", "c_frag", warp_col_tile, warp_row_tile) ib.emit(declare_a) ib.emit(declare_b) ib.emit(declare_c) #define the shared memory for loading data and offset for loading the data shmem = ib.allocate("float16", 24576, name="shmem", scope="shared") offset_D_warp = offset_D_im2col + tidx / 2 * (16 + shieft) + tidx % 2 * 8 offset_F_warp = offset_F + tidx / 2 * (16 + shieft) + tidx % 2 * 8 #sync thread syntex sync = tvm.call_extern("float32", "__syncthreads") #since filter is usually small, load all filer used by the #define the main loop and calculate with ib.for_range(0, loop_len, name="blk_id") as blk_id: with ib.if_scope(bidx + blk_id * block_num < (rD * rF) / (block_size_r * block_size_c)): #set the pointer to beginning of D Dp = D.access_ptr("r") #load the first data from global memory for the reuse of 9 times load_first_data = tvm.call_extern("float32", "load_matrix_D", Dp, shmem, blk_id, N, H, W, C, 0, 64) ib.emit(load_first_data) #set the pointer to beginning of F Fp = F.access_ptr("r") #load the first filter from global memory: load_filter = tvm.call_extern("float32", "load_matrix_F", Fp, shmem, offset_F_warp, blk_id, K, C, N, H, W, tidx % 2 * 8, 0, 64) ib.emit(load_filter) with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi: with ib.for_range(0, warp_row_tile, name="row_id_fi") as row_id_fi: fill_O_zero = tvm.call_intrin("float", "wmma_fill_fragment", "c_frag", col_id_fi, row_id_fi, "half", 0.) ib.emit(fill_O_zero) ib.emit(sync) #load the first data from shmem to im2col shmem im2col = tvm.call_extern("float32", "im2col", shmem, offset_D_warp, 0, 64) ib.emit(im2col) ib.emit(sync) #load temp with ib.for_range(0, C / 16, name="c_id", for_type='unroll') as c_id: with ib.for_range(0, 9, name="ker_id", for_type='unroll') as ker_id: #now load matrix fragment with ib.for_range(0, warp_col_tile, name="col") as col: load_matrix_frag_F = tvm.call_intrin("float32","wmma_load_matrix_sync","a_frag",col,shmem,\ offset_F+tidx/(32*block_row_warp)*\ (16*warp_col_tile*(16+shieft))+col*(16*(16+shieft)),16+shieft) ib.emit(load_matrix_frag_F) with ib.for_range(0, warp_row_tile, name="row") as row: load_matrix_frag_D = tvm.call_intrin("float32","wmma_load_matrix_sync","b_frag",row,shmem,\ offset_D_im2col+tidx%(32*block_row_warp)/32*\ (16*warp_row_tile*(16+shieft))+row*(16*(16+shieft)),16+shieft) ib.emit(load_matrix_frag_D) ib.emit(sync) #now compute with ib.for_range(0, warp_col_tile, name="mma_col") as mma_col: with ib.for_range(0, warp_row_tile, name="mma_row") as mma_row: wmma_compute = tvm.call_intrin( "float16", "wmma_mma_sync", "c_frag", "a_frag", "b_frag", "c_frag", mma_col, mma_row) ib.emit(wmma_compute) with ib.if_scope(ker_id < 8): #load filer of the next ieration load_filter = tvm.call_extern("float32", "load_matrix_F", Fp, shmem, offset_F_warp, blk_id, K, C, N, H, W, c_id * 16 + tidx % 2 * 8, ker_id + 1, 64) ib.emit(load_filter) #load data for next iteration im2col = tvm.call_extern("float32", "im2col", shmem, offset_D_warp, ker_id + 1, 64) ib.emit(im2col) ib.emit(sync) with ib.if_scope(c_id < C / 16 - 1): #load the next 9 iteration data from global memory load_data = tvm.call_extern("float32", "load_matrix_D", Dp, shmem, blk_id, N, H, W, C, c_id * 16 + 16, 64) ib.emit(load_data) #load filter for next cd iter load_filter = tvm.call_extern( "float32", "load_matrix_F", Fp, shmem, offset_F_warp, blk_id, K, C, N, H, W, c_id * 16 + 16 + tidx % 2 * 8, 0, 64) ib.emit(load_filter) ib.emit(sync) #load the first data from shmem to im2col shmem im2col = tvm.call_extern("float32", "im2col", shmem, offset_D_warp, 0, 64) ib.emit(im2col) ib.emit(sync) #now start reload back to output #load fragment to shared memory first with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st: with ib.for_range(0, warp_row_tile, name="row_id_st") as row_id_st: store_O_fragment = tvm.call_intrin( "float32", "wmma_store_matrix_sync", shmem, warp_offset_output + col_id_st * (256 * warp_row_tile * block_row_warp) + row_id_st * 16, "c_frag", col_id_st, row_id_st, 64) ib.emit(store_O_fragment) ib.emit(sync) Op = O.access_ptr("w") store_O = tvm.call_extern("float32", "store_output", Op, shmem, blk_id, N, P, Q, K, 64) ib.emit(store_O) ib.emit(sync) body = ib.get() return (body)
def my_log(x): # 通过内建函数构建表达式 my_log:内建函数名 return tvm.call_intrin(x.dtype, 'my_log', x)