コード例 #1
0
ファイル: test_pass_vectorize.py プロジェクト: bddppq/tvm
def test_vectorize_if_then_else():
    n = tvm.var('n')
    x = tvm.var('x')
    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, 4, for_type="vectorize") as i:
        A[i] = tvm.call_intrin("float32", "tvm_if_then_else",
                               i > 0,
                               A[i] + 1, A[i])
    stmt = ib.get()
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    assert isinstance(stmt, tvm.stmt.For)


    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, n) as k:
        with ib.for_range(0, 4, for_type="vectorize") as i:
            A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else",
                                           k > 0,
                                           A[k * 4 + i], 0)
    stmt = ib.get()
    assert isinstance(stmt.body, tvm.stmt.For)
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    assert not isinstance(stmt.body, tvm.stmt.For)
    assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
コード例 #2
0
 def intrin_func(ins, outs):
     ib = tvm.ir_builder.create()
     if (add_on == None):
         int4copy = tvm.call_intrin('float32', 'int4_copy',
                                    yb.access_ptr("w"), 0,
                                    xb.access_ptr("r"), offset_x)
         ib.emit(int4copy)
     elif (add_on == "relu"):
         relu = tvm.call_intrin('float32', 'relu', xb.access_ptr("w"), 0)
         ib.emit(relu)
         int4copy = tvm.call_intrin('float32', 'int4_copy',
                                    yb.access_ptr("w"), 0,
                                    xb.access_ptr("r"), offset_x)
         ib.emit(int4copy)
     return ib.get()
コード例 #3
0
 def intrin_func(ins, outs):
     ib = tvm.ir_builder.create()
     BA = ins[0]
     BC = outs[0]
     ib.emit(tvm.call_intrin('handle', 'tvm_store_matrix_sync',
                             BA.data, n, n, n, BA.elem_offset // 256,
                             BC.access_ptr('w'), n, 'row_major'))
     return ib.get()
コード例 #4
0
 def update():
     ib = tvm.ir_builder.create()
     ib.emit(tvm.call_intrin('handle', 'tvm_mma_sync',
                             BC.data, BC.elem_offset // 256,
                             BA.data, BA.elem_offset // 256,
                             BB.data, BB.elem_offset // 256,
                             BC.data, BC.elem_offset // 256))
     return ib.get()
コード例 #5
0
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()

        BA = ins[0]
        BC = outs[0]
        ib.emit(
            tvm.call_intrin('handle', 'tvm_load_matrix_sync', BC.data, n,
                            m, l, BC.elem_offset // (row * col),
                            BA.access_ptr('r'), col, 'row_major'))
        return ib.get()
コード例 #6
0
def test_vectorize_if_then_else():
    n = tvm.var('n')
    x = tvm.var('x')
    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, 4, for_type="vectorize") as i:
        A[i] = tvm.call_intrin("float32", "tvm_if_then_else", i > 0, A[i] + 1,
                               A[i])
    stmt = ib.get()
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    assert isinstance(stmt, tvm.stmt.For)

    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, n) as k:
        with ib.for_range(0, 4, for_type="vectorize") as i:
            A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else",
                                           k > 0, A[k * 4 + i], 0)
    stmt = ib.get()
    assert isinstance(stmt.body, tvm.stmt.For)
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    assert not isinstance(stmt.body, tvm.stmt.For)
    assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
コード例 #7
0
def test_static_init():
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)
    i = tvm.var('i')
    ib = tvm.ir_builder.create()
    handle = tvm.call_intrin("handle", "tvm_static_handle")
    ib.emit(tvm.call_packed("test_static_callback", handle, Ab))

    @tvm.register_func("test_static_callback")
    def test_cb(sh, A):
        assert isinstance(sh, ctypes.c_void_p)
        return sh

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    f = tvm.codegen.build_module(fapi, "llvm")
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f(a)
コード例 #8
0
def test_static_init():
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)
    i = tvm.var('i')
    ib = tvm.ir_builder.create()
    handle = tvm.call_intrin("handle", "tvm_static_handle")
    ib.emit(
        tvm.call_packed("test_static_callback", handle, Ab))

    @tvm.register_func("test_static_callback")
    def test_cb(sh, A):
        assert isinstance(sh, ctypes.c_void_p)
        return sh

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    f = tvm.codegen.build_module(fapi, "llvm")
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f(a)
コード例 #9
0
 def init():
     ib = tvm.ir_builder.create()
     ib.emit(
         tvm.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, m,
                         l, BC.elem_offset // (n * m), 0.0))
     return ib.get()
コード例 #10
0
    def convolutionfp16(D, F, shmem):
        #ir builder for constructing the main body
        ib = tvm.ir_builder.create()

        #id of current warp and offset of shared memory when storing
        warpid = tidx / 32
        warp_offset_output = warpid%block_row_warp*16*warp_row_tile\
            +warpid/block_row_warp*warp_col_tile*block_row_warp*warp_row_tile*256

        #include necessary head files
        include_file = tvm.call_intrin("float32", "include_cpp_head",
                                       dir_path + "/conv2d_HMMA.h")
        ib.emit(include_file)

        #declare the matrix fragment
        declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a",
                                    "half", "row_major", "a_frag",
                                    warp_col_tile)
        declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b",
                                    "half", "col_major", "b_frag",
                                    warp_row_tile)
        declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator",
                                    "half", "c_frag", warp_col_tile,
                                    warp_row_tile)
        ib.emit(declare_a)
        ib.emit(declare_b)
        ib.emit(declare_c)

        #define the shared memory for loading data and offset for loading the data
        offset_D_warp = offset_D_im2col + tidx / 2 * (16 +
                                                      shieft) + tidx % 2 * 8
        offset_F_warp = offset_F + tidx / 2 * (16 + shieft) + tidx % 2 * 8

        #ir template for thread synchronization
        sync = tvm.call_extern("float32", "__syncthreads")

        #main for conducting the computation
        #set the pointer to first address of D
        Dp = D.access_ptr("r")
        Sp = shmem.access_ptr("r")
        Fp = F.access_ptr("r")

        #load the first data from global memory for the reuse of 9 times
        load_first_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\
                                            output_shape[0],data_shape[1],data_shape[2],data_shape[3],0,dilation,0)
        ib.emit(load_first_data)

        #load the first filter from global memory:
        load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\
                                    kernel_shape[3],data_shape[0],data_shape[1],data_shape[2],tidx%2*8,0,0)
        ib.emit(load_filter)

        #fill fragment c with 0
        with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi:
            with ib.for_range(0, warp_row_tile, name="row_id_fi") as row_id_fi:
                fill_O_zero = tvm.call_intrin("float", "wmma_fill_fragment",
                                              "c_frag", col_id_fi, row_id_fi,
                                              "half", 0.)
                ib.emit(fill_O_zero)
        ib.emit(sync)

        #do im2col for the first data
        im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, 0, 0)
        ib.emit(im2col)
        ib.emit(sync)

        with ib.for_range(0, data_shape[3] / 16, name="c_id",
                          for_type=fortype) as c_id:
            with ib.for_range(0, 9, name="ker_id", for_type=fortype) as ker_id:
                #now load matrix fragment
                with ib.for_range(0, warp_col_tile, name="col") as col:
                    load_matrix_frag_F = tvm.call_intrin("float32","wmma_load_matrix_sync","a_frag",col,Sp,\
                                                        offset_D_im2col+tidx/(32*block_row_warp)*\
                                                        (16*warp_col_tile*(16+shieft))+col*(16*(16+shieft)),16+shieft)
                    ib.emit(load_matrix_frag_F)

                with ib.for_range(0, warp_row_tile, name="row") as row:
                    load_matrix_frag_D = tvm.call_intrin("float32","wmma_load_matrix_sync","b_frag",row,Sp,\
                                                        offset_F+tidx%(32*block_row_warp)/32*\
                                                        (16*warp_row_tile*(16+shieft))+row*(16*(16+shieft)),16+shieft)
                    ib.emit(load_matrix_frag_D)
                ib.emit(sync)
                #now compute
                with ib.for_range(0, warp_col_tile, name="mma_col") as mma_col:
                    with ib.for_range(0, warp_row_tile,
                                      name="mma_row") as mma_row:
                        wmma_compute = tvm.call_intrin("float16",
                                                       "wmma_mma_sync",
                                                       "c_frag", "a_frag",
                                                       "b_frag", "c_frag",
                                                       mma_col, mma_row)
                        ib.emit(wmma_compute)

                with ib.if_scope(ker_id < 8):
                    #load filer of the next ieration
                    load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],kernel_shape[3],\
                                                data_shape[0],data_shape[1],data_shape[2],c_id*16+tidx%2*8,ker_id+1,0)
                    ib.emit(load_filter)
                    #load data for next iteration
                    im2col = tvm.call_extern("float32", "im2col", Sp,
                                             offset_D_warp, ker_id + 1, 0)
                    ib.emit(im2col)
                ib.emit(sync)

            with ib.if_scope(c_id < data_shape[3] / 16 - 1):
                #load the next 9 iteration data from global memory
                load_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\
                                output_shape[0],output_shape[1],output_shape[2],data_shape[3],c_id*16+16,dilation,0)
                ib.emit(load_data)

                #load filter for next cd iter
                load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\
                                            data_shape[3],data_shape[0],data_shape[1],data_shape[2],c_id*16+16+tidx%2*8,0,0)
                ib.emit(load_filter)
                ib.emit(sync)

                #load the first data from shmem to im2col shmem
                im2col = tvm.call_extern("float32", "im2col", Sp,
                                         offset_D_warp, 0, 0)
                ib.emit(im2col)
                ib.emit(sync)

        #store fragment in shared memory first
        with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st:
            with ib.for_range(0, warp_row_tile, name="row_id_st") as row_id_st:
                store_O_fragment = tvm.call_intrin(
                    "float32", "wmma_store_matrix_sync", Sp,
                    warp_offset_output + col_id_st *
                    (256 * warp_row_tile * block_row_warp) + row_id_st * 16,
                    "c_frag", col_id_st, row_id_st, 64)
                ib.emit(store_O_fragment)
        ib.emit(sync)

        body = ib.get()
        return (body)
コード例 #11
0
ファイル: ir_pass.py プロジェクト: hoseung2/Latency-Predictor
    def _do_fold(op):
        if _match_pragma(op, "conv2d_transpose_gemm"):
            is_init = ".init" in str(op)
            tvm.ir_pass.PostOrderVisit(op, _find_basics)

            if is_init:
                # create inner most block
                irb = tvm.ir_builder.create()
                dev = env.dev
                irb.scope_attr(dev.vta_axis, "coproc_scope",
                               dev.get_task_qid(dev.QID_COMPUTE))
                irb.scope_attr(dev.vta_axis, "coproc_uop_scope",
                               dev.vta_push_uop)
                irb.emit(
                    tvm.call_extern("int32", "VTAUopPush", 0, 1,
                                    dout.access_ptr("rw", "int32"), 0, 0, 0, 0,
                                    0))
                inner = irb.get()
                args = op.body.body.args
                res_tensor = op.body.body.func.output(0)
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_OUT)
                inner = tvm.make.AttrStmt(
                    [dout, res_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                return inner
            else:
                conv_call, data_call, kernel_call = calls[-3:]
                pad_data_tensor = data_call.func.output(0)
                kernel_tensor = kernel_call.func.output(0)
                res_tensor = conv_call.func.output(0)

                if selects:
                    condition = selects[0].condition
                else:
                    condition = tvm.const(1, 'int')

                # create inner most block
                irb = tvm.ir_builder.create()
                with irb.if_scope(condition):
                    dev = env.dev
                    irb.scope_attr(dev.vta_axis, "coproc_scope",
                                   dev.get_task_qid(dev.QID_COMPUTE))
                    irb.scope_attr(dev.vta_axis, "coproc_uop_scope",
                                   dev.vta_push_uop)
                    irb.emit(
                        tvm.call_extern("int32", "VTAUopPush", 0, 0,
                                        dout.access_ptr("rw", "int32"),
                                        dinp.access_ptr("r", "int32"),
                                        dwgt.access_ptr("r", "int32"), 0, 0,
                                        0))
                inner = irb.get()

                args = conv_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_OUT)
                inner = tvm.make.AttrStmt(
                    [dout, res_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                args = kernel_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0,
                       env.BLOCK_OUT, 0, env.BLOCK_IN)
                inner = tvm.make.AttrStmt(
                    [dwgt, kernel_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                args = data_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_IN)
                inner = tvm.make.AttrStmt(
                    [dinp, pad_data_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                return inner
        return None
コード例 #12
0
def convolutionf16(D, F, temp, O):
    ib = tvm.ir_builder.create()
    #define the computation architecture
    block_x = tvm.thread_axis('blockIdx.x')
    ib.scope_attr(block_x, 'thread_extent', block_num)
    thread_x = tvm.thread_axis('threadIdx.x')
    ib.scope_attr(thread_x, 'thread_extent', thread_num)

    bidx = block_x
    tidx = thread_x
    warpid = tidx / 32
    warp_offset_output = warpid%block_row_warp*16*warp_row_tile\
    +warpid/block_row_warp*warp_col_tile*block_row_warp*warp_row_tile*256

    #include files
    include_file = tvm.call_intrin(
        "float32", "include_cpp_head",
        "/home/tusimple/Desktop/tvm_ir_test/conv2dv8.h")
    ib.emit(include_file)

    #define the double buffered shared memory
    declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a", "half",
                                "row_major", "a_frag", warp_col_tile)
    declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b", "half",
                                "col_major", "b_frag", warp_row_tile)
    declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator",
                                "half", "c_frag", warp_col_tile, warp_row_tile)
    ib.emit(declare_a)
    ib.emit(declare_b)
    ib.emit(declare_c)

    #define the shared memory for loading data and offset for loading the data
    shmem = ib.allocate("float16", 24576, name="shmem", scope="shared")
    offset_D_warp = offset_D_im2col + tidx / 2 * (16 + shieft) + tidx % 2 * 8
    offset_F_warp = offset_F + tidx / 2 * (16 + shieft) + tidx % 2 * 8

    #sync thread syntex
    sync = tvm.call_extern("float32", "__syncthreads")

    #since filter is usually small, load all filer used by the

    #define the main loop and calculate
    with ib.for_range(0, loop_len, name="blk_id") as blk_id:
        with ib.if_scope(bidx + blk_id * block_num < (rD * rF) /
                         (block_size_r * block_size_c)):
            #set the pointer to beginning of D
            Dp = D.access_ptr("r")
            #load the first data from global memory for the reuse of 9 times
            load_first_data = tvm.call_extern("float32", "load_matrix_D", Dp,
                                              shmem, blk_id, N, H, W, C, 0, 64)
            ib.emit(load_first_data)

            #set the pointer to beginning of F
            Fp = F.access_ptr("r")

            #load the first filter from global memory:
            load_filter = tvm.call_extern("float32", "load_matrix_F", Fp,
                                          shmem, offset_F_warp, blk_id, K, C,
                                          N, H, W, tidx % 2 * 8, 0, 64)
            ib.emit(load_filter)

            with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi:
                with ib.for_range(0, warp_row_tile,
                                  name="row_id_fi") as row_id_fi:
                    fill_O_zero = tvm.call_intrin("float",
                                                  "wmma_fill_fragment",
                                                  "c_frag", col_id_fi,
                                                  row_id_fi, "half", 0.)
                    ib.emit(fill_O_zero)
            ib.emit(sync)

            #load the first data from shmem to im2col shmem
            im2col = tvm.call_extern("float32", "im2col", shmem, offset_D_warp,
                                     0, 64)
            ib.emit(im2col)
            ib.emit(sync)

            #load temp

            with ib.for_range(0, C / 16, name="c_id",
                              for_type='unroll') as c_id:
                with ib.for_range(0, 9, name="ker_id",
                                  for_type='unroll') as ker_id:

                    #now load matrix fragment
                    with ib.for_range(0, warp_col_tile, name="col") as col:
                        load_matrix_frag_F = tvm.call_intrin("float32","wmma_load_matrix_sync","a_frag",col,shmem,\
                                                                                           offset_F+tidx/(32*block_row_warp)*\
                                                                                           (16*warp_col_tile*(16+shieft))+col*(16*(16+shieft)),16+shieft)
                        ib.emit(load_matrix_frag_F)

                    with ib.for_range(0, warp_row_tile, name="row") as row:
                        load_matrix_frag_D = tvm.call_intrin("float32","wmma_load_matrix_sync","b_frag",row,shmem,\
                                                                        offset_D_im2col+tidx%(32*block_row_warp)/32*\
                                                                        (16*warp_row_tile*(16+shieft))+row*(16*(16+shieft)),16+shieft)
                        ib.emit(load_matrix_frag_D)

                    ib.emit(sync)
                    #now compute
                    with ib.for_range(0, warp_col_tile,
                                      name="mma_col") as mma_col:
                        with ib.for_range(0, warp_row_tile,
                                          name="mma_row") as mma_row:
                            wmma_compute = tvm.call_intrin(
                                "float16", "wmma_mma_sync", "c_frag", "a_frag",
                                "b_frag", "c_frag", mma_col, mma_row)
                            ib.emit(wmma_compute)

                    with ib.if_scope(ker_id < 8):
                        #load filer of the next ieration
                        load_filter = tvm.call_extern("float32",
                                                      "load_matrix_F", Fp,
                                                      shmem, offset_F_warp,
                                                      blk_id, K, C, N, H, W,
                                                      c_id * 16 + tidx % 2 * 8,
                                                      ker_id + 1, 64)
                        ib.emit(load_filter)
                        #load data for next iteration
                        im2col = tvm.call_extern("float32", "im2col", shmem,
                                                 offset_D_warp, ker_id + 1, 64)
                        ib.emit(im2col)
                    ib.emit(sync)

                with ib.if_scope(c_id < C / 16 - 1):
                    #load the next 9 iteration data from global memory
                    load_data = tvm.call_extern("float32", "load_matrix_D", Dp,
                                                shmem, blk_id, N, H, W, C,
                                                c_id * 16 + 16, 64)
                    ib.emit(load_data)

                    #load filter for next cd iter
                    load_filter = tvm.call_extern(
                        "float32", "load_matrix_F", Fp, shmem, offset_F_warp,
                        blk_id, K, C, N, H, W, c_id * 16 + 16 + tidx % 2 * 8,
                        0, 64)
                    ib.emit(load_filter)
                    ib.emit(sync)

                    #load the first data from shmem to im2col shmem
                    im2col = tvm.call_extern("float32", "im2col", shmem,
                                             offset_D_warp, 0, 64)
                    ib.emit(im2col)
                    ib.emit(sync)
            #now start reload back to output
            #load fragment to shared memory first
            with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st:
                with ib.for_range(0, warp_row_tile,
                                  name="row_id_st") as row_id_st:
                    store_O_fragment = tvm.call_intrin(
                        "float32", "wmma_store_matrix_sync", shmem,
                        warp_offset_output + col_id_st *
                        (256 * warp_row_tile * block_row_warp) +
                        row_id_st * 16, "c_frag", col_id_st, row_id_st, 64)
                    ib.emit(store_O_fragment)
            ib.emit(sync)

            Op = O.access_ptr("w")
            store_O = tvm.call_extern("float32", "store_output", Op, shmem,
                                      blk_id, N, P, Q, K, 64)
            ib.emit(store_O)
            ib.emit(sync)
    body = ib.get()
    return (body)
コード例 #13
0
def my_log(x):
    # 通过内建函数构建表达式 my_log:内建函数名
    return tvm.call_intrin(x.dtype, 'my_log', x)