def test_multiple_kernels():
    N = 1024

    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])

    s = tvm.create_schedule([C.op])

    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
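# NOTE: the GPU-verification tests in this listing (test_multiple_kernels,
# test_local_memory, test_num_thread, test_wrong_bind) rely on a
# `get_verify_pass` helper defined elsewhere in the original test file.
# A minimal sketch, assuming the old-style tvm.ir_pass.VerifyGPUCode API:
def get_verify_pass(valid, **kwargs):
    def verify_pass(stmt):
        valid[0] = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
        return stmt
    return verify_pass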
def test_local_memory():
    N = 1024
    M = 128

    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    AA = s.cache_read(A, "local", [B])
    o, i = s[B].split(s[B].op.axis[0], M)
    s[AA].compute_at(s[B], o)
    s[B].bind(o, tvm.thread_axis("blockIdx.x"))

    # local memory usage: M * 4B
    # thread usage: M

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_local_memory_per_block=4 * M - 1,
                                max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_local_memory_per_block=4 * M,
                                max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
def test_num_thread():
    N = 1024
    M = 128

    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    o, i = s[B].split(s[B].op.axis[0], M)

    s[B].bind(o, tvm.thread_axis('threadIdx.x'))
    s[B].bind(i, tvm.thread_axis("threadIdx.y"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N,
                                max_thread_y=M-1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N,
                                max_thread_y=M))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W'+t.name,
                               scope=scope_ubuf,
                               offset_factor=16)

    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd',
                                ins[0].access_ptr("r"),
                                ins[1].access_ptr('r'),
                                outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()
    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
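# NOTE: the next test uses `find_top_produce` and `collect_visit`, helpers
# defined elsewhere in the original test file. A minimal sketch, assuming the
# old-style tvm.ir_pass.PostOrderVisit visitor and tvm.stmt node classes:
def collect_visit(stmt, f):
    ret = []
    tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x)))
    return ret

def find_top_produce(stmt):
    produces = []
    tvm.ir_pass.PostOrderVisit(
        stmt,
        lambda x: produces.append(x) if isinstance(x, tvm.stmt.ProducerConsumer) else None)
    # PostOrderVisit visits children first, so the outermost produce comes last.
    return produces[-1]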
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype="float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure it doesn't have any if statements left
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def test_llvm_madd_pipeline():
    def check_llvm(nn, base, stride):
        if not tvm.module.enabled("llvm"):
            return
        n = tvm.convert(nn)
        A = tvm.placeholder((n + base, stride), name='A')
        C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C')
        s = tvm.create_schedule(C.op)
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        s[C].parallel(xo)
        s[C].vectorize(xi)
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy()[base:] + 1)
    check_llvm(64, 0, 2)
    check_llvm(4, 0, 1)
    with tvm.build_config(restricted_func=False):
        check_llvm(4, 0, 3)
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i:
                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])
    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16,
                          offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func,
                                      binds={w: Wb})
def test_fold_const():
    c_data = np.array([1, 2, 3]).astype("float32")
    def before():
        c = relay.const(c_data)
        x = relay.var("x")
        y = relay.add(c, c)
        y = relay.multiply(y, relay.const(2, "float32"))
        y = relay.add(x, y)
        z = relay.add(y, c)
        return relay.Function([x], z)

    def expected():
        x = relay.var("x")
        c_folded = (c_data + c_data) * 2
        y = relay.add(x, relay.const(c_folded))
        z = relay.add(y, relay.const(c_data))
        return relay.Function([x], z)

    def fail(x):
        raise RuntimeError()
    # the fold constant should work on any context.
    with tvm.build_config(add_lower_pass=[(0, fail)]):
        with tvm.target.create("cuda"):
            zz = relay.ir_pass.fold_constant(before())
    zexpected = expected()
    assert relay.ir_pass.alpha_equal(zz, zexpected)
def dp4a(x_scope='local', y_scope='local', z_scope='local'):
    """
    Int8 dot product reduced by every 4 elements using __dp4a

    Parameters
    ----------
    x_scope : str, optional
        The storage scope of buffer for lhs
    y_scope : str, optional
        The storage scope of buffer for rhs
    z_scope : str, optional
        The storage scope of buffer for result

    Returns
    -------
    intrin : TensorIntrin
        The dp4a TensorIntrin that can be used in tensorizing schedule.
    """

    n = 4  # dp4a requires operands packed by 4
    x = tvm.placeholder((n,), name='x', dtype='int8')
    y = tvm.placeholder((n,), name='y', dtype='int8')

    k = tvm.reduce_axis((0, n), name='rc')

    z = tvm.compute((1,), lambda i: tvm.sum(
        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))

    def _intrin_func(ins, outs):
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()

        return _instr(0), _instr(1), _instr(2) # body, reset, update

    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
        scopes = {x: x_scope, y: y_scope, z: z_scope}
        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
                                    data_alignment=cfg.data_alignment,
                                    offset_factor=cfg.offset_factor,
                                    scope=scopes[t]) for t in [x, y, z]}

        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
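# Usage sketch for the intrinsic above (hypothetical: a CUDA schedule `s`
# whose stage `conv` has an innermost int8 reduction axis `ki` of length 4):
#
#   s[conv].tensorize(ki, dp4a('shared', 'shared', 'local'))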
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target', type=str, required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal'],
                        help="Compilation target.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=1000, help="Number of iterations during benchmark.")
    parser.add_argument('--repeat', type=int, default=1, help="Number of times to repeat the benchmark.")
    args = parser.parse_args()
    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    if args.target == "cuda":
        unroll = 1400
    else:
        unroll = 128
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=unroll,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid device overheating
        if i + 1 != args.repeat:
            time.sleep(45)
def intrin_vadd(n):
    x = tvm.placeholder((n,), name='vx')
    y = tvm.placeholder((n,), name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    def intrin_func(ins, outs):
        xx, yy = ins
        zz = outs[0]
        return tvm.call_packed("vadd", xx, yy, zz)
    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func)
    def op_intrin():
        bh = 9
        bw = 9
        x = tvm.placeholder((5, 5), name='A')
        y = tvm.compute((bh, bw), lambda i, j: x[j/3 + i%3, j%3 + i/3])

        def intrin_func(ins, outs):
            xx, = ins
            zz = outs[0]
            return tvm.call_packed("op", xx, zz)

        with tvm.build_config(offset_factor=2):
            return tvm.decl_tensor_intrin(y.op, intrin_func)
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    xo1, xo2 = s[C].split(xo, factor=13)
    s[C].parallel(xo2)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xo2, "parallel_stride_pattern")
    s[C].pragma(xo2, "parallel_barrier_when_finish")
    s[C].vectorize(xi)

    def check_c():
        if not tvm.module.enabled("llvm"):
            return
        # Specifically allow offset to test codepath when offset is available
        Ab = tvm.decl_buffer(
            A.shape, A.dtype,
            elem_offset=tvm.var('Aoffset'),
            offset_factor=8,
            name='A')
        binds = {A: Ab}
        # build and invoke the kernel.
        f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
        mhost = tvm.codegen.build_module(fsplits[0], "c")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m["fadd_pipeline"]
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())

    with tvm.build_config(offset_factor=4):
        check_c()
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    with tvm.build_config(auto_unroll_max_step=128,
                          unroll_explicit=(device == 'rocm')):
        func1 = tvm.build(s1, [A, W, B], device)
        func1(a, w, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        func2 = tvm.build(s2, [A, W, C], device)
        func2(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def build_config(debug_flag=0, **kwargs):
    """Build a build config for VTA.

    Parameters
    ----------
    debug_flag : int
        The debug flag to be passed.

    kwargs : dict
        Additional configurations.

    Returns
    -------
    build_config: BuildConfig
        The build config that can be used in TVM.

    Example
    -------
    .. code-block:: python

      # build a vta module.
      with vta.build_config():
          vta_module = tvm.build(s, ...)
    """
    env = get_env()
    def add_debug(stmt):
        debug = tvm.call_extern(
            "int32", "VTASetDebugMode",
            env.dev.command_handle,
            debug_flag)

        return tvm.make.stmt_seq(debug, stmt)
    pass_list = [(1, ir_pass.inject_dma_intrin),
                 (1, ir_pass.inject_skip_copy),
                 (1, ir_pass.annotate_alu_coproc_scope),
                 (1, lambda x: tvm.ir_pass.LiftAttrScope(x, "coproc_uop_scope", True)),
                 (1, lift_coproc_scope),
                 (1, ir_pass.inject_coproc_sync),
                 (1, early_rewrite)]
    if debug_flag:
        pass_list.append((1, add_debug))
    pass_list.append((2, ir_pass.inject_alu_intrin))
    pass_list.append((3, ir_pass.fold_uop_loop))
    pass_list.append((3, ir_pass.cpu_access_rewrite))
    return tvm.build_config(add_lower_pass=pass_list, **kwargs)
def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
    with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True):
        n = 21
        A = tvm.placeholder((n, ), name='A')
        B = tvm.placeholder((n, ), name='B')

        T = tvm.compute((n, ), lambda i: A[i + index_a] + B[i + index_b])
        s = tvm.create_schedule(T.op)
        xo, xi = s[T].split(T.op.axis[0], factor=4)
        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
        print(lowered_func.body)
        ctx = tvm.cpu(0)

        f = tvm.build(s, [A, B, T], "llvm")
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
        t = tvm.nd.empty((n,), T.dtype, ctx)
        f(a, b, t)
    def intrin_test():
      m1 = tvm.var("m1")
      n1 = tvm.var("n1")
      a = tvm.placeholder((m1, n1), name='a')
      c = tvm.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name='c')

      Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
      Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

      def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]
        def _body():
          ib = tvm.ir_builder.create()
          ib.emit(tvm.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r")))
          return ib.get()
        return _body()
      with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb})
def test_llvm_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    xo1, xo2 = s[C].split(xo, factor=13)
    s[C].parallel(xo2)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xo2, "parallel_stride_pattern")
    s[C].pragma(xo2, "parallel_barrier_when_finish")
    s[C].vectorize(xi)

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # Specifically allow offset to test codepath when offset is available
        Ab = tvm.decl_buffer(
            A.shape, A.dtype,
            elem_offset=tvm.var('Aoffset'),
            offset_factor=8,
            name='A')
        binds = {A: Ab}
        # build and invoke the kernel.
        f = tvm.build(s, [A, B, C], "llvm", binds=binds)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())

    with tvm.build_config(offset_factor=4):
        check_llvm()
def check_llvm(n):
    if not tvm.module.enabled("llvm"):
        return
    with tvm.build_config(instrument_bound_checkers=True):
        A = tvm.placeholder((n, ), name='A')
        scale = tvm.placeholder((), name='scale')
        k = tvm.reduce_axis((0, n), name="k")
        C = tvm.compute((), lambda: tvm.sum(A[k] * scale(), axis=k), name="C")
        D = tvm.compute((), lambda: C() + 1)
        s = tvm.create_schedule(D.op)
        # build and invoke the kernel.
        f = tvm.build(s, [A, scale, D], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
        sc = tvm.nd.array(
            np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
        d = tvm.nd.empty((), D.dtype, ctx)
        f(a, sc, d)
        d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
        tvm.testing.assert_allclose(d.asnumpy(), d_np)
def test_wrong_bind():
    N = 1024

    A = tvm.placeholder((N, N-1), name='A')
    B = tvm.compute((N, N-1), lambda i, j: A[i, j])

    s = tvm.create_schedule([B.op])

    # bind a thread axis to two loop axes with different lengths
    s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid, max_threads_per_block=N*N))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        s1 = topi.generic.schedule_conv2d_nchw([B])
        s2 = topi.generic.schedule_conv2d_nchw([C])
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    with tvm.build_config(auto_unroll_max_step=1400,
                          unroll_explicit=(device != "cuda")):
        func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
        func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
        func1(a, w, b)
        func2(a, w, c)
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def initialize_variables(ishape, idtype):
    """ Initialize variables stored in _all_var_init dictionary.

    Parameters
    ----------
    ishape : dict of str to tuple of int
        The input shape to the graph

    idtype : str or dict of str to str
        The input types to the graph

    Returns
    -------
    init_var : dict of str to tvm.ndarray
    """
    symbol_init_dict = {}
    const_init_dict = {}
    init_var = {}
    for key, value in _all_var_init.items():
        if isinstance(value, sym.Symbol):
            symbol_init_dict[key] = value
        else:
            const_init_dict[key] = tvm.nd.array(value)
    # Make sure variables are initialized only once.
    _all_var_init.clear()
    if symbol_init_dict:
        # Create dummy params to run initialization graph
        params = {}
        for name, shape in ishape.items():
            dtype = idtype if isinstance(idtype, str) else idtype[name]
            params[name] = tvm.nd.empty(shape, dtype, ctx=tvm.cpu())
        init_group_sym = sym.Group(symbol_init_dict.values())
        graph = _graph.create(init_group_sym)
        with tvm.build_config(auto_unroll_max_step=0):
            init_values = _run_graph(graph, params)
        init_var.update(dict(zip(symbol_init_dict.keys(), init_values)))
    init_var.update(const_init_dict)
    for name, data in init_var.items():
        ishape[name] = data.shape
    return init_var
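# Usage sketch (hypothetical input shape/dtype; `_all_var_init` must already
# have been populated, e.g. while importing a frontend model):
#
#   init_var = initialize_variables({"data": (1, 3, 224, 224)}, "float32")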
    def intrin_multivadd(n):
        n_a = tvm.var("n_a")
        Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a])

        n_b = tvm.var("n_b")
        Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b])

        n_c = tvm.var("n_c")
        Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c])

        z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd',
                                                        Ab.access_ptr("w", offset=n_a*i),
                                                        Bb.access_ptr("r", offset=n_b*i),
                                                        Cb.access_ptr("r", offset=n_c*i)))

        # replace the pattern with the multivadd call. I need to figure out
        # how to pass it the right parameters.
        def intrin_func(ins, outs):
            return tvm.call_packed("multivadd")

        with tvm.build_config():
            return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
    def intrin_vadd(n):
        dtype = 'float32'
        x = tvm.placeholder((n,), dtype=dtype, name='vx')
        y = tvm.placeholder((n,), dtype=dtype, name='vy')
        z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
        s = tvm.create_schedule(z.op)

        def create_buffer(t):
            return tvm.decl_buffer(t.shape, t.dtype,
                                   name='W'+t.name,
                                   offset_factor=16)

        def intrin_func(ins, outs):
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("float32", 'vadd',
                                    ins[0].access_ptr("r"), ins[1].access_ptr('r'),
                                    outs[0].access_ptr('wr')))
            return ib.get()

        with tvm.build_config(offset_factor=16):
            return tvm.decl_tensor_intrin(z.op, intrin_func, binds={x: create_buffer(x),
                                                                    y: create_buffer(y),
                                                                    z: create_buffer(z)})
    def check_device(target):
        with tvm.build_config(
                detect_global_barrier=detect_global_barrier,
                auto_unroll_max_step=128,
                unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden//2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # Skip first pass as it is compilation
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i  in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5:
                        print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j]))
            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" %test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" %test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" %test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError

    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())

    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
def precompute_prune(graph, params):
    """Precompute the part of graph that can be pre-computed.

    This will create a new graph that only contains the ops
    that need to be computed depending on input as well as
    updated version of param dict that pre-computes some of
    intermediate results.

    Parameters
    ----------
    graph : Graph
        The input graph

    params : dict of str -> tvm.NDArray
        The parameter dictionary of the graph

    Returns
    -------
    pruned_graph : Graph
        The pruned graph

    new_params : dict of str-> tvm.NDArray
        The updated dictionary of parameters.
    """
    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
    graph._set_json_attr("param_name_list", list(params.keys()), "list_str")
    graph = graph.apply("PrecomputePrune")
    pre_graph = graph_attr._move_out_graph(graph, "precompute_graph")
    if pre_graph is None:
        return graph, params
    out_names = pre_graph.json_attr("output_names")
    if not pre_graph.symbol.list_output_names():
        return graph, params
    with tvm.build_config(auto_unroll_max_step=0):
        out_arrs = _run_graph(pre_graph, params)
    return graph, dict(zip(out_names, out_arrs))
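# Usage sketch (hypothetical `graph` and `params` coming from an nnvm
# frontend import):
#
#   pruned_graph, new_params = precompute_prune(graph, params)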
def test_num_thread():
    N = 1024
    M = 128

    A = tvm.placeholder((N, ), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    o, i = s[B].split(s[B].op.axis[0], M)

    s[B].bind(o, tvm.thread_axis('threadIdx.x'))
    s[B].bind(i, tvm.thread_axis("threadIdx.y"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(
                **{
                    "add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N - 1))]
                }):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(
                **{
                    "add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N))]
                }):
            tvm.build(s, [A, B], target)
        assert valid[0]

        with tvm.build_config(
                **{
                    "add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N,
                                        max_thread_y=M - 1))]
                }):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(
                **{
                    "add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N,
                                        max_thread_y=M))]
                }):
            tvm.build(s, [A, B], target)
        assert valid[0]
if "vta" in target:
    sym = vta.graph.pack(sym, shape_dict, factor)

graph_attr.set_shape_inputs(sym, shape_dict)
sym = sym.apply("InferShape")
graph_attr.set_dtype_inputs(sym, dtype_dict)
sym = sym.apply("InferType")
timers['execution_time_prepare_graph'] = time.time() - dt

with nnvm.compiler.build_config(opt_level=3):
    bdict = {}
    if "vta" not in target:
        bdict = {"add_lower_pass": []}
    else:
        bdict = {"add_lower_pass": vta.debug_mode(0)}
    with tvm.build_config(**bdict):
        graph, lib, params = nnvm.compiler.build(sym,
                                                 target,
                                                 shape_dict,
                                                 dtype_dict,
                                                 params=params)

print("connecting ...")
dt = time.time()
remote = rpc.connect(host, port)
temp = util.tempdir()
lib.save(temp.relpath("graphlib.o"))
remote.upload(temp.relpath("graphlib.o"))
timers['execution_time_upload_graph'] = time.time() - dt
lib = remote.load_module("graphlib.o")
ctx = remote.ext_dev(0) if "vta" in target else remote.cpu(0)
def lstm():
    if not PERSIST_KERNEL:
        raise ValueError("Non persist LSTM not yet supported")
    num_thread_y = 8
    num_thread_x = 16 * 3 // 2
    num_sm = 24
    n_num_step = 128
    num_step = tvm.te.var('num_step')
    num_hidden = 1152 // 2
    batch_size = 1
    # Global transition matrix
    # Input hidden channel can be pre-calculated by a gemm
    Xi2h = tvm.te.placeholder((num_step, batch_size, 4, num_hidden),
                              name="Xi2h")
    # Only handle the hidden transition, to save space.
    Wh2h = tvm.te.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.te.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.te.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.te.compute((1, batch_size, num_hidden),
                              lambda *i: 0.0,
                              name="init_c")
    s_init_h = tvm.te.compute((1, batch_size, num_hidden),
                              lambda *i: 0.0,
                              name="init_h")
    # LSTM transition
    k = tvm.te.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.te.compute(
        (num_step, batch_size, 4, num_hidden),
        lambda t, i, x, j: tvm.te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k],
                                      axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.te.compute(Xi2h.shape,
                           lambda *i: Xi2h(*i) + s_h2h(*i),
                           name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.te.compute(gshape,
                             lambda t, i, j: tvm.te.sigmoid(gates[t, i, 0, j]),
                             name="in_gate")
    in_transform = tvm.te.compute(
        gshape,
        lambda t, i, j: tvm.te.tanh(gates[t, i, 1, j]),
        name="in_transform")
    forget_gate = tvm.te.compute(
        gshape,
        lambda t, i, j: tvm.te.sigmoid(gates[t, i, 2, j]),
        name="forget_gate")
    out_gate = tvm.te.compute(
        gshape,
        lambda t, i, j: tvm.te.sigmoid(gates[t, i, 3, j]),
        name="out_gate")
    next_c = tvm.te.compute(
        gshape,
        lambda t, i, j: forget_gate[t, i, j] * s_state_c[
            t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j],
        name="next_c")
    next_h = tvm.te.compute(
        gshape,
        lambda t, i, j: out_gate[t, i, j] * tvm.te.tanh(next_c[t, i, j]),
        name="next_h")
    update_c = tvm.te.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.te.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # schedule
    scan_h, scan_c = tvm.te.scan([s_init_h, s_init_c], [update_h, update_c],
                                 [s_state_h, s_state_c],
                                 inputs=[Xi2h],
                                 name="lstm_scan")
    # schedule
    s = tvm.te.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()

    block_x = tvm.te.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = tvm.te.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = tvm.te.thread_axis((0, num_thread_y), "threadIdx.y")

    s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h])
    print(s[s_state_h_S].op.axis, s[s_state_h_S].op.reduce_axis)
    s_state_c_S = s.cache_read(s_state_c, "shared", [next_c])
    Wh2hL = s.cache_read(Wh2h, "local", [s_h2h])

    ko, ki = s[s_h2h].split(s[s_h2h].op.reduce_axis[0], nparts=num_thread_y)
    s_h2h_rf = s.rfactor(s_h2h, ko)
    print(s[s_h2h_rf].op.axis, s[s_h2h_rf].op.reduce_axis)
    print(s[s_h2h].op.axis, s[s_h2h].op.reduce_axis)
    print(s[s_h2h_rf].op.input_tensors)
    s[s_h2h].bind(s[s_h2h].op.reduce_axis[0], thread_y)
    s[s_h2h_rf].compute_at(s[s_h2h], s[s_h2h].op.reduce_axis[0])

    if PERSIST_KERNEL:
        s[scan_h.op].env_threads([block_x, thread_y, thread_x])
        s[Wh2hL].compute_at(s[scan_h.op], thread_x)
    else:
        s[Wh2hL].compute_at(s[s_h2h], s[s_h2h].op.axis[3])

    if UNROLL_WLOAD:
        s[Wh2hL].unroll(Wh2hL.op.axis[0])
        s[Wh2hL].unroll(Wh2hL.op.axis[2])

    s[s_state_h_S].compute_at(s[s_h2h_rf], s[s_h2h_rf].op.axis[3])
    s[s_state_c_S].compute_at(s[scan_h.op], s[scan_h].op.scan_axis)
    print(s[s_state_h_S].op.axis, s[s_state_h_S].op.reduce_axis)

    for ss in [s_state_h_S]:
        xo, xi = s[ss].split(ss.op.axis[2], factor=num_thread_x * num_thread_y)
        ty, xi = s[ss].split(xi, nparts=num_thread_y)
        tx, xi = s[ss].split(xi, nparts=num_thread_x)
        s[ss].bind(ty, thread_y)
        s[ss].bind(tx, thread_x)

    for init in [s_init_c, s_init_h]:
        bx, xi = s[init].split(init.op.axis[2], nparts=num_sm)
        tx, xi = s[init].split(xi, nparts=num_thread_x)
        s[init].bind(bx, block_x)
        s[init].bind(tx, thread_x)

    # s[next_c].set_store_predicate(thread_y.equal(0))
    # s[next_h].set_store_predicate(thread_y.equal(0))

    for update in [update_c, update_h]:
        bx, xi = s[update].split(s[update].op.axis[2], nparts=num_sm)
        tx, xi = s[update].split(xi, nparts=num_thread_x)
        s[update].bind(bx, block_x)
        s[update].bind(tx, thread_x)
        # s[update].set_store_predicate(thread_y.equal(0))

    # verify we can lower correctly
    def check_device(target):
        num_step = n_num_step
        print(tvm.lower(s, [Xi2h, Wh2h, scan_h, scan_c], simple_mode=True))
        flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        scan_h_np = np.zeros(
            (num_step, batch_size, num_hidden)).astype("float32")
        scan_c_np = np.zeros(
            (num_step, batch_size, num_hidden)).astype("float32")
        Xi2h_np = np.random.normal(size=(num_step, batch_size, 4,
                                         num_hidden)).astype("float32")
        Wh2h_np = np.random.normal(size=(4, num_hidden,
                                         num_hidden)).astype("float32")
        scan_h_a = tvm.nd.array(scan_h_np, ctx)
        scan_c_a = tvm.nd.array(scan_c_np, ctx)
        Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
        Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
        flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
        ctx.sync()
        # measure time cost of second step.
        evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
        eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
        print("Time cost=%g" % eval_result.mean)

    # set unroll_explicit for more readable code.
    with tvm.build_config(detect_global_barrier=DETECT_GLOBAL_BARRIER,
                          auto_unroll_max_step=128,
                          unroll_explicit=False):
        check_device("cuda")
def intrin_col2im(input_shape, output_shape, kernel, stride, pad, dtype):
    '''
    Compute col2im via cce col2im intrin function call directly

    Args:
        input_shape: the shape of the image
        output_shape: the shape of the result of im2col given the input image
        kernel: kernel sizes for im2col
        stride: stride sizes for im2col
        pad: padding sizes for im2col, including padding top, bottom, left, and right
        dtype: type of the data

    Returns:
        cce intrin function call for col2im
    '''

    _, _, _, _, WINDOW_H, WINDOW_W, _ = input_shape
    _, _, H, W, _ = output_shape
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    assert (
        WINDOW_H * WINDOW_W
    ) % 16 == 0, "Number of windows over the input must be divisible by 16 (col2im repeat)"
    assert (
        H * W *
        16) % 64 == 0, "Input size must be divisible by 64 (vector_dup repeat)"

    # FCOL2IMG -------------------------------------------
    INPUT_W = W
    INPUT_H = H

    PAD_LEFT = pad_l
    PAD_RIGHT = pad_r
    PAD_TOP = pad_t
    PAD_BOTTOM = pad_b
    # ---------------------------------------------------
    # Xm ------------------------------------------------
    W_IDX_KERNEL = 0
    H_IDX_KERNEL = 0

    H_IDX = (-pad_t) & 0xFFFF  # fix negative numbers
    W_IDX = (-pad_l) & 0xFFFF

    C1_IDX = 0
    # ---------------------------------------------------
    # Xt ------------------------------------------------
    STRIDE_H = stride_h
    STRIDE_W = stride_w

    KERNEL_H = kernel_h
    KERNEL_W = kernel_w

    DILATION_H = 1
    DILATION_W = 1

    JUMP_OFFSET = 0
    REPEAT_MODE = 1
    REPEAT_TIME = (WINDOW_H * WINDOW_W) // 16
    # ---------------------------------------------------

    INPUT_B = 1
    INPUT_C1 = 1
    INPUT_C0 = 16

    input_data = tvm.placeholder(
        (INPUT_B, INPUT_C1, KERNEL_H, KERNEL_W, WINDOW_H, WINDOW_W, INPUT_C0),
        dtype=dtype)

    result = tvm.compute(
        (INPUT_B, INPUT_C1, INPUT_H, INPUT_W, INPUT_C0),
        lambda b, c1, h, w, c0: input_data[b, c1, h % KERNEL_H, w % KERNEL_W, h
                                           % WINDOW_H, w % WINDOW_W, c0],
        name="col2im_intrinsic",
    )

    input_data_buff = tvm.decl_buffer(input_data.shape,
                                      input_data.dtype,
                                      name="input_data_buff",
                                      offset_factor=1,
                                      scope="local.UB")

    result_buff = tvm.decl_buffer(result.shape,
                                  result.dtype,
                                  name="result_buff",
                                  offset_factor=1,
                                  scope="local.UB")

    def pack_args(sp):
        assert len(sp) == 20
        fcol2img = (akg.tvm.const(sp[0], "uint64") +
                    akg.tvm.const(sp[1] * 2**16, "uint64") +
                    akg.tvm.const(sp[2] * 2**32, "uint64") +
                    akg.tvm.const(sp[3] * 2**40, "uint64") +
                    akg.tvm.const(sp[4] * 2**48, "uint64") +
                    akg.tvm.const(sp[5] * 2**56, "uint64"))

        Xm = (akg.tvm.const(sp[6] * 2**16, "uint64") +
              akg.tvm.const(sp[7] * 2**24, "uint64") +
              akg.tvm.const(sp[8] * 2**32, "uint64") +
              akg.tvm.const(sp[9] * 2**48, "uint64") +
              akg.tvm.const(sp[10], "uint64"))

        Xt = (akg.tvm.const(sp[11], "uint64") +
              akg.tvm.const(sp[12] * 2**6, "uint64") +
              akg.tvm.const(sp[13] * 2**12, "uint64") +
              akg.tvm.const(sp[14] * 2**20, "uint64") +
              akg.tvm.const(sp[15] * 2**28, "uint64") +
              akg.tvm.const(sp[16] * 2**36, "uint64") +
              akg.tvm.const(sp[17] * 2**44, "uint64") +
              akg.tvm.const(sp[18] * 2**52, "uint64") +
              akg.tvm.const(sp[19] * 2**56, "uint64"))

        return (fcol2img, Xm, Xt)

    def intrin_func(ins, outs):
        sp = [
            INPUT_W,
            INPUT_H,
            PAD_LEFT,
            PAD_RIGHT,
            PAD_TOP,
            PAD_BOTTOM,  # FMATRIX
            W_IDX_KERNEL,
            H_IDX_KERNEL,
            W_IDX,
            H_IDX,
            C1_IDX,  # Xm
            STRIDE_W,
            STRIDE_H,
            KERNEL_W,
            KERNEL_H,
            DILATION_W,
            DILATION_H,
            JUMP_OFFSET,
            REPEAT_MODE,
            REPEAT_TIME,  # Xt
        ]
        aa = ins[0]
        bb = outs[0]
        ib = tvm.ir_builder.create()
        fcol2img, Xm, Xt = pack_args(sp)
        ib.emit(tvm.call_extern(dtype, "set_fcol2img", fcol2img))
        ib.emit(
            tvm.call_extern(dtype, "vector_dup", bb.access_ptr("w"), 0,
                            (INPUT_H * INPUT_W * 16) // 64, 1, 1, 8, 8))
        c = 0
        for kh in range(KERNEL_H):
            for kw in range(KERNEL_W):
                sp[6] = kw
                sp[7] = kh
                fcol2img, Xm, Xt = pack_args(sp)
                ib.emit(
                    tvm.call_extern(
                        dtype,
                        "col2img",
                        bb.access_ptr("rw"),
                        aa.access_ptr("r",
                                      offset=c * 16 * INPUT_C0 * REPEAT_TIME),
                        Xm,
                        Xt,
                    ))
                c += 1
        return ib.get()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(result.op,
                                      intrin_func,
                                      binds={
                                          input_data: input_data_buff,
                                          result: result_buff
                                      })
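# Usage sketch (hypothetical: a schedule `s` over a matching col2im compute
# stage `res`, tensorized over its batch axis):
#
#   col2im = intrin_col2im(input_shape, output_shape, kernel, stride, pad, "float16")
#   s[res].tensorize(res.op.axis[0], col2im)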
def dot_16x1x16_int8_int8_int32():
    """
    Int8 dot product by every 4 elements using AVX2 Skylake instructions.
    This function takes two arrays of int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16 # 16 int32 lanes in AVX512
    num_int8_elements = 4 # 4 int8 elements in int32
    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute((int32_lanes,),
                    lambda i: tvm.sum(data[k].astype('int32') *
                                      kernel[i, k].astype('int32'),
                                      axis=k),
                    name="C")

    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.const(1, "int16x32")
            pair_reduction = tvm.call_llvm_intrin('int16x32',
                                                  'llvm.x86.avx512.pmaddubs.w.512',
                                                  tvm.const(0, 'uint32'),
                                                  vec_a, vec_b)
            quad_reduction = tvm.call_llvm_intrin('int32x16',
                                                  'llvm.x86.avx512.pmaddw.d.512',
                                                  tvm.const(0, 'uint32'),
                                                  pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
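# Usage sketch (hypothetical: an x86 conv2d schedule `s` whose update stage
# `CC` ends with a 16-lane output-channel axis `oc_i` over a 4-element int8
# reduction, matching the intrinsic's compute pattern):
#
#   s[CC].tensorize(oc_i, dot_16x1x16_int8_int8_int32())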
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target',
                        type=str,
                        required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
                        help="Compilation target.")
    parser.add_argument('--opt-level',
                        type=int,
                        default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter',
                        type=int,
                        default=1000,
                        help="Number of iteration during benchmark.")
    parser.add_argument('--repeat',
                        type=int,
                        default=1,
                        help="Number of repeative times.")
    args = parser.parse_args()
    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(batch_size=1,
                                                       image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    if args.target == "cuda":
        unroll = 1400
    else:
        unroll = 128
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=unroll,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid device overheating
        if i + 1 != args.repeat:
            time.sleep(45)
def intrin_libxsmm_tuned(ofmblock, ofw, ifmblock, stride_width, ifw, rco, ifh,
                         r, s, ifh_stride, ifw_stride, in_channel):
    last_input_width_index = (ofw - 1) * stride_width + s - 1
    A = tvm.placeholder((rco, r, s, ifmblock, ofmblock), name='w')
    B = tvm.placeholder((rco, r, last_input_width_index + 1, ifmblock),
                        name='b')
    k = tvm.reduce_axis((0, ifmblock), name='k')
    k_outer = tvm.reduce_axis((0, rco), name='k_outer')
    ry = tvm.reduce_axis((0, r), name='ry')
    rx = tvm.reduce_axis((0, s), name='rx')
    C = tvm.compute((ofw, ofmblock),
                    lambda m, n: tvm.sum(A[k_outer, ry, rx, k, n] * B[
                        k_outer, ry, rx + m * stride_width, k],
                                         axis=[k_outer, ry, rx, k]),
                    name='out')
    s1 = tvm.create_schedule(C.op)
    w, ofm = s1[C].op.axis
    kco, ky, kx, kci = s1[C].op.reduce_axis
    s1[C].reorder(kco, ky, kx, w, ofm, kci)
    xx_ptr = tvm.decl_buffer(A.shape,
                             A.dtype,
                             name="W",
                             offset_factor=1,
                             data_alignment=64)

    yy_ptr = tvm.decl_buffer(
        B.shape,
        B.dtype,
        name="some",
        offset_factor=1,
        strides=[tvm.var("s3"), tvm.var("s2"), ifmblock, 1],
        data_alignment=64)

    zz_ptr = tvm.decl_buffer(C.shape,
                             C.dtype,
                             name="OUT",
                             offset_factor=1,
                             data_alignment=64)

    def intrin_func(ins, outs):
        # tvm.call_extern is used to interface with the libxsmm
        # batch-reduce GEMM kernel implementation; rco*r*s is the
        # number of batches
        init_and_compute = tvm.call_extern(
            "int32", "batch_reduce_kernel_init_update",
            ins[0].access_ptr("r"), ins[1].access_ptr("r"),
            outs[0].access_ptr("w"), rco * r * s, ofmblock, ifmblock,
            r, s, ifh_stride, ifw_stride, ofw, stride_width)
        reset = tvm.call_extern("int32", "batch_reduce_kernel_init",
                                outs[0].access_ptr("w"), ofmblock, ofw)
        body = tvm.call_extern(
            "int32", "batch_reduce_kernel_update",
            ins[0].access_ptr("r"), ins[1].access_ptr("r"),
            outs[0].access_ptr("w"), rco * r * s, ofmblock, ifmblock,
            ofw, stride_width, r, s, ifh_stride, ifw_stride)
        # If the reduction already covers all input channels, the fused
        # init+update kernel can serve as both body and update.
        if math.ceil(in_channel / ifmblock) == rco:
            return init_and_compute, None, init_and_compute
        else:
            return init_and_compute, reset, body

    with tvm.build_config(data_alignment=64):
        return tvm.decl_tensor_intrin(C.op,
                                      intrin_func,
                                      name="GEMM",
                                      binds={
                                          A: xx_ptr,
                                          B: yy_ptr,
                                          C: zz_ptr
                                      })
Example #36
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of the convolution.
# Since TensorCores are only supported on NVIDIA GPUs with Compute Capability 7.0 or higher, this section
# may not run on the build server.

ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    with tvm.build_config(auto_unroll_max_step=16):
        func = tvm.build(s, [A, W, Conv], 'cuda')
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

###############################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# call TensorCores on specific GPUs.
Example #37
def dot_int8_int8_int32(int32_lanes, dtype='uint'):
    """
    Int8 dot product of every 4 elements using the ARM v8.2 udot/sdot
    instructions. This function takes two arrays of int8 datatype --
    data[4] and kernel[int32_lanes][4] -- and computes a dot product of
    data[4] with every 4 elements of kernel, resulting in
    output[int32_lanes] of int32/uint32 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_int8_int8_int32(int8 data[4], int8 kernel[int32_lanes][4], int32 output[int32_lanes]){
            for (int i = 0; i < int32_lanes; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in a vector register and
    the data[4] is broadcasted to another vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Parameters
    ----------
    int32_lanes: int
        How many int32/uint32 to produce
    dtype: str, optional, {"uint", "int"}
        Whether it works on unsigned int or signed int

    Returns
    -------
    intrin : TensorIntrin
        The ARM uint8 TensorIntrin that can be used in tensorizing schedule
    """
    num_int8_elements = 4  # 4 int8 elements in int32

    data = tvm.placeholder((num_int8_elements, ),
                           dtype='%s8' % dtype,
                           name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements),
                             dtype='%s8' % dtype,
                             name='kernel')

    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute((int32_lanes, ),
                    lambda i: tvm.sum(data[k].astype('%s32' % dtype) * kernel[
                        i, k].astype('%s32' % dtype),
                                      axis=k),
                    name="C")

    a_buffer = tvm.decl_buffer(data.shape,
                               dtype='%s8' % dtype,
                               name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape,
                               dtype='%s8' % dtype,
                               name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('s'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(
                    0, tvm.const(0, '%s32x%d' % (dtype, int32_lanes))))
                return ib.get()

            dtype_a = '%s8x%d' % (dtype, num_int8_elements)
            dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements)
            dtype_c = '%s32x%d' % (dtype, int32_lanes)

            a_int8 = ins[0].vload([0], dtype_a)
            re_int32 = tvm.call_pure_intrin('%s32' % dtype, 'reinterpret',
                                            a_int8)
            # broadcast a
            vec_ai32 = re_int32.astype(dtype_c)

            vec_a = tvm.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], dtype_b)
            vec_c = outs[0].vload([0], dtype_c)

            inst = 'udot' if dtype == 'uint' else 'sdot'
            inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % (
                inst, int32_lanes, int32_lanes * num_int8_elements)
            vdot = tvm.call_llvm_intrin(dtype_c, inst, tvm.const(2, 'uint32'),
                                        vec_c, vec_a, vec_b)
            ib.emit(outs[0].vstore(0, vdot))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op,
                                      _intrin_func,
                                      binds={
                                          data: a_buffer,
                                          kernel: b_buffer
                                      })
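A hedged usage sketch (the schedule ``s`` and axis ``inner_axis`` below are illustrative, not from the source):

    dot_prod = dot_int8_int8_int32(int32_lanes=4, dtype='uint')
    # tensorize the innermost 4-element reduction of an int8 schedule:
    # s[C].tensorize(inner_axis, dot_prod)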
Example #38
def tune_and_evaluate(M, N, L, dtype, layout):
    task = autotvm.task.create(test_gemm,
                               args=(N, L, M, dtype, layout),
                               target='cuda')
    print(task.config_space)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=5))

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    dispatch_context = autotvm.apply_history_best("matmul.log")
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.create("cuda"):
            with tvm.build_config():
                s, arg_bufs = test_gemm(N, L, M, dtype, layout)
                print(tvm.lower(s, arg_bufs, simple_mode=True))
                func = tvm.build(s, arg_bufs)
    dev_module = func.imported_modules[0]
    print(dev_module.get_source())

    # check correctness
    if (layout == "NN"):
        shape_a = (N, L)
        shape_b = (L, M)
    elif (layout == "NT"):
        shape_a = (L, N)
        shape_b = (L, M)
    elif (layout == "TN"):
        shape_a = (N, L)
        shape_b = (M, L)
    elif (layout == "TT"):
        shape_a = (L, N)
        shape_b = (M, L)

    a_np = None
    b_np = None
    c_np = None
    c_np_type = None
    if dtype == 'float16':
        c_np_type = np.float32
        a_np = np.random.uniform(size=shape_a).astype(np.float16)
        b_np = np.random.uniform(size=shape_b).astype(np.float16)
        if (layout == "NN"):
            c_np = np.dot(a_np, b_np)
        elif (layout == "NT"):
            c_np = np.dot(a_np.T, b_np)
        elif (layout == "TN"):
            c_np = np.dot(a_np, b_np.T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.T, b_np.T)
    elif dtype == 'int8':
        c_np_type = np.int32
        a_np = np.random.randint(low=-128, high=127,
                                 size=shape_a).astype(np.int8)
        b_np = np.random.randint(low=-128, high=127,
                                 size=shape_b).astype(np.int8)
        if (layout == "NN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
        elif (layout == "NT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
        elif (layout == "TN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)

    # ``ctx`` is not defined in this snippet; a CUDA context is assumed.
    ctx = tvm.gpu(0)
    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    b_tvm = tvm.nd.array(b_np, ctx=ctx)
    func(a_tvm, b_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
    print('Time cost of this operator: %f' %
          evaluator(a_tvm, b_tvm, c_tvm).mean)
Example #39
#####################################################################
# In TVM, there is a property called ``BuildConfig``. You can use this property to customize your
# own lowering options. In this case, we inject the pass written above into the TVM standard lowering
# passes by feeding **a list of tuples** as argument to ``add_lower_pass``. The integer in each tuple
# selects the phase of lowering after which the pass runs. TVM has four phases of lowering, and
# user-customized passes are called after each phase is done.
#
# .. note::
#     Here are the essential transformations done by each phase:
#       - Phase 0 generates the raw IR and loop levels.
#       - Phase 1 flattens the array storage.
#       - Phase 2 transforms loops, like unroll, vectorization and thread-binding.
#       - Phase 3 does some cleanup work.
#
# Thus, a good place to put this transformation pass is just after Phase 1.
#

with tvm.build_config(add_lower_pass=[(1, vectorize)]) as cfg:
    print(tvm.lower(sch, [a, b, c], simple_mode=True))

#####################################################################
# Quick View
# ----------
# This tutorial gives a quick view of writing a customized IR transformation pass:
# - Use ``tvm.ir_pass.PostOrderVisit`` to gather information on each IR node.
# - Use ``tvm.ir_pass.IRTransform`` to transform IR nodes.
# - Wrap up the two above to write an IR-transformation function.
# - Use ``tvm.build_config`` to plug this function into the TVM lowering passes.
#
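
#####################################################################
# As a minimal sketch (not part of the original tutorial; the pass body
# is deliberately a no-op), a function registered via ``add_lower_pass``
# under this pre-0.7 tvm API takes a ``Stmt`` and returns a ``Stmt``:

def my_noop_pass(stmt):
    loops = []

    def visit(op):
        # PostOrderVisit callback: only gathers information, never mutates.
        if isinstance(op, tvm.stmt.For):
            loops.append(op)

    tvm.ir_pass.PostOrderVisit(stmt, visit)
    # A real pass would now rewrite ``stmt`` (e.g. via
    # ``tvm.ir_pass.IRTransform``); this sketch returns it unchanged.
    return stmt

with tvm.build_config(add_lower_pass=[(1, my_noop_pass)]):
    print(tvm.lower(sch, [a, b, c], simple_mode=True))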
Example #40
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size),
                            name='W')

        out_dtype = 'float32'
        wkl = _get_workload(A, W, stride, padding, out_dtype)
        sch = Im2ColPack(7, 8, 1, 8, True)

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        # device = 'llvm'
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):

            B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype)
            s = tvm.create_schedule(B.op)
            traverse(s, B.op)

            op = B.op
            output = op.output(0)
            conv_out = op.input_tensors[0]
            kernel_vec = conv_out.op.input_tensors[1]
            kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            data_col = data_vec.op.input_tensors[0]
            data = data_col.op.input_tensors[0]
            data_pad = None
            if isinstance(data.op,
                          tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col,
                                    data_vec, kernel, kernel_vec, conv_out,
                                    output, B)

            print(tvm.lower(s, [A, W, B], simple_mode=True))

            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             ctx)
            func = tvm.build(s, [A, W, B], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a, w, b).mean
            print('conv: %g secs/op' % cost)

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            print(b_np.shape)
Example #41
def dot_16x1x16_int8_int8_int32():
    """
    Int8 dot product of every 4 elements using AVX512 Skylake instructions.
    This function takes two arrays -- uint8 data[4] and int8
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernel, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.

    .. code-block:: c
        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16  # 16 int32 lanes in AVX512
    num_int8_elements = 4  # 4 int8 elements in int32
    data = tvm.placeholder((num_int8_elements, ), dtype='uint8', name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements),
                             dtype='int8',
                             name='kernel')
    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute(
        (int32_lanes, ),
        lambda i: tvm.sum(
            data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k),
        name="C")

    a_buffer = tvm.decl_buffer(data.shape,
                               dtype='uint8',
                               name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape,
                               dtype='int8',
                               name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.const(1, "int16x32")
            pair_reduction = tvm.call_llvm_intrin(
                'int16x32', 'llvm.x86.avx512.pmaddubs.w.512',
                tvm.const(0, 'uint32'), vec_a, vec_b)
            quad_reduction = tvm.call_llvm_intrin(
                'int32x16', 'llvm.x86.avx512.pmaddw.d.512',
                tvm.const(0, 'uint32'), pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op,
                                      _intrin_func,
                                      binds={
                                          data: a_buffer,
                                          kernel: b_buffer
                                      })
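A hedged usage sketch (schedule and axis names are illustrative, not from the source):

    dot_16x1x16 = dot_16x1x16_int8_int8_int32()
    # tensorize a 16-lane int8 reduction on a Skylake AVX512 target:
    # s[C].tensorize(inner_axis, dot_16x1x16)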
Example #42
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')

        out_dtype = 'float32'
        wkl, sch_default = _spatial_get_sch(A, W, stride, padding, out_dtype)
        sch = sch_default if schedule is None else schedule

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        # device = 'llvm'
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):
            print('--- schedule data packing ---')
            A_vec, s = _spatial_pack_data_only(wkl, sch, A)
            print(A_vec.shape)
            a_vec_shape = get_const_tuple(A_vec.shape)
            a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx)
            print(tvm.lower(s, [A, A_vec], simple_mode=True))
            func = tvm.build(s, [A, A_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a, a_vec).mean
            print('data -> data_vec: %g secs/op' % cost)

            print('--- schedule kernel packing ---')
            W_vec, s = _spatial_pack_kernel_only(wkl, sch, W)
            print(W_vec.shape)
            w_vec_shape = get_const_tuple(W_vec.shape)
            w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx)
            # print(tvm.lower(s, [W, W_vec], simple_mode=True))
            func = tvm.build(s, [W, W_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(w, w_vec).mean
            print('kernel -> kernel_vec: %g secs/op' % cost)

            print('--- schedule conv & unpack ---')
            A_vec = tvm.placeholder(a_vec_shape, name='A_vec')
            W_vec = tvm.placeholder(w_vec_shape, name='W_vec')
            B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype)
            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             ctx)
            # print(tvm.lower(s, [A_vec, W_vec, B], simple_mode=True))
            func = tvm.build(s, [A_vec, W_vec, B], target=device)
            func.save('conv_unpack.asm')
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a_vec, w_vec, b).mean
            print('conv & unpack: %g secs/op' % cost)

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            print(b_np.shape)
Example #43
def test_gemm():
    # graph
    nn = 2048
    n = tvm.var('n')
    n = tvm.convert(nn)
    m, l = n, n
    A = tvm.placeholder((l, n), name='A')
    B = tvm.placeholder((l, m), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((m, n),
                    lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k),
                    name='C')

    # schedule
    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_thread)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    ko, ki = s[CC].split(k, factor=8)
    kt, ki = s[CC].split(ki, factor=1)
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
    s[CC].unroll(kt)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    s[AA].vectorize(xi)
    # Schedule for B's shared memory load
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s[BB].vectorize(xi)
    s[AA].double_buffer()
    s[BB].double_buffer()

    # correctness
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Device %s" % device)
        f = tvm.build(s, [A, B, C], device)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        for i in range(2):
            f(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.dot(b_np.T, a_np),
                                    rtol=1e-5)

        num_flops = 2 * nn * nn * nn
        num_runs = 10
        timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
        t = timer_f(a, b, c).mean
        GFLOPS = num_flops / (t * 1e3) / 1e6
        print("average time cost of %d runs = %g ms, %g GFLOPS." %
              (num_runs, t * 1e3, GFLOPS))

    for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]:
        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=(device != "cuda")):
            check_device(device)
Example #44
def _intrin_popcount(m, k_i, w_b, x_b, unipolar):
    pack_dtype = 'uint8'
    w = tvm.placeholder((w_b, m, k_i), dtype=pack_dtype, name='w')
    x = tvm.placeholder((
        x_b,
        k_i,
    ), dtype=pack_dtype, name='x')
    k = tvm.reduce_axis((0, k_i), name='k')
    bw = tvm.reduce_axis((0, w_b), name='bw')
    bx = tvm.reduce_axis((0, x_b), name='bx')
    if unipolar:
        dtype = 'int16'
        z = tvm.compute(
            (m, ),
            lambda i: tvm.sum((tvm.popcount(w[bw, i, k].astype(dtype) & x[
                bx, k].astype(dtype)) - tvm.popcount(~w[bw, i, k].astype(
                    dtype) & x[bx, k].astype(dtype))) <<
                              (bw + bx).astype(dtype),
                              axis=[bw, bx, k]),
            name='z')
    else:
        dtype = 'uint16'
        z = tvm.compute((m, ),
                        lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype(
                            dtype) & x[bx, k].astype(dtype)) <<
                                          (bw + bx).astype(dtype),
                                          axis=[bw, bx, k]),
                        name='z')
    Wb = tvm.decl_buffer(w.shape,
                         w.dtype,
                         name="W",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'),
                                  tvm.var('ldw'), 1])  # stride can be inferred
    Xb = tvm.decl_buffer(x.shape,
                         x.dtype,
                         name="X",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'), 1])
    Zb = tvm.decl_buffer(z.shape,
                         z.dtype,
                         name="Z",
                         offset_factor=1,
                         strides=[1])

    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]

        args_1 = tvm.const(1, 'uint32')
        args_2 = tvm.const(2, 'uint32')

        if unipolar:
            vpadd = "llvm.arm.neon.vpadd.v8i8"
            vpadalu = "llvm.arm.neon.vpadals.v16i8.v8i16"
            full_dtype = 'int8x16'
            half_dtype = 'int8x8'
            return_dtype = 'int16x8'
        else:
            vpadd = "llvm.arm.neon.vpadd.v8u8"
            vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
            full_dtype = 'uint8x16'
            half_dtype = 'uint8x8'
            return_dtype = 'uint16x8'

        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:  # reduce reset
                irb.emit(zz.vstore(0, tvm.const(0, return_dtype)))
                return irb.get()
            # body and reduce update
            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            w_ = ww.vload([bw, i, 0],
                                          'uint8x16').astype(full_dtype)
                            x_ = xx.vload([bx, 0],
                                          'uint8x16').astype(full_dtype)
                            if unipolar:
                                cnts = tvm.popcount(w_
                                                    & x_) - tvm.popcount(~w_
                                                                         & x_)
                            else:
                                cnts = tvm.popcount(w_ & x_)
                            upper_half = tvm.call_pure_intrin(
                                half_dtype, 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin(
                                half_dtype, 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                half_dtype, vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                half_dtype, vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin(full_dtype,
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, pack_dtype)
                        out = tvm.call_llvm_intrin(return_dtype, vpadalu,
                                                   args_2,
                                                   zz.vload(0, return_dtype),
                                                   shifted_cnts)
                    else:  # k_i == 8
                        for i in range(m):
                            w_ = ww.vload([bw, i, 0],
                                          'uint8x8').astype(half_dtype)
                            x_ = xx.vload([bx, 0],
                                          'uint8x8').astype(half_dtype)
                            if unipolar:
                                cnts8[i] = tvm.popcount(
                                    w_ & x_) - tvm.popcount(~w_ & x_)
                            else:
                                cnts8[i] = tvm.popcount(w_ & x_)
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                half_dtype, vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                half_dtype, vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin(full_dtype,
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, pack_dtype)
                        out = tvm.call_llvm_intrin(return_dtype, vpadalu,
                                                   args_2,
                                                   zz.vload(0, return_dtype),
                                                   shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(z.op,
                                      _intrin_func,
                                      binds={
                                          w: Wb,
                                          x: Xb,
                                          z: Zb
                                      })
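A hedged usage sketch (arguments and axis names are illustrative, not from the source):

    pc = _intrin_popcount(m=8, k_i=8, w_b=1, x_b=1, unipolar=True)
    # tensorize the popcount reduction of a bit-serial conv schedule:
    # s[z].tensorize(inner_axis, pc)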
Example #45
def test_gemm():
    # graph
    nn = 2048
    n = tvm.var('n')
    n = tvm.convert(nn)
    m, l = n, n
    A = tvm.placeholder((l, n), name='A')
    B = tvm.placeholder((l, m), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((m, n),
                    lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k),
                    name='C')

    # schedule
    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_thread)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    ko, ki = s[CC].split(k, factor=8)
    kt, ki = s[CC].split(ki, factor=1)
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    s[AA].vectorize(xi)
    # Schedule for B's shared memory load
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s[BB].vectorize(xi)

    # correctness
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        for i in range(2):
            f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(),
                                   np.dot(b_np.T, a_np),
                                   rtol=1e-5)

    with tvm.build_config(auto_unroll_max_step=32,
                          auto_unroll_min_depth=0,
                          unroll_explicit=False):
        check_device("cuda")
Example #46
def test_depthwise_conv2d_nchw():
    """You may test different settings."""
    batch = 1
    in_channel = 256
    in_height = 96
    in_width = 96

    filter_channel = in_channel
    channel_multiplier = 1
    filter_height = 3
    filter_width = 3

    stride_h = 1
    stride_w = 1

    padding = 'SAME'  # or 'VALID'

    # Placeholder
    Input = tvm.placeholder((batch, in_channel, in_height, in_width),
                            name='Input')
    Filter = tvm.placeholder(
        (filter_channel, channel_multiplier, filter_height, filter_width),
        name='Filter')
    Stride = [stride_h, stride_w]
    Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale')
    Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift')
    # Declare
    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, Stride,
                                                    padding)
    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
    Relu = topi.nn.relu(ScaleShift)
    # Schedule
    s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
    s2 = schedule_depthwise_conv2d_nchw(ScaleShift)
    s3 = schedule_depthwise_conv2d_nchw(Relu)
    input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype(
        Input.dtype)
    filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype(
        Filter.dtype)
    scale_np = np.random.uniform(size=(in_channel *
                                       channel_multiplier)).astype(Scale.dtype)
    shift_np = np.random.uniform(size=(in_channel *
                                       channel_multiplier)).astype(Shift.dtype)

    def check_device(device):
        if not tvm.runtime.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        # Build the kernel
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
        # Prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)

        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(ScaleShift.shape),
                     dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # Measure time cost of kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          scale_shift_tvm).mean
        # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          relu_tvm).mean
        print("Input shape = " + str(get_const_tuple(Input.shape)))
        print("Filter shape = " + str(get_const_tuple(Filter.shape)))
        print("Stride = (%d, %d)" % (stride_h, stride_w))
        print("padding = %s\n" % padding)
        print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
        print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
              (tcost_1 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us"
            % (tcost_2 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us"
            % (tcost_3 * 1e6))
        # correctness
        depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
            input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
        for c in range(in_channel * channel_multiplier):
            scale_shift_scipy[:,
                              c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[
                                  c] + shift_np[c]
        relu_scipy = np.maximum(scale_shift_scipy, 0)
        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                    depthwise_conv2d_scipy,
                                    rtol=1e-5)
        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                                    scale_shift_scipy,
                                    rtol=1e-5)
        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print("success")

    for device in ['cuda', 'opencl', 'rocm']:
        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=(device == 'rocm'),
                              detect_global_barrier=False,
                              restricted_func=True):
            check_device(device)
Example #47
    print(stmt)
    # build and invoke the kernel.
    f = tvm.build(s, [A, scale, D], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.array(np.random.randint(0, 2, size=(n, )).astype(A.dtype), ctx)
    sc = tvm.nd.array(
        np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
    d = tvm.nd.empty((), D.dtype, ctx)
    f(a, sc, d)
    d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
    tvm.testing.assert_allclose(d.asnumpy(), d_np)


if __name__ == "__main__":
    with tvm.build_config(instrument_bound_checkers=True):
        # zero scale
        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
        # in bound
        test_in_bounds_llvm()
        # upper bound
        test_out_of_bounds_llvm(1, 0)
        test_out_of_bounds_llvm(0, 1)
        test_out_of_bounds_llvm(1, 1)
        test_out_of_bounds_llvm(10000, 0)
        test_out_of_bounds_llvm(0, 10000)
        test_out_of_bounds_llvm(10000, 10000)
        # lower bound
        test_out_of_bounds_llvm(-1, 0)
        test_out_of_bounds_llvm(0, -1)
Example #48
    sch[conv].unroll(rwi)

    in_cache = sch.cache_read(image, 'global', [conv])
    sch[in_cache].compute_at(sch[conv], w)
    axis = sch[in_cache].fuse(in_cache.op.axis[3], in_cache.op.axis[4])
    sch[in_cache].vectorize(axis)

    # sch[conv].parallel(h)
    sch[conv].reorder(n, c0, h, rh, c1o, rco, rwo, w, rwi, c1i, rci)
    sch[conv].pragma(c1i, 'vnni')

    print(tvm.lower(sch, [image, kernel, conv], simple_mode=True))
    answer_ref = tvm.build(sch, [image, kernel, conv])

    import vnni
    with tvm.build_config(add_lower_pass=[(1, vnni.vnni_transformation)]):
        print(tvm.lower(sch, [image, kernel, conv], simple_mode=True))
        module = tvm.build(sch, [image, kernel, conv],
                           target='llvm -mcpu=cascadelake')

        shapes = [i.shape for i in [image, kernel]]
        shapes = [list(map(lambda x: x.value, i)) for i in shapes]
        out_shape = list(map(lambda x: x.value, conv.shape))
        types = ['int8', 'int8', 'int32']
        args = [
            tvm.ndarray.array(np.random.randint(0, 127, size=i, dtype=j))
            for i, j in zip(shapes, types)
        ]
        out = tvm.ndarray.array(np.zeros(out_shape).astype('int32'))
        ans = tvm.ndarray.array(np.zeros(out_shape).astype('int32'))
Example #49
def dp4a(x_scope='local', y_scope='local', z_scope='local'):
    """
    Int8 dot product reduced by every 4 elements using __dp4a

    Parameters
    ----------
    x_scope : str, optional
        The storage scope of buffer for lhs
    y_scope : str, optional
        The storage scope of buffer for rhs
    z_scope : str, optional
        The storage scope of buffer for result

    Returns
    -------
    intrin : TensorIntrin
        The dp4a TensorIntrin that can be used in tensorizing schedule.
    """

    n = 4  # dp4a requires operands packed by 4
    x = tvm.placeholder((n, ), name='x', dtype='int8')
    y = tvm.placeholder((n, ), name='y', dtype='int8')

    k = tvm.reduce_axis((0, n), name='rc')

    z = tvm.compute(
        (1, ), lambda i: tvm.sum(x[k].astype('int32') * y[k].astype('int32'),
                                 axis=[k]))

    def _intrin_func(ins, outs):
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y,
                                         prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()

        return _instr(0), _instr(1), _instr(2)  # body, reset, update

    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
        scopes = {x: x_scope, y: y_scope, z: z_scope}
        binds = {
            t: tvm.decl_buffer(t.shape,
                               t.dtype,
                               t.op.name,
                               data_alignment=cfg.data_alignment,
                               offset_factor=cfg.offset_factor,
                               scope=scopes[t])
            for t in [x, y, z]
        }

        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
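A hedged usage sketch (schedule and axis names are illustrative, not from the source):

    dot = dp4a('shared', 'shared', 'local')
    # after splitting an int8 reduce axis by a factor of 4 in a CUDA schedule:
    # s[CC].tensorize(ki, dot)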
Example #50
        tvm.make.For(j, 0, 8, 3, 0,
                     tvm.make.Store(Ab.data,
                                    tvm.make.Load(dtype, Ab.data, i) + 1,
                                    j + 1)))
    assert isinstance(stmt, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True)
    assert not isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False)
    assert isinstance(ret, tvm.stmt.For)
    assert ret.for_type == tvm.stmt.For.Unrolled


if __name__ == "__main__":
    with tvm.build_config(dump_pass_ir=True):
        test_unroll_loop()

    def end_with(*suffix):
        ends = suffix

        def run(s):
            # keep the name if it ends with any of the given suffixes
            if any(map(s.endswith, ends)):
                return s
        return run

    file_list = os.listdir('./')
    cc_file = end_with('.cc')
    # filter() returns an iterator in Python 3, so materialize it first
    cc_file = list(filter(cc_file, file_list))
    assert len(cc_file) == 3
    for i in cc_file:
        os.remove(i)
Example #51
from __future__ import absolute_import, print_function
import tvm
import numpy as np

tgt_host = "llvm"
# tgt="llvm"
tgt = "c"

n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.placeholder((n, ), name='B')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
print(type(C))
s = tvm.create_schedule(C.op)
bx, tx = s[C].split(C.op.axis[0], factor=64)

with tvm.build_config(dump_pass_ir=True):
    fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

print(fadd.get_source())

print("finished.")
Example #52
def intrinsic_gemm(i, j, k, il, jl, kl, ic, jc, kc):
    """
    (i, k) * (k, j)
    i, j, k: normal iteration size
    il, jl, kl: last iteration size
    ic, jc, kc: last iteration condition
    """
    assert i * k + k * j <= 256 * 1024, 'input too large for scratchpad'
    assert 4 * (i * j) <= 64 * 1024, 'input too large for accumulator'

    a = tvm.placeholder((i, k), name='a', dtype=dtype)
    b = tvm.placeholder((k, j), name='b', dtype=dtype)
    kk = tvm.reduce_axis((0, k), name='k')
    c = tvm.compute((i, j),
                    lambda ii, jj: tvm.sum(a[ii, kk] * b[kk, jj], axis=kk),
                    name='c')

    strideA = tvm.var("sA")
    Ab = tvm.decl_buffer(a.shape,
                         a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[strideA, 1])
    strideB = tvm.var("sB")
    Bb = tvm.decl_buffer(b.shape,
                         b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[strideB, 1])
    strideC = tvm.var("sC")
    Cb = tvm.decl_buffer(c.shape,
                         c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[strideC, 1])

    II = i // DIM + (0 if i % DIM == 0 else 1)
    JJ = j // DIM + (0 if j % DIM == 0 else 1)
    KK = k // DIM + (0 if k % DIM == 0 else 1)
    pad_I = 0 if i % DIM == 0 else (DIM - i % DIM)
    pad_J = 0 if j % DIM == 0 else (DIM - j % DIM)
    pad_K = 0 if k % DIM == 0 else (DIM - k % DIM)

    IIl = il // DIM + (0 if il % DIM == 0 else 1)
    JJl = jl // DIM + (0 if jl % DIM == 0 else 1)
    KKl = kl // DIM + (0 if kl % DIM == 0 else 1)
    pad_Il = 0 if il % DIM == 0 else (DIM - il % DIM)
    pad_Jl = 0 if jl % DIM == 0 else (DIM - jl % DIM)
    pad_Kl = 0 if kl % DIM == 0 else (DIM - kl % DIM)

    II = tvm.if_then_else(ic, IIl, II)
    JJ = tvm.if_then_else(jc, JJl, JJ)
    KK = tvm.if_then_else(kc, KKl, KK)
    pad_I = tvm.if_then_else(ic, pad_Il, pad_I)
    pad_J = tvm.if_then_else(jc, pad_Jl, pad_J)
    pad_K = tvm.if_then_else(kc, pad_Kl, pad_K)

    # reset-update-finalize
    def intrin_func(ins, outs):
        aa, bb = ins
        cc, = outs

        def _body():
            ib = tvm.ir_builder.create()
            # int32_t matmul_kernel(const elem_t *A, const elem_t *B, const acc_t *D,
            #          elem_t *C, int32_t I, int32_t J, int32_t K, int32_t pad_I,
            #          int32_t pad_J, int32_t pad_K, int32_t A_row_len,
            #          int32_t B_row_len, int32_t D_row_len, int32_t C_row_len,
            #          bool no_bias, bool repeating_bias);
            # D is set to a dummy address 1 to determine whether to overwrite
            # accumulator contents: on the first run, 1 will be retained and
            # overwrite the value in the accumulator; on subsequent runs D will be
            # replaced by NULL and C will accumulate on top of the accumulator's contents
            # This is controlled via bit 1 << (ADDR_LEN - 2) - see kernel source
            ib.emit(
                tvm.call_extern("int32", "matmul_kernel", aa.access_ptr("r"),
                                bb.access_ptr("r"), 1, cc.access_ptr("rw"), II,
                                JJ, KK, pad_I, pad_J, pad_K, strideA, strideB,
                                0, strideC, True, False))
            return ib.get()

        def _reset():
            ib = tvm.ir_builder.create()
            # int32_t matmul_reset(elem_t *C, int32_t I, int32_t J, int32_t pad_I,
            #         int32_t pad_J, int32_t C_row_len);
            ib.emit(
                tvm.call_extern("int32", "matmul_reset", cc.access_ptr("w"),
                                II, JJ, pad_I, pad_J, strideC))
            return ib.get()

        def _finalize():
            ib = tvm.ir_builder.create()
            # Move out C from accumulator
            # int32_t matmul_finalize(elem_t *C, int32_t I, int32_t J, int32_t pad_I,
            #         int32_t pad_J, int32_t C_row_len);
            ib.emit(
                tvm.call_extern("int32", "matmul_finalize",
                                cc.access_ptr("rw"), II, JJ, pad_I, pad_J,
                                strideC))
            return ib.get()

        # standalone body (None here), reset, update, finalize
        return None, _reset(), _body(), _finalize()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op,
                                      intrin_func,
                                      binds={
                                          a: Ab,
                                          b: Bb,
                                          c: Cb
                                      },
                                      name="sp_gemm")
Example #53
    po, co, pi, ci = s[tiled_buf].op.axis
    if fc >= 8:
        outer, co = s[tiled_buf].split(co, fc // 8)
    else:
        outer, co = s[tiled_buf].split(co, 1)
    c1, c2 = s[tiled_buf].split(outer, kw)
    ph, pwo = s[tiled_buf].split(po, pw // 8)
    s[tiled_buf].reorder(co, pwo, ph, pi, c1, c2, ci)
    s[tiled_buf].pragma(pwo, 'nnpu.im2col')
    # s[feature_buf].pragma(s[feature_buf].leaf_iter_vars[0], env.dma_copy_to_buf)
    # s[tiled].pragma(s[tiled].leaf_iter_vars[0], env.dma_copy_from_buf)
    # pw = s[tiled].fuse(pwo, pi)
    # s[tiled].reorder(co, ph, pw, c1, c2, ci)
    from nnpu import ir_pass
    pass_list = [(2, ir_pass.im2col_transform)]
    with tvm.build_config(add_lower_pass=pass_list):
        print(tvm.lower(s, [feature, tiled], simple_mode=True))
        func = tvm.build(s, [feature, tiled], 'llvm', 'llvm', 'im2col_func')
    a_np = np.random.randint(size=(fh, fw, fc),
                             dtype='int8',
                             low=-128,
                             high=127)
    a_nd = tvm.nd.array(a_np)
    gt_nd = tvm.nd.array(
        np.zeros((packed_shape[0] // 8, packed_shape[1] // 8, 8, 8),
                 dtype='int8'))
    gt_func(a_nd, gt_nd)
    real_nd = tvm.nd.array(
        np.zeros((packed_shape[0] // 8, packed_shape[1] // 8, 8, 8),
                 dtype='int8'))
    func(a_nd, real_nd)