def test_bound_tensor_compute_op():
    def intrin_test():
      m1 = tvm.var("m1")
      n1 = tvm.var("n1")
      a = tvm.placeholder((m1, n1), name='a')
      c = tvm.compute((1, n1), lambda i, j : a[0, j] + a[1, j] + a[2, j], name='c')

      Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
      Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

      def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]
        def _body():
          ib = tvm.ir_builder.create()
          ib.emit(tvm.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r")))
          return ib.get()
        return _body()
      with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb})

    test_func = intrin_test()
    A = tvm.placeholder((20,20), name='A')
    B = tvm.compute(A.shape, lambda i,j : A[i,j], name='B')
    C = tvm.compute((10, 20), lambda i : test_func(B[i:10, 0:20]), name='C')
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    assert(bounds[B.op.axis[0]].extent.value == 10)
Example #2
def test_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init")
    x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
    s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
    s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)

    def test_getbody():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])

    def test_attach_path():
        s = tvm.create_schedule(s_scan.op)
        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
        apath = tvm.schedule.CreateAttachPath(s)
        assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]))
        assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]))

    def test_fix_pt():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
        assert(fxpt[s_scan.op.spatial_axis_[0]].value != 0)
Example #3
def test_storage_sync():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')

    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True)
    flist = tvm.ir_pass.SplitHostDevice(f)
    f = flist[1]
    f = tvm.ir_pass.ThreadSync(f, "shared")
    body_list = tvm.make.stmt_list(f.body.body.body.body)
    assert(body_list[1].value.name == "tvm_storage_sync")
Example #4
def test_storage_share_gpu():
    m = tvm.var('m')
    A = [tvm.placeholder((m,), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(tvm.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t))
        A.append(tvm.compute((m,), lambda i: A[-1][i], name='A%d' % t))
    s = tvm.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2*t+2].op.axis[0]
        bx, tx = s[A[2*t+2]].split(x, factor=32)
        s[A[2*t+2]].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[A[2*t+2]].bind(tx, tvm.thread_axis("threadIdx.x"))
        s[A[2*t+1]].compute_at(s[A[2*t+2]], tx)
        s[A[2*t+1]].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.stmt.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
Example #5
def test_inplace_rule():
    m = 10
    A = tvm.placeholder((m,), name='A')
    A0 = tvm.compute((m,), lambda i: A[i], name='A0')
    A1 = tvm.compute((m,), lambda i: A[i] + 1, name='A1')
    AA = tvm.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA')
    B = tvm.compute((m,), lambda i: AA[i] + 1, name='B')
    s = tvm.create_schedule(B.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works: only two allocations remain
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
Example #6
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages."""
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
    x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
    return x
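A short usage sketch (not part of the original snippet; the tensor name X and its shape are illustrative). Because my_clip builds two chained stages, each stage can be scheduled or inlined independently:

# Hypothetical usage of my_clip; X is illustrative only.
X = tvm.placeholder((16,), name="X")
Y = my_clip(X, a_min=0.0, a_max=6.0)           # Y is the "clipB" stage
s = tvm.create_schedule(Y.op)
s[Y.op.input_tensors[0]].compute_inline()      # e.g. fuse the "clipA" stage into "clipB"
print(tvm.lower(s, [X, Y], simple_mode=True))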
Example #7
def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
    # Test buffer
    register_mem(scope_tb, max_bits)
    m = 10
    A = tvm.placeholder((m,), name='A')
    C = tvm.placeholder((m,), name='C')
    D = tvm.placeholder((m,), name='D')
    A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0')
    A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2')
    B = tvm.compute((m,), lambda i: A2[i], name='B')
    s = tvm.create_schedule(B.op)
    A0L = s.cache_read(A0, scope_tb, [A2])
    A1L = s.cache_read(A1, scope_tb, [A2])
    A2L = s.cache_read(A2, scope_tb, [B])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cc = tvm.decl_buffer(C.shape, C.dtype, name='C')
    Dd = tvm.decl_buffer(D.shape, D.dtype, name='D')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D:Dd}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works: only two allocations remain
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
Example #8
File: bnn.py Project: bddppq/tvm
def binary_dense(data, weight):
    """Binary matrix multiplication using xor and bit-count.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim], dtype is uint32.

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim], dtype is uint32.

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim], dtype is float32.
    """
    assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
        "dtype of data and weight should be uint32"
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim binary dense"
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), lambda i, j: \
                          tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
                          tag='binary_dense')

    return tvm.compute((batch, out_dim), lambda i, j: \
                        32 * in_dim - 2. * matmul(i, j), \
                        tag=tag.ELEMWISE)
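A hedged usage sketch (shapes are assumptions, and the topi imports of bnn.py are assumed to be in scope). Both operands must already be bit-packed along in_dim, so each uint32 word holds 32 binary features and the output equals 32*in_dim - 2*popcount(xor):

# Illustrative only: inputs are assumed to be pre-packed into uint32 words.
data = tvm.placeholder((8, 4), name='data', dtype='uint32')       # [batch, in_dim]
weight = tvm.placeholder((16, 4), name='weight', dtype='uint32')  # [out_dim, in_dim]
out = binary_dense(data, weight)                                  # float32, shape (8, 16)
s = tvm.create_schedule(out.op)
print(tvm.lower(s, [data, weight, out], simple_mode=True))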
Example #9
def test_bound_nest_thread():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    A1 = tvm.compute((m,), lambda i: A[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2')
    A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3')

    s = tvm.create_schedule(A3.op)
    s[A2].set_scope("shared")
    s[A1].set_scope("local")

    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
    s[A3].bind(bx, block_x)
    s[A3].bind(tx, thread_x)
    s[A2].compute_at(s[A3], tx)
    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
    s[A2].bind(xi, thread_x)
    s[A1].compute_at(s[A3], tx)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A1.op.axis[0]].extent.value==1)
    assert(bounds[A2.op.axis[0]].extent.value==32)
    assert(bounds[A3.op.axis[0]].extent == m)
Example #10
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype="float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure no if statements are left in it
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
Example #11
def test_llvm_persist_parallel():
    n = 128
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=8)
    xo1, xo2 = s[C].split(xo, nparts=1)
    s[B].compute_at(s[C], xo1)
    s[B].parallel(s[B].op.axis[0])
    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
    s[C].parallel(xi)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xi, "parallel_stride_pattern")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                   np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                   rtol=1e-5)

    check_llvm()
Example #12
File: dense.py Project: bddppq/tvm
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_x", out_dim, num_outputs=2)
    cfg.define_split("tile_y", batch, num_outputs=2)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

    vec = cfg["tile_k"].size[-1]
    k = tvm.reduce_axis((0, in_dim // vec), "k")
    CC = tvm.compute((batch, out_dim, vec),
                     lambda z, y, x: tvm.sum(
                         data[z, k * vec + x].astype(out_dtype) *
                         weight[y, k * vec + x].astype(out_dtype), axis=k))

    kk = tvm.reduce_axis((0, vec), "kk")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                    tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)

    return C
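The reduction is deliberately split in two: CC reduces over blocks of `vec` elements while keeping the `vec` lanes as a spatial axis the schedule can vectorize, and C then folds the lanes with the short kk reduction. A numpy sketch of the equivalence (illustrative shapes, assuming vec divides in_dim):

import numpy as np
batch, out_dim, in_dim, vec = 2, 3, 8, 4
data = np.random.rand(batch, in_dim)
weight = np.random.rand(out_dim, in_dim)
# stage 1: reduce over k blocks, keeping the vec lanes separate
CC = np.einsum('zkv,ykv->zyv',
               data.reshape(batch, in_dim // vec, vec),
               weight.reshape(out_dim, in_dim // vec, vec))
# stage 2: fold the vec lanes
C = CC.sum(axis=2)
assert np.allclose(C, data @ weight.T)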
Example #13
File: dense.py Project: bddppq/tvm
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_y", batch, num_outputs=3)
    cfg.define_split("tile_x", out_dim, num_outputs=3)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, batch, out_dim, in_dim)

    packw_bn = cfg["tile_x"].size[-1]
    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
    packw = tvm.compute(packw_shape,
                        lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight")

    k = tvm.reduce_axis((0, in_dim), name="k")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(
                        data[y, k].astype(out_dtype) *
                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                        axis=k),
                    tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
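The packed_weight layout makes the innermost pack index contiguous, so the gemm's inner loop reads weight with stride 1. A numpy sketch of the same relayout (illustrative, assuming packw_bn divides out_dim):

import numpy as np
out_dim, in_dim, packw_bn = 8, 5, 4
w = np.random.rand(out_dim, in_dim)
packw = w.reshape(out_dim // packw_bn, packw_bn, in_dim).transpose(0, 2, 1)
# packw[z, y, x] == w[z * packw_bn + x, y], matching the compute above
assert np.allclose(packw[1, 2, 3], w[1 * packw_bn + 3, 2])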
Example #14
def dense_default(data, weight, bias=None):
    """The default implementation of dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim]

    bias : tvm.Tensor, optional
        1-D with shape [out_dim]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim dense"
    if bias is not None:
        assert len(bias.shape) == 1
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), \
                         lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \
                         tag='dense')
    if bias is not None:
        matmul = tvm.compute((batch, out_dim), \
                             lambda i, j: matmul[i, j] + bias[j], \
                             tag=tag.BROADCAST)
    return matmul
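A minimal usage sketch (shapes are illustrative, not from the original):

data = tvm.placeholder((4, 64), name='data')
weight = tvm.placeholder((32, 64), name='weight')
bias = tvm.placeholder((32,), name='bias')
out = dense_default(data, weight, bias)
s = tvm.create_schedule(out.op)
print(tvm.lower(s, [data, weight, bias, out], simple_mode=True))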
Example #15
def test_multiple_kernels():
    N = 1024

    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])

    s = tvm.create_schedule([C.op])

    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
Example #16
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #17
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute((m + 2,), lambda i:
                       tvm.select(tvm.all(i >= 1, i <= m),
                                  A[i - 1], 0.0), "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)

        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)
    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #18
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
Example #19
File: pooling.py Project: gwli/tvm
def global_pool(data, pool_type):
    """Perform global pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, 1, 1]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    batch, channel, height, width = data.shape

    dheight = tvm.reduce_axis((0, height))
    dwidth = tvm.reduce_axis((0, width))

    if pool_type == 'max':
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_max")
    elif pool_type == 'avg':
        tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_sum")
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tsum[n, c, h, w] / (height*width).astype(tsum.dtype), \
                            tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
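A short usage sketch (the NCHW shape is an assumption):

data = tvm.placeholder((1, 16, 32, 32), name='data')
pooled = global_pool(data, 'avg')   # shape (1, 16, 1, 1)
s = tvm.create_schedule(pooled.op)
print(tvm.lower(s, [data, pooled], simple_mode=True))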
Example #20
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])

    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)

    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # assigning within the same group is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group containing s_update1 and s_update2
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # compute_at outside the group raises an error.
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
Example #21
File: conv2d.py Project: gwli/tvm
def _spatial_pack(data, kernel, stride, padding, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH-1, KW-1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO // VC, CI, KH, KW, VC)

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output')

    return output
Example #22
def test_tensor_reduce_multi_axis():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, n), "k")
    k2 = tvm.reduce_axis((0, m), "k")
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
Example #23
def test_scan3_not_exact_reach():
    # setup assumed from the enclosing test (tvm's test_scan_fix_point):
    m = tvm.var("m")
    n = tvm.var("n")
    l = tvm.var("l")
    x = tvm.compute((l, m, n), lambda *i: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((l, m, n))
    s_init = tvm.compute((1, m, n), lambda _, i, j: x[0, i, j])
    s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, i, j], name="h1")
    s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, 10] * 2, name="h2")
    s_update = tvm.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    body = tvm.schedule.ScanGetBody(s_scan.op)
    fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op)
    assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1)
    assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
Example #24
def test_scan4_reach_other():
    # setup assumed from the enclosing test (tvm's test_scan_fix_point):
    m = tvm.var("m")
    n = tvm.var("n")
    l = tvm.var("l")
    x = tvm.compute((l, m, n), lambda *i: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((l, m, n))
    s_init = tvm.compute((1, m, n), lambda _, i, j: x[0, i, j])
    s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, j, j], name="h1")
    s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, j] * 2, name="h2")
    s_update = tvm.compute((l, m, n),
                           lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op)
    assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0)
    assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
Example #25
def test_tensor_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.placeholder((m, n))
    s = tvm.placeholder((m, n))
    res = tvm.scan(tvm.compute((1, n), lambda _, i: x[0, i]),
                   tvm.compute((m, n), lambda t, i: s[t-1, i] + x[t, i]),
                   s)
    assert tuple(res.shape) == (m, n)
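The scan above is an inclusive prefix sum along the time axis: res[0, i] = x[0, i] and res[t, i] = res[t-1, i] + x[t, i]. A numpy analogue of what the compiled scan would produce (illustrative):

import numpy as np
x_np = np.random.rand(5, 3).astype('float32')
expected = np.cumsum(x_np, axis=0)   # same recurrence as the scan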
Example #26
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape

    CO = OCO * VC
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    PAD_H = H + 2*HPAD
    PAD_W = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1
    dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 and WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum((tvm.popcount(
            kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') &
            data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16'))
                        << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape, lambda n, h, w, co:
                       conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype),
                       name='output_vec', tag='spatial_bitserial_conv_nhwc')
Example #27
def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,), lambda i: tvm.select(tvm.all(i > 0, i < 65), A[i-1], tvm.const(0.)), name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse)
Example #28
def test_bound_conv1d():
    n = tvm.var('n')
    A = tvm.compute((n+2,), lambda i: 1, name='A')
    def computeB(ii):
        i = ii + 1
        return A[i-1] + A[i] + A[i+1]
    B = tvm.compute((n,), computeB, name='B')
    s = tvm.create_schedule(B.op)
    s[A].compute_at(s[B], B.op.axis[0])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A.op.axis[0]].extent.value == 3)
Example #29
def test_replace_dataflow():
    shape = (255,)
    A = tvm.placeholder(shape, name="A")
    B = tvm.compute(shape, lambda i: A[i] + A[i], name="B")
    C = tvm.compute(shape, lambda i: A[i] + B[i], name="C")
    D = tvm.compute(shape, lambda i: A[i] + C[i], name="D")
    E = tvm.compute(shape, lambda i: A[i] + D[i], name="E")

    s = tvm.create_schedule(E.op)
    s.cache_read(A, "local", [B, C, D, E])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
Example #30
def test_bound1():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule([A2.op])
    xo, xi = s[A2].split(s[A2].op.axis[0], 8)
    s[A1].compute_at(s[A2], xo)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    assert(bounds[A1.op.axis[0]].extent.value == 8)
Example #31
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, num_tile):
    assert layout == "NCHW", "Only support NCHW"
    # create workload according to raw arguments
    out_dtype = out_dtype or data.dtype
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
        CO = CO * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:     # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:   # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0",
                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                       policy='candidate', candidate=[
                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')

    # fallback support
    if cfg.is_fallback:
        if num_tile == 2:     # arm cpu
            ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct')
            cfg.fallback_with_reference_log(ref_log)
        elif num_tile == 3:  # mali gpu
            ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct')
            cfg.fallback_with_reference_log(ref_log)
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw:
                               data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h]
                               [(w*VW+vw)*WSTR+kw*dilation_w],
                               name='data_vec_undilated')
    else:
        dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
                               data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
                               name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
                                 kernel[co*VC+vc][ci][kh][kw],
                                 name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    if dilation_h != 1 or dilation_w != 1:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
            tvm.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                    axis=[ci, kh, kw]), name='conv')
    else:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
            tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                    axis=[ci, kh, kw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv2d_output')
    return output
Example #32
import nnpu
import tvm
import topi
from nnpu.utils import ScheduleProcHelper
import numpy as np

with (ScheduleProcHelper()):
    env = nnpu.get_env()
    nnpu.set_device(env, type='S0')
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    a = tvm.placeholder((2, 4, 16), dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    pad_buf = tvm.compute((2, 6, 16), lambda i, j, k: tvm.expr.Select(
        j >= 2, a_buf[i, j - 2, k], tvm.const(0, dtype_n)), 'pad')
    nnpu.utils.MarkScope(pad_buf)
    nnpu.utils.PragmaCopy(pad_buf)
    tile_host, _ = nnpu.utils.CopyBufToH(pad_buf, 'tile')

    s = nnpu.create_schedule([tile_host.op])

    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 4, 16),
                             dtype=a.dtype,
                             low=-128, high=128)  # high=128 is an assumption; the listing is cut off here in the source
Example #33
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], dtype=np.float32)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1]], np.float32)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, P // VP, alpha, alpha, VP),
                             lambda c, b, eps, nu, bb:
                             data_pad[(b*VP+bb) // (nH*nW)][c][(b*VP+bb) // nW % nH * m + eps]
                             [(b*VP+bb) % nW * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk:
                        tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
                                G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb:
                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
                            B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b:
                    tvm.sum(U[eps][nu][k // VK][c][k % VK] *
                            V[eps][nu][b // VP][c][b % VP], axis=c), name='M')

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]), name='Y')

    # unpack output
    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
                         name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
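For reference (background, not part of the original snippet), the five computes above implement the standard Winograd formula for F(m, r): U is the kernel transform, V the image transform, M the batched gemm contracted over input channels c, and Y the inverse transform:

U = G\,g\,G^\top, \qquad V = B^\top d\,B, \qquad M = \textstyle\sum_{c} U_c \odot V_c, \qquad Y = A^\top M A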
Example #34
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
                   out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    idxd = tvm.indexdiv
    idxm = tvm.indexmod

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    cfg.define_split('tile_p',
                     cfg.axis(P),
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k',
                     cfg.axis(K),
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP),
                             lambda c, b, eps, nu, bb: data_pad[
                                 idxd(b * VP + bb, nH * nW), c,
                                 idxm(idxd(b * VP + bb, nW), nH) * m + eps,
                                 idxm(b * VP + bb, nW) * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, idxd(K, VK), C, VK),
            lambda eps, nu, k, c, kk: tvm.sum(kernel[k * VK + kk][c][r_kh][
                r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw],
                                              axis=[r_kh, r_kw]),
            name='U')

    # transform image
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute(
        (alpha, alpha, idxd(P, VP), C, VP),
        lambda eps, nu, b, c, bb: tvm.sum(input_tile[c][b][r_eps][r_nu][
            bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu],
                                          axis=[r_eps, r_nu]),
        name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P),
                    lambda eps, nu, k, b: tvm.sum(U[eps][nu][idxd(k, VK)][c][
                        idxm(k, VK)] * V[eps][nu][idxd(b, VP)][c][idxm(b, VP)],
                                                  axis=c),
                    name='M')

    # inverse transform
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m),
                    lambda k, b, vh, vw: tvm.sum(M[r_eps][r_nu][k][b] * A[
                        r_eps][vh] * A[r_nu][vw],
                                                 axis=[r_eps, r_nu]),
                    name='Y')

    # unpack output
    output = tvm.compute(
        (N, K, H, W),
        lambda n, k, h, w: Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m),
                                idxm(h, m),
                                idxm(w, m)],
        name='output',
        tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
Example #35
def fused_convs(input_data, filters, resnet_block=False):

	out_dtype = input_data.dtype

	Input = None
	nodes = [input_data]
	params = [input_data]

	for f in filters:
		Input = nodes[-1]
		Filter = f.placeholder
		layout = f.layout
		depthwise = f.depthwise
		kernel = f.kernel
		stride = f.stride
		padding = f.padding
		dilation = f.dilation

		assert not (depthwise and kernel == 1)  # don't consider 1x1 depthwise

		padded_count = 0
		conv_count = 0
		depthwise_count = 0

		if isinstance(stride, int):
			stride_h = stride_w = stride
		else:
			stride_h, stride_w = stride

		if isinstance(dilation, int):
			dilation_h = dilation_w = dilation
		else:
			dilation_h, dilation_w = dilation

		batch, in_height, in_width, in_channel = Input.shape
		if f.NHWC_transpose: # HWOI
			kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
		else: # HWIO
			kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
		if depthwise:
			channel_multiplier = tmp
		else:
			num_filter = tmp

		# compute the output shape
		dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
		dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
		pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
			padding, (dilated_kernel_h, dilated_kernel_w))

		out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
		out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
		out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

		if f.kernel > 1:
			print("Padding is needed!")

			pad_before = [0, pad_top, pad_left, 0]
			pad_after = [0, pad_down, pad_right, 0]

			PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput_{}".format(padded_count))
			padded_count += 1
			nodes.append(PaddedInput)

			# Update Input
			Input = PaddedInput
			batch, in_height, in_width, in_channel = Input.shape

		if not depthwise:
			rc = tvm.reduce_axis((0, in_channel), name='rc')
		if kernel > 1:
			ry = tvm.reduce_axis((0, kernel_h), name='ry')
			rx = tvm.reduce_axis((0, kernel_w), name='rx')

		if not depthwise: # Normal convolution
			if kernel > 1:
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h + ry * dilation_h,
								xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
					(Filter[ry, rx, ff, rc] if f.NHWC_transpose else Filter[ry, rx, rc, ff]).astype(out_dtype), axis=[ry, rx, rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			else: # Only reduce rc axis
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
					(Filter[0, 0, ff, rc] if f.NHWC_transpose else Filter[0, 0, rc, ff]).astype(out_dtype), axis=[rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			conv_count += 1
		else: # Depthwise convolution (kernel > 1)
			Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: tvm.sum(
				(Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w,
							 tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
				(Filter[ry, rx, tvm.indexmod(c, channel_multiplier), tvm.indexdiv(c, channel_multiplier)] if f.NHWC_transpose else Filter[ry, rx, tvm.indexdiv(c, channel_multiplier), tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
				axis=[ry, rx]),
			name='DepthwiseConv2dOutput_{}'.format(depthwise_count), tag="depthwise_nhwc")
			depthwise_count += 1

		nodes.append(Output)
		params.append(Filter)

	if resnet_block:
		First = nodes[0]
		Last = nodes[-1]
		assert First.shape == Last.shape
		Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: (First[b, i, j, c].astype(out_dtype) +
								Last[b, i, j, c].astype(out_dtype)),
			name='ElementwiseAddOutput_{}'.format(depthwise_count), tag="elem_nhwc")
		nodes.append(Output)

	params.append(nodes[-1]) # Final output
	return nodes, params
Example #36
def _spatial_conv_all(wkl, sch, data, kernel, out_dtype):
    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH - 1, KW - 1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2 * HPAD
    TW = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT,
               VW * WSTR + WCAT)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h * VH * HSTR + vh][w * VW * WSTR + vw], name='data_vec')

    kshape = (CO, CI, KH, KW)
    kvshape = (CO // VC, CI, KH, KW, VC)

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co * VC + vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')
    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h // VH][
                             w // VW][h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='spatial_conv_output')

    s = tvm.create_schedule(conv.op)
    traverse(s, conv.op)

    # schedule for data_vec
    A0, A1 = data_pad, data_vec
    if DOPAD:
        s[A0].compute_inline()
    _, h, _, _, _, _ = s[A1].op.axis
    if sch.ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[A1].split(h, sch.ba)
        oaxis = oh
        paxis = ih
    s[A1].parallel(paxis)
    s[A1].pragma(oaxis, "parallel_launch_point")
    s[A1].pragma(paxis, "parallel_stride_pattern")
    s[A1].pragma(oaxis, "parallel_barrier_when_finish")

    # schedule for kernel_vec
    B, B0 = kernel, kernel_vec
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    # schedule for conv & unpack
    C0, C = conv, output

    s = tvm.create_schedule(C.op)
    traverse(s, C.op)

    CC = s.cache_write(C0, "global")
    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)

    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    # if C != C1:
    #     s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return C, s
Example #37
    def _compile_function(dtype: str,
                          device: str,
                          b0: int = 4,
                          b1: int = 4,
                          b2: int = 16):
        '''Compiles a tvm function that computes diagonal_mm
        args:
        dtype: str in ['float64', 'float32', 'float16']
        device: str in ['cpu' or 'cuda']
        b0, b1, b2: size of tensor tiles. Very important for good performance

        '''
        import tvm  # import the full tvm library here for compilation. Don't import at the top of the file in case we don't need to compile
        from tvm.contrib import nvcc

        @tvm.register_func
        def tvm_callback_cuda_compile(code):
            """Use nvcc compiler for better perf."""
            ptx = nvcc.compile_cuda(
                code, target="ptx",
                arch='sm_52')  # use old arch for this to work on old GPUs
            return ptx

        assert dtype in ['float16', 'float32', 'float64']
        assert device in ['cpu', 'cuda']
        device = None if device == 'cpu' else device
        tgt_host = "llvm"

        b = tvm.var('b')  # batch size
        n = tvm.var('n')  # sequence length
        h = tvm.var('h')  # number of heads
        m = tvm.var('m')  # hidden dimension
        w = tvm.var('w')  # window size
        w_upper = tvm.var(
            'w_upper'
        )  # window size to the right of the word. Should be `0` or `w`
        padding = tvm.var('padding')  # padding
        transpose_t1 = tvm.var('transpose_t1')  # t1 should be transposed
        t1d3 = tvm.var('t1d3')  # last dimension of t1
        t3d3 = tvm.var('t3d3')  # last dimension of t3 (the result tensor)
        X = tvm.placeholder((b, n, h, t1d3), name='X',
                            dtype=dtype)  # first tensor
        Y = tvm.placeholder((b, n, h, m), name='Y',
                            dtype=dtype)  # second tensor
        k = tvm.reduce_axis((0, t1d3), name='k')  # dimension to sum over
        D = tvm.placeholder((h), name='D', dtype='int')  # dilation per head
        output_shape = (b, n, h, t3d3)  # shape of the result tensor
        algorithm = lambda l, i, q, j: tvm.sum(
            tvm.if_then_else(
                t3d3 ==
                m,  # if output dimension == m, then t1 is diagonaled (FIXME: This breaks if t3d3 == m == t1d3)
                tvm.if_then_else(
                    transpose_t1 == 0,
                    tvm.if_then_else(
                        tvm.all(
                            i + D[q] * (k - w) >= 0,
                            i + D[q] * (k - w) < n,
                        ),
                        X[l, i, q, k] * Y[l, i + D[q] *
                                          (k - w), q, j],  # t1 is diagonaled
                        padding),
                    tvm.if_then_else(
                        tvm.all(
                            i + D[q] * (k - w_upper) >=
                            0,  # `w_upper` to handle the case `autoregressive=True`
                            i + D[q] * (k - w_upper) < n,
                        ),
                        X[l, i + D[q] * (k - w_upper), q, (w_upper + w) - k] *
                        Y[l, i + D[q] * (k - w_upper), q, j
                          ],  # t1 is diagonaled and should be transposed
                        padding),
                ),
                tvm.if_then_else(
                    tvm.all(
                        i + D[q] * (j - w) >= 0,
                        i + D[q] * (j - w) < n,
                    ),
                    X[l, i, q, k] *
                    Y[l, i + D[q] * (j - w), q, k
                      ],  # t1 is not diagonaled, but the output tensor is going to be
                    padding)),
            axis=k)

        Z = tvm.compute(output_shape, algorithm,
                        name='Z')  # the computation that will be compiled for the GPU
        s = tvm.create_schedule(Z.op)

        print('Lowering: \n ===================== \n{}'.format(
            tvm.lower(s, [X, Y, D], simple_mode=True)))

        # split long axes into smaller chunks and assign each one to a separate GPU thread/block
        ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=b0)
        ZF = s.rfactor(Z, ki)

        j_outer, j_inner = s[Z].split(s[Z].op.axis[-1], factor=b1)
        i_outer, i_inner = s[Z].split(s[Z].op.axis[1], factor=b2)

        s[Z].bind(j_outer, tvm.thread_axis("blockIdx.x"))
        s[Z].bind(j_inner, tvm.thread_axis("threadIdx.y"))

        s[Z].bind(i_outer, tvm.thread_axis("blockIdx.y"))
        s[Z].bind(i_inner, tvm.thread_axis("threadIdx.z"))

        tx = tvm.thread_axis("threadIdx.x")
        s[Z].bind(s[Z].op.reduce_axis[0], tx)
        s[ZF].compute_at(s[Z], s[Z].op.reduce_axis[0])
        s[Z].set_store_predicate(tx.var.equal(0))

        print('Lowering with GPU splits: \n ===================== \n{}'.format(
            tvm.lower(s, [X, Y, D], simple_mode=True)))

        # compiling the automatically generated cuda code
        diagonaled_mm = tvm.build(
            s, [X, Y, Z, D, w, w_upper, padding, transpose_t1, t3d3],
            target=device,
            target_host=tgt_host,
            name='diagonaled_mm')
        return diagonaled_mm
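# A hypothetical usage sketch (not part of the original source): shapes and the
# window size are illustrative, it assumes a CUDA-enabled TVM build and that
# `_compile_function` is accessible (it is a static method in its original
# context). The scalar arguments mirror the list passed to `tvm.build` above.
import tvm
import numpy as np

b, n, h, m, w = 1, 512, 4, 64, 16       # batch, seq len, heads, hidden, window
fn = _compile_function('float32', 'cuda')
ctx = tvm.gpu(0)
X = tvm.nd.array(np.random.randn(b, n, h, m).astype('float32'), ctx)
Y = tvm.nd.array(np.random.randn(b, n, h, m).astype('float32'), ctx)
Z = tvm.nd.empty((b, n, h, 2 * w + 1), 'float32', ctx)   # diagonaled output
D = tvm.nd.array(np.ones(h, dtype='int32'), ctx)         # dilation 1 per head
# (w, w_upper, padding, transpose_t1, t3d3) as scalars, with w_upper == w
fn(X, Y, Z, D, w, w, 0, 0, 2 * w + 1)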
Example #38
0
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert (bn <= N)
    assert (num_thread * num_thread * 16 <= N)
    assert (num_block * num_block * 2 <= N)
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')

    packedB = tvm.compute((N, N // bn, bn),
                          lambda x, y, z: B[x, y * bn + z],
                          name='B')

    C = tvm.compute((N, N),
                    lambda ii, jj: tvm.sum(
                        A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k),
                    name='C')

    s = tvm.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")

    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
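# The function above relies on module-level names (`target`, `proxy_host`,
# `proxy_port`, `key`, `util`, `ndk`, `rpc`, `evaluate`) that are not shown in
# this snippet. A plausible reconstruction of that context, hedged as an
# assumption, looks like this:
import os
import tvm
import numpy as np
from tvm import rpc
from tvm.contrib import util, ndk

# assumed configuration for the Android RPC proxy
proxy_host = os.environ.get("TVM_ANDROID_RPC_PROXY_HOST", "0.0.0.0")
proxy_port = 9090
key = "android"
arch = "arm64"
target = "llvm -target=%s-linux-android" % arch

def evaluate(func, ctx, N, times):
    # time the compiled kernel and report the average cost (a simple sketch)
    a = tvm.nd.array(np.random.uniform(size=(N, N)).astype("float32"), ctx)
    b = tvm.nd.array(np.random.uniform(size=(N, N)).astype("float32"), ctx)
    c = tvm.nd.array(np.zeros((N, N), dtype="float32"), ctx)
    time_f = func.time_evaluator(func.entry_name, ctx, number=times)
    print("%g secs/op" % time_f(a, b, c).mean)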
Example #39
0
def test_rpc_remote_module():
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port)
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote(remote):
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue
        in the TVM runtime when the server launches after the
        OpenCL runtime initializes. We leave it as an example
        of how to do RPC when we want to do linking on the remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(client)
    check_remote(rpc.LocalSession())
Example #40
0
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits,
                       weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[
        0].value == 1, "spatial pack convolution only supports batch size 1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data,
                     activation_bits,
                     pack_axis=3,
                     bit_axis=3,
                     pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape

    CO = OCO * VC
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH - 1, KW - 1

    PAD_H = H + 2 * HPAD
    PAD_W = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1
    dvshape = (N, PAD_H // (VH * HSTR), PAD_W // (VW * WSTR), VH * HSTR + HCAT,
               VW * WSTR + WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 or WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum(
            (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16')
                          & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw,
                                     ib, ci].astype('uint16')) <<
             (kb + ib).astype('uint16')),
            axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape,
                       lambda n, h, w, co: conv[n][h // VH][w // VW][co // VC][
                           h % VH][w % VW][co % VC].astype(out_dtype),
                       name='output_vec',
                       tag='spatial_bitserial_conv_nhwc')
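# The `_conv` reduction above ANDs activation and weight bit planes, popcounts
# the result, and shifts by the combined bit position. A minimal numpy sketch
# of the underlying identity, assuming single-bit activations and weights:
import numpy as np

a_bits = np.random.randint(0, 2, 64).astype(np.uint8)   # activation bit plane
w_bits = np.random.randint(0, 2, 64).astype(np.uint8)   # weight bit plane
dot = int(np.sum(a_bits & w_bits))                       # popcount(a AND w)
assert dot == int(np.dot(a_bits, w_bits))                # binary dot product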
Example #41
0
def test_tensorize_tensor_compute_op():
    # an intrinsic called "multivadd" whose definition (pattern)
    # is a loop of another intrinsic called "vadd"
    def intrin_multivadd(n):
        n_a = tvm.var("n_a")
        Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a])

        n_b = tvm.var("n_b")
        Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b])

        n_c = tvm.var("n_c")
        Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c])

        z = tvm.compute((n, ), lambda i: tvm.call_extern(
            "float32", 'vadd', Ab.access_ptr("w", offset=n_a * i),
            Bb.access_ptr("r", offset=n_b * i),
            Cb.access_ptr("r", offset=n_c * i)))

        # replace the pattern with the multivadd call. I need to figure out
        # how to pass it the right parameters.
        def intrin_func(ins, outs):
            return tvm.call_packed("multivadd")

        with tvm.build_config():
            return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")

    def intrin_vadd(n):
        dtype = 'float32'
        x = tvm.placeholder((n, ), dtype=dtype, name='vx')
        y = tvm.placeholder((n, ), dtype=dtype, name='vy')
        z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
        s = tvm.create_schedule(z.op)

        def create_buffer(t):
            return tvm.decl_buffer(t.shape,
                                   t.dtype,
                                   name='W' + t.name,
                                   offset_factor=16)

        def intrin_func(ins, outs):
            ib = tvm.ir_builder.create()
            ib.emit(
                tvm.call_extern("float32", 'vadd', ins[0].access_ptr("r"),
                                ins[1].access_ptr('r'),
                                outs[0].access_ptr('wr')))
            return ib.get()

        with tvm.build_config(offset_factor=16):
            return tvm.decl_tensor_intrin(z.op,
                                          intrin_func,
                                          binds={
                                              x: create_buffer(x),
                                              y: create_buffer(y),
                                              z: create_buffer(z)
                                          })

    # cache_read, cache_write
    M = 1024
    factor = 16
    dtype = 'float32'

    A = tvm.placeholder((M // factor, factor), name="A", dtype=dtype)
    B = tvm.placeholder((M // factor, factor), name="B", dtype=dtype)

    vadd = intrin_vadd(factor)
    C = tvm.compute((M // factor, factor),
                    lambda i: vadd(A[i, 0:factor], B[i, 0:factor]),
                    name='C')

    s = tvm.create_schedule(C.op)
    multivadd = intrin_multivadd(64)
    s[C].tensorize(C.op.axis[0], multivadd)
    s = s.normalize()
    dom_map = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, dom_map)
    # The loop that we tried to tensorize still exists in the code
    # That means tensorize didn't work as expected
    assert isinstance(stmt.body.body.body, tvm.stmt.For)
    assert stmt.body.body.body.loop_var.name == C.op.axis[0].var.name
Example #42
0
def test_add_pipeline():
    n = tvm.var('n')
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D')
    s = tvm.create_schedule(D.op)

    # a GPU schedule has to split each axis and bind it to blockIdx/threadIdx
    num_thread = 256
    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
    s[C].bind(xo, tvm.thread_axis("threadIdx.x"))
    s[C].bind(xi, tvm.thread_axis("blockIdx.x"))

    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("threadIdx.x"))
    s[D].bind(xi, tvm.thread_axis("blockIdx.x"))

    # compile to IR
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cb = tvm.decl_buffer(C.shape, C.dtype, name='C')
    stmt = tvm.ir_pass.LoopPartition(stmt)
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0, True)
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])

    def check_target(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        mhost.import_module(mdev)
        code = mdev.get_source()
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_module_save(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        fmt = "ptx" if device == "cuda" else "cl"
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        temp = util.tempdir()
        mpath = temp.relpath("test.%s" % fmt)
        mdev.save(mpath)
        mdev2 = tvm.module.load(mpath)
        mhost.import_module(mdev2)
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("cuda", host="stackvm")
    check_target("cuda", host="llvm")
    check_module_save("cuda", host="stackvm")
    check_target("nvptx", host="llvm")
Example #43
0
def conv1d_ncw(data,
               kernel,
               strides=1,
               padding='VALID',
               dilation=1,
               out_dtype=None):
    """ 1D convolution forward operator for NCW layout.

    Parameters
    ----------
    data : tvm.Tensor
        3-D with shape [batch, in_channel, in_width]

    kernel : tvm.Tensor
        3-D with shape [num_filter, in_channel, filter_size]

    strides : int or tuple
        The spatial stride along width

    padding : int, tuple, or str
        Padding size can be an integer for equal padding,
        a tuple of (left, right) or a string in ['VALID', 'SAME'].

    dilation : int or tuple
        Dilation rate if convolution should be dilated.

    out_dtype : str
        The output data type. If None then output is same type as input.
    """
    if out_dtype is None:
        out_dtype = data.dtype
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch, in_channels, data_width = data.shape
    out_channels, _, kernel_size = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size, ))
    out_channels = simplify(out_channels)
    out_width = simplify(
        (data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)

    # Apply padding
    pad_before = [0, 0, pad_left]
    pad_after = [0, 0, pad_right]
    temp = pad(data, pad_before, pad_after, name='pad_temp')

    # Compute graph
    rc = tvm.reduce_axis((0, in_channels), name='rc')
    rw = tvm.reduce_axis((0, kernel_size), name='rw')

    return tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(
            temp[b, rc, w * strides + rw * dilation].astype(out_dtype)
            * kernel[c, rc, rw].astype(out_dtype),
            axis=[rc, rw]),
        tag="conv1d_ncw")
Example #44
0
    def check_device(device, target_device):
        if not tvm.runtime.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4, )

        # Insert copy nodes for data transferring between add and sub nodes.
        # Transfers data from gpu to cpu.
        copy_add_sub = tvm.placeholder(shape, name="__copy0")
        # Transfers data from cpu to gpu.
        copy_sub_add = tvm.placeholder(shape, name="__copy1")

        # Create a module containing adds on the device.
        tensor_a = tvm.placeholder(shape, name="A")
        tensor_b = tvm.placeholder(shape, name="B")
        tensor_d = tvm.placeholder(shape, name="D")
        elemwise_add0 = tvm.compute(shape,
                                    lambda *i: tensor_a(*i) + tensor_b(*i),
                                    name="elemwise_add0")
        elemwise_add1 = tvm.compute(shape,
                                    lambda *i: copy_sub_add(*i) + tensor_d(*i),
                                    name="elemwise_add1")
        target = topi.cpp.TEST_create_target(device)
        add_schedule0 = topi.cpp.cuda.schedule_injective(
            target, [elemwise_add0])
        lower_add0 = tvm.lower(add_schedule0,
                               [tensor_a, tensor_b, elemwise_add0],
                               name="elemwise_add0")
        add_schedule1 = topi.cpp.cuda.schedule_injective(
            target, [elemwise_add1])
        lower_add1 = tvm.lower(add_schedule1,
                               [tensor_d, copy_sub_add, elemwise_add1],
                               name="elemwise_add1")
        # Create module for sub whose target is the host.
        tensor_c = tvm.placeholder(shape, name="C")
        elemwise_sub = tvm.compute(shape,
                                   lambda *i: copy_add_sub(*i) - tensor_c(*i),
                                   name="elemwise_sub")
        sub_schedule = tvm.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(sub_schedule,
                              [copy_add_sub, tensor_c, elemwise_sub],
                              name="elemwise_sub")

        target_flist = {
            target_device: [lower_add0, lower_add1],
            target_host: [lower_sub]
        }
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        params = {}
        params["A"] = tensor_a = np.random.uniform(size=shape).astype(
            tensor_a.dtype)
        params["B"] = tensor_b = np.random.uniform(size=shape).astype(
            tensor_b.dtype)
        params["C"] = tensor_c = np.random.uniform(size=shape).astype(
            tensor_c.dtype)
        params["D"] = tensor_d = np.random.uniform(size=shape).astype(
            tensor_d.dtype)

        def check_verify():
            mod = graph_runtime.create(graph, mhost, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(),
                                    tensor_a + tensor_b - tensor_c + tensor_d)

        def check_load_module():
            temp = util.tempdir()
            path_lib = temp.relpath("deploy.so")
            mhost.export_library(path_lib)
            with open(temp.relpath("deploy.json"), "w") as out_file:
                out_file.write(graph)
            loaded_lib = tvm.runtime.load_module(path_lib)
            loaded_graph = open(temp.relpath("deploy.json")).read()
            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(),
                                    tensor_a + tensor_b - tensor_c + tensor_d)

        check_verify()
        check_load_module()
Example #45
0
def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'):
    batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape]
    num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape]
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        stride_h, stride_w = stride
    else:
        stride_h, stride_w = stride, stride

    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
    oshape = (batch, out_channel, out_height, out_width)

    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    block_w = 1
    block_h = 1
    if stride_h == 2:
        if num_filter + kernel_h == 515:
            block_h = 4
            block_w = 4
        else:
            block_h = 4
            block_w = 5
    elif kernel_h == 3:
        if num_filter == 512:
            block_h = 2
            block_w = 7
        else:
            block_h = 2
            block_w = 14
    elif kernel_h == 7 and padding == 3 and stride == 1:
        block_h = 3
        block_w = 4
    else:
        block_h = 1
        block_w = 16

    attrs = {'block_h': block_h, 'block_w' : block_w}
    c_h = out_height
    c_w = out_width

    if not out_width % block_w == 0:
        c_w = (out_width // block_w + 1) * block_w

    if not out_height % block_h == 0:
        c_h = (out_height // block_h + 1) * block_h

    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
    temp = pad(data, pad_before, pad_after, name="pad_temp")

    nv = 16
    if not num_filter % nv == 0:
        num_filter = (num_filter // nv + 1) * nv
        out_channel = num_filter

    cshape = (batch, out_channel // nv, c_h, c_w, nv)
    kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)

    kernel_vec = tvm.compute(
        kvshape,
        lambda co, ci, kh, kw, vc:
        kernel[co*nv + vc][ci][kh][kw], name='kernel_vec')

    conv = tvm.compute(
        cshape,
        lambda nn, ff, yy, xx, vc:\
          tvm.sum(
              temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
              kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
              axis=[rc, ry, rx]), name='conv', attrs=attrs)

    output = tvm.compute(
        oshape,
        lambda nn, ff, yy, xx:
        conv[nn][ff//nv][yy][xx][ff%nv],
        name='output_unpack', tag='conv2d')

    return output
Example #46
0
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype,
                       num_tile):
    assert layout == "NCHW", "Only support NCHW"
    out_dtype = out_dtype or data.dtype

    N, CI, IH, IW = get_const_tuple(data.shape)
    _, CO, KH, KW = get_const_tuple(kernel.shape)

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
    bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom
    bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)

    OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
    OW = (IW - 1) * WSTR - pad_left - pad_right + KW

    dilated_input = dilate(data, [1, 1, HSTR, WSTR])
    data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left],
                   [0, 0, bpad_bottom, bpad_right])

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:  # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:  # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0", [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                       policy='candidate',
                       candidate=[[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                                  [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (N, OH // VH, OW // VW, CI, VH + KH - 1, VW + KW - 1)
    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    data_vec = tvm.compute(
        dvshape,
        lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH + vh][w * VW + vw],
        name='data_vec')

    kernel_vec = tvm.compute(
        kvshape,
        lambda co, ci, kh, kw, vc: kernel[ci][co * VC + vc][kh][kw],
        name='kernel_vec_conv2d_transpose')

    ci = tvm.reduce_axis((0, CI), name='ci')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) *
                kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype),
                axis=[ci, kh, kw]), name='conv')

    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h // VH][
                             w // VW][h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='spatial_conv2d_transpose_output')
    return output
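# Worked example of the output-size bookkeeping above (illustrative numbers):
# dilating the input by the stride and padding with KH-1-pad on each border
# reduces the transposed convolution to an ordinary stride-1 convolution.
IH, HSTR, pad_top, pad_bottom, KH = 7, 2, 1, 1, 3
OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH                   # 13
dilated = (IH - 1) * HSTR + 1                                      # 13 rows
padded = dilated + (KH - 1 - pad_top) + (KH - 1 - pad_bottom)      # 15 rows
assert OH == padded - KH + 1                                       # stride-1 conv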
Example #47
0
def _im2col_pack(wkl, sch, data, kernel, stride, padding, out_dtype):
    """ Compute convolution with im2col pack layout. """
    assert data.shape[
        0].value == 1, "im2col pack convolution only supports batch size 1"

    N = 1
    H, W = wkl.height, wkl.width
    CI = wkl.in_filter
    CO = wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1

    P = sch.vp
    Q = sch.vq
    UNROLL = sch.unroll

    dshape = (N, CI, H, W)
    dpshape = (N, CI, H + 2 * HPAD, W + 2 * WPAD)
    dcshape = (N, OH, OW, CI, KH, KW)
    dvshape = (N, OH * OW // P, CI, KH, KW, P)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO // Q, CI, KH, KW, Q)

    ovshape = (N, CO // Q, OH * OW // P, P, Q)
    oshape = (N, CO, OH, OW)

    ############### declaration

    DO_PAD = (wkl.hpad != 0 or wkl.wpad != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_col = tvm.compute(dcshape, lambda n, oh, ow, ci, hk, wk: \
        data_pad[n][ci][oh*HSTR+hk][ow*WSTR+wk], name='data_col')

    data_vec = tvm.compute(dvshape, lambda n, im, ci, hk, wk, vim: \
        data_col[n][(im*P+vim)//OW][(im*P+vim)%OW][ci][hk][wk], name='data_vec')


    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*Q+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    hk = tvm.reduce_axis((0, KH), name='hk')
    wk = tvm.reduce_axis((0, KW), name='wk')

    conv = tvm.compute(ovshape, lambda n, co, im, vim, vco: \
        tvm.sum(data_vec[n][im][ci][hk][wk][vim].astype(out_dtype) *
                kernel_vec[co][ci][hk][wk][vco].astype(out_dtype),
                axis=[ci, hk, wk]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w: \
                         conv[n][co//Q][(h*OW+w)//P][(h*OW+w)%P][co%Q],
                         name='output_vec', tag='im2col_conv_output')

    return output
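# A small numpy illustration (hypothetical shapes) of the im2col idea used
# above: every KHxKW patch becomes one column, so convolution reduces to a
# single matrix multiplication.
import numpy as np

H = W = 4
KH = KW = 3
x = np.random.randn(H, W)
cols = np.stack([x[i:i + KH, j:j + KW].ravel()
                 for i in range(H - KH + 1)
                 for j in range(W - KW + 1)], axis=1)    # (KH*KW, OH*OW)
k = np.random.randn(KH, KW)
out = k.ravel() @ cols                                   # (OH*OW,)
ref = np.array([(x[i:i + KH, j:j + KW] * k).sum()
                for i in range(H - KH + 1)
                for j in range(W - KW + 1)])
assert np.allclose(out, ref)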
Example #48
0
def test_tensorize_matmul():
    n = 1024
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda i, j: tvm.sum(B[j, k] * A[i, k], axis=k),
                    name='C')

    def check(factor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        yo, yi = s[C].split(y, factor=factor)
        gemv = intrin_gemv(factor, l)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.CanonicalSimplify(body[0]),
            tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.CanonicalSimplify(body[0]),
            tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.CanonicalSimplify(body[0]),
            tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset_multi_reduction(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        roo, roi = s[C].split(ro, factor=2)
        s[C].reorder(yo, roo, roi, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.CanonicalSimplify(body[0]),
            tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    check(16)
    check_rfactor(16, 16)
    check_rfactor_no_reset(16, 16)
    check_rfactor_no_reset_multi_reduction(16, 16)
Example #49
0
def _spatial_conv_only(wkl, sch, data_vec, kernel_vec, out_dtype):
    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH - 1, KW - 1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2 * HPAD
    TW = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')
    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h // VH][
                             w // VW][h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='spatial_conv_output')

    C0, C = conv, output

    s = tvm.create_schedule(C.op)
    traverse(s, C.op)

    CC = s.cache_write(C0, "global")
    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)

    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    # if C != C1:
    #     s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return C, s
Example #50
0
def _compute(attrs, x, _):
    # `f` is the scalar elementwise op captured from the enclosing scope
    x = x[0]
    scalar = attrs.get_float("scalar")
    scalar = tvm.const(scalar, x.dtype)
    return tvm.compute(x.shape, lambda *i: f(x(*i), scalar))
Example #51
0
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if not pre_computed: # kernel tensor is raw tensor, do strict check
        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))

        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
        HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
    else:                   # kernel tensor is pre-transformed; this op is created
                            # by alter_op_layout, so skip the strict check
        # dilation is not supported
        HSTR = WSTR = 1
        HPAD = WPAD = 1
        KH = KW = 3
        _, _, CI, CO = get_const_tuple(kernel.shape)

    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], dtype=np.float32)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1]], np.float32)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
                                  tvm.sum(kernel[co][ci][r_kh][r_kw] *
                                          G[eps][r_kh] * G[nu][r_kw],
                                          axis=[r_kh, r_kw]), name='kernel_pack')
    else:
        kernel_pack = kernel

    # pack input tile
    input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu:
                             data_pad[p // (nH * nW)][c][p // nW % nH * m + eps]
                             [p % nW * m + nu], name='d')

    # transform data
    B = const_matrix(B_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
                            tvm.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
                                    axis=[r_a, r_b]), name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
                        tvm.sum(kernel_pack[eps][nu][ci][co] *
                                data_pack[eps][nu][ci][p],
                                axis=[ci]), name='bgemm')

    # inverse transform
    A = const_matrix(A_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
                          tvm.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
                                  axis=[r_a, r_b]), name='inverse')

    # output
    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
                         inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m],
                         name='output', tag='conv2d_nchw_winograd')
    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output
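# Sanity check of the tile_size == 2 transforms above (a standalone numpy
# sketch, not part of the original source): the Winograd identity
# Y = A^T [(G g G^T) * (B^T d B)] A reproduces a direct 2x2 "valid" correlation.
import numpy as np

G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
B = np.array([[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], float)
A = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], float)

d = np.random.randn(4, 4)            # one 4x4 input tile
g = np.random.randn(3, 3)            # one 3x3 kernel
m = (G @ g @ G.T) * (B.T @ d @ B)    # element-wise product in Winograd domain
y = A.T @ m @ A                      # inverse transform -> 2x2 output tile
ref = np.array([[(d[i:i + 3, j:j + 3] * g).sum() for j in range(2)]
                for i in range(2)])
assert np.allclose(y, ref)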
Example #52
0
import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straightforward way to call a target-specific function is via
# the extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# the :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "rocm", name="myexp")
print(f.get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, this approach only works for the CUDA target with float type.
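# A minimal sketch of the unified intrinsic this section introduces (assuming
# the same tutorial context): :any:`tvm.exp` is target-neutral and is lowered
# to the appropriate device function during code generation.

n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")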
Example #53
0
def _spatial_pack(data, kernel, stride, padding, out_dtype=None):
    """ Compute convolution with pack on spatial axes. """
    if out_dtype is None:
        out_dtype = data.dtype
    assert data.shape[
        0].value == 1, "spatial pack convolution only supports batch size 1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH - 1, KW - 1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2 * HPAD
    TW = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT,
               VW * WSTR + WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO // VC, CI, KH, KW, VC)

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape,
                         lambda n, co, h, w: conv[n][co // VC][h // VH][w // VW]
                         [h % VH][w % VW][co % VC],
                         name='output_unpack',
                         tag='spatial_conv_output')

    return output
Example #54
0
def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation,
                           layout, out_dtype):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    assert layout == 'NCHW', "only support NCHW convolution for AVX"

    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(dilation, int):
        dilation_h, dilation_w = dilation, dilation
    else:
        dilation_h, dilation_w = dilation

    HPAD, WPAD = padding
    HSTR, WSTR = strides

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    pad_height = in_height + 2 * HPAD
    pad_width = in_width + 2 * WPAD

    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
    out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1
    out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1

    # pack data
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    # fetch schedule
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width)
    data_vec = tvm.compute(
        shape,
        lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w],
        name='data_vec')

    # pack kernel
    shape = (num_filter // oc_bn, in_channel // ic_bn, kernel_height,
             kernel_width, ic_bn, oc_bn)
    kernel_vec = tvm.compute(shape,
                             lambda CO, CI, h, w, ci, co: kernel[
                                 CO * oc_bn + co, CI * ic_bn + ci, h, w],
                             name='kernel_vec')

    # convolution
    oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn)
    unpack_shape = (batch_size, num_filter, out_height, out_width)

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    conv = tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_vec[n, ic // ic_bn, oh * HSTR + kh * dilation_h,
                     ic % ic_bn, ow * WSTR + kw * dilation_w].astype(out_dtype) *
            kernel_vec[oc_chunk, ic // ic_bn, kh, kw,
                       ic % ic_bn, oc_block].astype(out_dtype),
            axis=[ic, kh, kw]),
        name='conv')

    unpack = tvm.compute(
        unpack_shape,
        lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn].astype(out_dtype),
        name='output_unpack',
        tag='conv2d_nchw')
    return unpack
Example #55
0
def non_max_suppression_gpu(data,
                            valid_count,
                            max_output_size=-1,
                            iou_threshold=0.5,
                            force_suppress=False,
                            top_k=-1,
                            coord_start=2,
                            score_index=1,
                            id_index=0,
                            return_indices=True,
                            invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        Index of the class categories; -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all invalid bounding boxes to the bottom.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold,
                                  force_suppress=force_suppress, top_k=top_k, return_indices=False)
        np_data = np.random.uniform(size=dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        ctx = tvm.gpu(0)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape,
                                      valid_count_dtype,
                                      "valid_count_buf",
                                      data_alignment=4)
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape,
                               lambda i, j: data[i, j, score_axis],
                               tag=tag.ELEMWISE)
    sort_tensor = argsort(score_tensor,
                          valid_count=valid_count,
                          axis=1,
                          is_ascend=False)

    sort_tensor_buf = api.decl_buffer(sort_tensor.shape,
                                      sort_tensor.dtype,
                                      "sort_tensor_buf",
                                      data_alignment=8)

    data_buf = api.decl_buffer(data.shape,
                               data.dtype,
                               "data_buf",
                               data_alignment=8)

    out_buf = api.decl_buffer(data.shape,
                              data.dtype,
                              "out_buf",
                              data_alignment=8)

    out, box_indices = \
        tvm.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       max_output_size, iou_threshold, force_suppress,
                       top_k, coord_start, id_index, score_index),
                   dtype=[data.dtype, "int32"],
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   name="nms",
                   tag="nms")

    if return_indices:
        return box_indices

    if invalid_to_bottom:
        output_buf = api.decl_buffer(data.shape,
                                     data.dtype,
                                     "output_buf",
                                     data_alignment=8)
        temp_flag_buf = api.decl_buffer(score_shape,
                                        valid_count_dtype,
                                        "temp_flag",
                                        data_alignment=8)
        temp_idx_buf = api.decl_buffer(score_shape,
                                       valid_count_dtype,
                                       "temp_idx",
                                       data_alignment=8)
        temp_flag, temp_idx = tvm.extern(
            [score_shape, score_shape], [out],
            lambda ins, outs: invalid_to_bottom_pre(ins[0], outs[0], outs[1]),
            dtype=["int32", "int32"],
            in_buffers=[out_buf],
            out_buffers=[temp_flag_buf, temp_idx_buf],
            name="invalid_to_bottom_phase_one")

        output = tvm.extern([data.shape], [out, temp_flag, temp_idx],
                            lambda ins, outs: invalid_to_bottom_ir(
                                ins[0], ins[1], ins[2], outs[0]),
                            dtype=[data.dtype],
                            in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
                            out_buffers=[output_buf],
                            name="invalid_to_bottom",
                            tag="invalid_to_bottom")
        return output

    return out
Example #56
import tvm

n = 1024
A = tvm.placeholder((n,), name='A')
k = tvm.reduce_axis((0, n), name='k')

B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')

s = tvm.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

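# Split the reduction axis: the single k loop becomes two nested loops
# (k.outer and k.inner) in the lowered IR printed below.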
ko, ki = s[B].split(B.op.reduce_axis[0], factor=32)

print(tvm.lower(s, [A, B], simple_mode=True))
Example #57
import numpy as np
import tvm

######################################################################
# Define Matrix Multiplication
# ----------------------------
# Take matrix multiplication as our example.
# Matmul first multiplies the corresponding elements of the two matrices,
# then accumulates along a chosen axis.
# The following lines describe the computation :code:`A * B^T` in TVM.
#
N, M, L = 1024, 512, 64
A = tvm.placeholder((N, L), name='A')
B = tvm.placeholder((M, L), name='B')
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M),
                lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k),
                name='C')
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

######################################################################
# Schedule the Matmul
# -------------------
# Now, suppose we have an accelerator that supports
# matrix-vector multiplication (GEMV) as a hardware primitive.
# It can take a reduce axis of arbitrary size, but the other axis
# must be no larger than 16. Thus we break the matmul loops down
# so that the innermost loops form a (16x64) GEMV.
#
factor = 16
x, y = C.op.axis
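
######################################################################
# The tiling itself is cut off in this fragment; a minimal sketch of the
# split described above (an assumption, not the original code): split the
# second axis by ``factor`` and reorder so the innermost loops form the
# (16x64) GEMV.
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))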
Example #58
# .. note::
#
#   Now we go back to the local machine, which has TVM fully installed
#   (with LLVM).
#
# Here we will declare a simple kernel on the local machine:

import numpy as np

import tvm
from tvm import rpc
from tvm.contrib import util

n = tvm.convert(1024)
A = tvm.placeholder((n, ), name='A')
B = tvm.compute((n, ), lambda i: A[i] + 1.0, name='B')
s = tvm.create_schedule(B.op)

######################################################################
# Then we cross compile the kernel.
# The target should be 'llvm -target=armv7l-linux-gnueabihf' for
# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable
# on our webpage building server. See the detailed note in the following block.

local_demo = True

if local_demo:
    target = 'llvm'
else:
    target = 'llvm -target=armv7l-linux-gnueabihf'
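
######################################################################
# The build step itself is cut off here; a minimal sketch of what
# typically follows (the kernel name and file name are assumptions):
func = tvm.build(s, [A, B], target=target, name='add_one')
temp = util.tempdir()
path = temp.relpath('lib.tar')
func.export_library(path)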
Example #59
import tvm

# NOTE: `DIM` (the accelerator's systolic-array tile size) and `dtype` are
# defined outside this fragment in the original source; the values below
# are assumptions so that the snippet is self-contained.
DIM = 16
dtype = 'int8'

def intrinsic_gemm(i, j, k, il, jl, kl, ic, jc, kc):
    """
    Declare a tensor intrinsic computing (i, k) * (k, j).
    i, j, k: normal iteration sizes
    il, jl, kl: last-iteration sizes
    ic, jc, kc: last-iteration conditions
    """
    assert i * k + k * j <= 256 * 1024, 'input too large for scratchpad'
    assert 4 * (i * j) <= 64 * 1024, 'input too large for accumulator'

    a = tvm.placeholder((i, k), name='a', dtype=dtype)
    b = tvm.placeholder((k, j), name='b', dtype=dtype)
    kk = tvm.reduce_axis((0, k), name='k')
    c = tvm.compute((i, j),
                    lambda ii, jj: tvm.sum(a[ii, kk] * b[kk, jj], axis=kk),
                    name='c')

    strideA = tvm.var("sA")
    Ab = tvm.decl_buffer(a.shape,
                         a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[strideA, 1])
    strideB = tvm.var("sB")
    Bb = tvm.decl_buffer(b.shape,
                         b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[strideB, 1])
    strideC = tvm.var("sC")
    Cb = tvm.decl_buffer(c.shape,
                         c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[strideC, 1])

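    # Number of DIM x DIM tiles in each dimension (ceiling division), and
    # the zero padding needed to round each dimension up to a whole tile.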
    II = i // DIM + (0 if i % DIM == 0 else 1)
    JJ = j // DIM + (0 if j % DIM == 0 else 1)
    KK = k // DIM + (0 if k % DIM == 0 else 1)
    pad_I = 0 if i % DIM == 0 else (DIM - i % DIM)
    pad_J = 0 if j % DIM == 0 else (DIM - j % DIM)
    pad_K = 0 if k % DIM == 0 else (DIM - k % DIM)

    IIl = il // DIM + (0 if il % DIM == 0 else 1)
    JJl = jl // DIM + (0 if jl % DIM == 0 else 1)
    KKl = kl // DIM + (0 if kl % DIM == 0 else 1)
    pad_Il = 0 if il % DIM == 0 else (DIM - il % DIM)
    pad_Jl = 0 if jl % DIM == 0 else (DIM - jl % DIM)
    pad_Kl = 0 if kl % DIM == 0 else (DIM - kl % DIM)

    II = tvm.if_then_else(ic, IIl, II)
    JJ = tvm.if_then_else(jc, JJl, JJ)
    KK = tvm.if_then_else(kc, KKl, KK)
    pad_I = tvm.if_then_else(ic, pad_Il, pad_I)
    pad_J = tvm.if_then_else(jc, pad_Jl, pad_J)
    pad_K = tvm.if_then_else(kc, pad_Kl, pad_K)

    # reset-update-finalize
    def intrin_func(ins, outs):
        aa, bb = ins
        cc, = outs

        def _body():
            ib = tvm.ir_builder.create()
            # int32_t matmul_kernel(const elem_t *A, const elem_t *B, const acc_t *D,
            #          elem_t *C, int32_t I, int32_t J, int32_t K, int32_t pad_I,
            #          int32_t pad_J, int32_t pad_K, int32_t A_row_len,
            #          int32_t B_row_len, int32_t D_row_len, int32_t C_row_len,
            #          bool no_bias, bool repeating_bias);
            # D is set to a dummy address 1 to determine whether to overwrite
            # accumulator contents: on the first run, 1 will be retained and
            # overwrite the value in the accumulator; on subsequent runs D will be
            # replaced by NULL and C will accumulate on top of the accumulator's contents
            # This is controlled via bit 1 << (ADDR_LEN - 2) - see kernel source
            ib.emit(
                tvm.call_extern("int32", "matmul_kernel", aa.access_ptr("r"),
                                bb.access_ptr("r"), 1, cc.access_ptr("rw"), II,
                                JJ, KK, pad_I, pad_J, pad_K, strideA, strideB,
                                0, strideC, True, False))
            return ib.get()

        def _reset():
            ib = tvm.ir_builder.create()
            # int32_t matmul_reset(elem_t *C, int32_t I, int32_t J, int32_t pad_I,
            #         int32_t pad_J, int32_t C_row_len);
            ib.emit(
                tvm.call_extern("int32", "matmul_reset", cc.access_ptr("w"),
                                II, JJ, pad_I, pad_J, strideC))
            return ib.get()

        def _finalize():
            ib = tvm.ir_builder.create()
            # Move out C from accumulator
            # int32_t matmul_finalize(elem_t *C, int32_t I, int32_t J, int32_t pad_I,
            #         int32_t pad_J, int32_t C_row_len);
            ib.emit(
                tvm.call_extern("int32", "matmul_finalize",
                                cc.access_ptr("rw"), II, JJ, pad_I, pad_J,
                                strideC))
            return ib.get()

        # Returned tuple: (standalone body, reduce reset, reduce update,
        # reduce finalize). The standalone form (used when the reduce axis
        # is not split) is not provided here, hence None.
        return None, _reset(), _body(), _finalize()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op,
                                      intrin_func,
                                      binds={
                                          a: Ab,
                                          b: Bb,
                                          c: Cb
                                      },
                                      name="sp_gemm")
Example #60
import numpy as np
import tvm

# `target`, `M`, `K`, `N` and `dtype` are defined earlier in the original
# script; the values below are assumptions that keep this fragment runnable.
M = K = N = 1024
dtype = 'float32'
target = 'llvm'

ctx = tvm.context(target, 0)

# ground truth
a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx)
c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx)
answer = np.dot(a.asnumpy(), b.asnumpy())

###################
# TVM part
# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

s = tvm.create_schedule(C.op)

# Blocking by loop tiling
bn = 64
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], x_factor=bn, y_factor=bn)
k, = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=8)

# Hoist reduction domain outside the blocking loop
s[C].reorder(xo, yo, ko, ki, xi, yi)

# Vectorization
s[C].vectorize(yi)
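
###################
# A minimal sketch of the (elided) build-and-check step, assuming the
# script continues in the usual way:
func = tvm.build(s, [A, B, C], target=target, name='mmult')
func(a, b, c)
np.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)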