Example #1
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
    else:
        if out_dtype is None:
            out_dtype = data.dtype
        batch, in_dim = get_const_tuple(data.shape)
        out_dim, _ = get_const_tuple(weight.shape)
        # create tuning space
        cfg.define_split("tile_x", out_dim, num_outputs=2)
        cfg.define_split("tile_y", batch, num_outputs=2)
        cfg.define_split("tile_k", in_dim, num_outputs=2)
        if cfg.is_fallback:
            _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

        vec = cfg["tile_k"].size[-1]
        k = tvm.reduce_axis((0, in_dim // vec), "k")
        CC = tvm.compute(
            (batch, out_dim, vec),
            lambda z, y, x: tvm.sum(data[z, k * vec + x].astype(out_dtype) *
                                    weight[y, k * vec + x].astype(out_dtype),
                                    axis=k))

        kk = tvm.reduce_axis((0, vec), "kk")
        C = tvm.compute((batch, out_dim),
                        lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                        tag="dense_nopack")
    if bias is not None:
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)

    return C
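A minimal NumPy sketch (illustrative only; the shapes are made up and in_dim is assumed divisible by vec) of the two-stage reduction above, showing that summing the partial products in CC reproduces data @ weight.T:

import numpy as np

batch, in_dim, out_dim, vec = 2, 8, 3, 4
data = np.random.rand(batch, in_dim)
weight = np.random.rand(out_dim, in_dim)

# First stage: partial sums over strided slices of the reduction axis.
CC = np.zeros((batch, out_dim, vec))
for k in range(in_dim // vec):
    for x in range(vec):
        CC[:, :, x] += np.outer(data[:, k * vec + x], weight[:, k * vec + x])

# Second stage: reduce the vec lanes to get the dense result.
C = CC.sum(axis=2)
np.testing.assert_allclose(C, data @ weight.T)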
Example #2
def verify_matmul_add(m, l, n, transa=False, transb=False, dtype=tvm.float32):
    bias = tvm.var('bias', dtype=dtype)
    ashape = (l, n) if transa else (n, l)
    bshape = (m, l) if transb else (l, m)
    A = tvm.placeholder(ashape, name='A', dtype=dtype)
    B = tvm.placeholder(bshape, name='B', dtype=dtype)
    C = cblas.matmul(A, B, transa, transb)
    D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def get_numpy(a, b, bb, transa, transb):
        if transa:
            a = a.transpose()
        if transb:
            b = b.transpose()
        return np.dot(a, b) + bb

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), bb, transa, transb), rtol=1e-5)
    verify()
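Hypothetical driver calls (dimensions chosen arbitrarily, not part of the original test file) exercising each combination of the transa/transb flags:

verify_matmul_add(235, 128, 1024)
verify_matmul_add(235, 128, 1024, True, False)
verify_matmul_add(235, 128, 1024, False, True)
verify_matmul_add(235, 128, 1024, True, True)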
Example #3
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(d.asnumpy(),
                                    np.dot(a.asnumpy(), b.asnumpy()) + bb,
                                    rtol=1e-5)

    verify()
Example #4
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
    else:
        if out_dtype is None:
            out_dtype = data.dtype
        batch, in_dim = get_const_tuple(data.shape)
        out_dim, _ = get_const_tuple(weight.shape)
        # create tuning space
        cfg.define_split("tile_y", batch, num_outputs=3)
        cfg.define_split("tile_x", out_dim, num_outputs=3)
        cfg.define_split("tile_k", in_dim, num_outputs=2)
        if cfg.is_fallback:
            _default_dense_pack_config(cfg, batch, out_dim, in_dim)

        packw_bn = cfg["tile_x"].size[-1]
        packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
        packw = tvm.compute(packw_shape,
                            lambda z, y, x: weight[z * packw_bn + x, y],
                            name="packed_weight")

        k = tvm.reduce_axis((0, in_dim), name="k")
        C = tvm.compute(
            (batch, out_dim),
            lambda y, x: tvm.sum(data[y, k].astype(out_dtype) * packw[
                x // packw_bn, k, x % packw_bn].astype(out_dtype),
                                 axis=k),
            tag="dense_pack")
    if bias is not None:
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
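A minimal NumPy sketch (made-up shapes, assuming out_dim is divisible by packw_bn) of the packed_weight layout above; reading packw[x // packw_bn, k, x % packw_bn] recovers weight[x, k], so C remains data times the transposed weight:

import numpy as np

out_dim, in_dim, packw_bn = 8, 4, 2
weight = np.arange(out_dim * in_dim).reshape(out_dim, in_dim)

# Pack packw_bn consecutive output rows into the innermost axis so that
# neighbouring output columns read contiguous memory.
packw = np.empty((out_dim // packw_bn, in_dim, packw_bn), dtype=weight.dtype)
for z in range(out_dim // packw_bn):
    for y in range(in_dim):
        for x in range(packw_bn):
            packw[z, y, x] = weight[z * packw_bn + x, y]

for x in range(out_dim):
    for k in range(in_dim):
        assert packw[x // packw_bn, k, x % packw_bn] == weight[x, k]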
Example #5
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
    verify()
Example #6
def dense_cblas(cfg, data, weight, bias=None, out_dtype=None):
    """Compute dense using cblas library"""
    M, K = get_const_tuple(data.shape)
    N, _ = get_const_tuple(weight.shape)
    cfg.add_flop(M * K * N * 2)
    C = cblas.matmul(data, weight, False, True)
    if bias is not None:
        C = tvm.compute(C.shape,
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
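The cfg.add_flop call above follows from each of the M * N outputs needing K multiply-add pairs, i.e. 2 * M * K * N floating-point operations; for instance M = 32, K = 256, N = 128 gives 2 * 32 * 256 * 128 = 2,097,152 FLOPs.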
Example #7
def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
        if bias is not None:
            C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                            tag=tag.BROADCAST)
        return C

    M, _ = get_const_tuple(data.shape)
    # For small batch sizes, don't pack weight into cache-friendly layout
    # because of overhead in packing and limited reuse from batch dimension
    # TODO(icemelon9): use a more systematic way to determine which schedule to use
    if M <= 16:
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)
    return _declaration_dense_pack(cfg, data, weight, bias, out_dtype)
Example #8
def dense_cblas(cfg, data, weight, bias=None, out_dtype=None):
    """Compute dense using cblas library"""
    M, K = get_const_tuple(data.shape)
    N, _ = get_const_tuple(weight.shape)
    cfg.add_flop(M * K * N * 2)
    if data.dtype == 'uint8' and weight.dtype == 'int8' and out_dtype == 'int32':
        C = cblas.matmul_u8s8s32(data, weight, False, True, dtype=out_dtype)
    elif data.dtype == 'float32':
        C = cblas.matmul(data, weight, False, True)
    else:
        raise NotImplementedError(f"Dense with cblas for {data.dtype} is not supported")

    if bias is not None:
        C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                       tag=tag.BROADCAST)
    return C
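A hypothetical placeholder setup (made-up shapes; requires a TVM build with the cblas/MKL contrib library enabled) that would take the matmul_u8s8s32 branch above:

from tvm import te
from tvm.contrib import cblas

data = te.placeholder((8, 64), name="data", dtype="uint8")
weight = te.placeholder((32, 64), name="weight", dtype="int8")
# uint8 x int8 accumulated into int32, with the weight transposed as in dense_cblas
C = cblas.matmul_u8s8s32(data, weight, False, True, dtype="int32")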
Example #9
def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None):
    target = tvm.target.current_target()
    if "cblas" in target.libs:
        C = cblas.matmul(data, weight, False, True)
        if bias is not None:
            C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j],
                            tag=tag.BROADCAST)
        return C

    M, _ = get_const_tuple(data.shape)
    # Always use dense_nopack for dynamic input.
    # This is a temporary solution for CV models.
    # TODO(kevinthesun): use kernel dispatcher instead.
    if isinstance(M, tvm.expr.Var):
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)

    # For small batch sizes, don't pack weight into cache-friendly layout
    # because of overhead in packing and limited reuse from batch dimension
    # TODO(icemelon9): use a more systematic way to determine which schedule to use
    if M <= 16:
        return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype)
    return _declaration_dense_pack(cfg, data, weight, bias, out_dtype)
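A minimal sketch (hypothetical names, legacy tvm/topi API as in the snippet; the get_const_tuple import path may differ across TVM versions) of the dynamic-batch case that the isinstance(M, tvm.expr.Var) check guards against:

import tvm
from topi.util import get_const_tuple

batch = tvm.var("batch")                        # symbolic batch dimension
data = tvm.placeholder((batch, 128), name="data")
M, _ = get_const_tuple(data.shape)
# With a symbolic batch, M stays a Var, so dense_nopack is selected.
assert isinstance(M, tvm.expr.Var)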
Example #10
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
f(a, b, d, bb)
tvm.testing.assert_allclose(d.asnumpy(),
                            np.dot(a.asnumpy(), b.asnumpy()) + 10,
                            rtol=1e-5)

######################################################################
# Extern Contrib Wrappers
# -----------------------
# TVM also provides extern contrib wrappers for useful extern calls;
# the following lines are equivalent to the previous example.
#
from tvm.contrib import cblas

C = cblas.matmul(A, B)
D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
s = te.create_schedule(D.op)
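For comparison, a sketch of the direct te.extern form that the cblas wrapper stands in for, following the pattern of the upstream extern tutorial (A, B, n and m are the placeholders and dimensions defined earlier in the tutorial, not in this fragment):

C = te.extern(
    (n, m), [A, B],
    lambda ins, outs: tvm.tir.call_packed(
        "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False),
    name="C")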


######################################################################
# Hook Python Function as Extern
# ------------------------------
# Since we can call into any PackedFunc in TVM, we can use the extern
# function to call back into Python.
#
# The following example registers a Python function into the TVM runtime system
# and uses it to complete one stage of the computation.
# This makes TVM much more flexible. For example, we can insert front-end
# callbacks to inspect the intermediate results or mix customized code
# with TVM.
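A minimal sketch of such a hook, modeled on the upstream extern tutorial (the registered name "tvm.contrib.my_tvm_addone" and the +1 computation are illustrative; the numpy/te/tvm imports and the length n from earlier in the tutorial are assumed):

@tvm.register_func("tvm.contrib.my_tvm_addone")
def my_tvm_addone(x, y):
    # The callback receives TVM NDArrays; compute in numpy and copy the result back.
    print("my_tvm_addone signatures: %s, %s" % (type(x), type(y)))
    tvm.nd.array(x.asnumpy() + 1).copyto(y)

A = te.placeholder((n,), name="A")   # re-declared here as a 1-D placeholder
B = te.extern(
    A.shape, [A],
    lambda ins, outs: tvm.tir.call_packed("tvm.contrib.my_tvm_addone", ins[0], outs[0]),
    name="C")
s = te.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm")
a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype))
b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype))
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)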
Example #11
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
f(a, b, d, bb)
tvm.testing.assert_allclose(
    d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)

######################################################################
# Extern Contrib Wrappers
# -----------------------
# TVM also provides extern contrib wrappers for useful extern calls;
# the following lines are equivalent to the previous example.
#
from tvm.contrib import cblas
C = cblas.matmul(A, B)
D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
s = tvm.create_schedule(D.op)

######################################################################
# Hook Python Function as Extern
# ------------------------------
# Since we can call into any PackedFunc in TVM, we can use the extern
# function to call back into Python.
#
# The following example registers a Python function into the TVM runtime system
# and uses it to complete one stage of the computation.
# This makes TVM much more flexible. For example, we can insert front-end
# callbacks to inspect the intermediate results or mix customized code
# with TVM.
#