Ejemplo n.º 1
0
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Build a kernel computing the mean softmax cross-entropy loss.

    Two placeholders of ``shape`` are declared: ``y`` (the scores fed to
    softmax) and ``y_`` (the weights multiplied with the log-probabilities).
    The per-row losses are summed with ``keepdims=True`` and divided by
    ``shape[0]``.

    Hint: output shape should be (1,)

    Returns the result of ``tvm.build``; its arguments are ``(y, y_, out)``.
    """
    scores = tvm.placeholder(shape, dtype=dtype, name="y")
    targets = tvm.placeholder(shape, dtype=dtype, name="y_")

    # Per-row cross entropy: -sum_j targets[i, j] * log(softmax(scores)[i, j]).
    log_probs = topi.log(topi.nn.softmax(scores))
    row_loss = -topi.sum(targets * log_probs, axis=1)

    # Average over the first dimension.
    mean_loss = topi.sum(row_loss, keepdims=True) / shape[0]

    sched = tvm.create_schedule(mean_loss.op)
    return tvm.build(sched, [scores, targets, mean_loss], tgt,
                     target_host=tgt_host, name=func_name)
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Build a kernel that sums a ``shape``-sized tensor over axis 0.

    The reduced axis is dropped (``keepdims=False``). The built function
    takes ``(A, C)``: the input tensor and the output buffer.
    """
    source = tvm.placeholder(shape, dtype=dtype, name="A")
    reduced = topi.sum(source, axis=0, keepdims=False)

    sched = tvm.create_schedule(reduced.op)
    return tvm.build(sched, [source, reduced], tgt,
                     target_host=tgt_host, name=func_name)
Ejemplo n.º 3
0
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Compile a reduction that adds up the input along its first axis.

    A placeholder named ``A`` of the given ``shape``/``dtype`` is reduced with
    ``topi.sum(axis=0, keepdims=False)`` using the default schedule, and the
    result of ``tvm.build`` (arguments: input, output) is returned.
    """
    inp = tvm.placeholder(shape, dtype=dtype, name="A")
    out = topi.sum(inp, axis=0, keepdims=False)
    default_schedule = tvm.create_schedule(out.op)
    built = tvm.build(default_schedule, [inp, out], tgt,
                      target_host=tgt_host, name=func_name)
    return built
Ejemplo n.º 4
0
def l2norm_instance(data, eps, axis=None):
    """Apply L2 normalization to a 4-D tensor.

    For axis=None, y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))

    Parameters
    ----------
    data : tvm.Tensor
        4-D with NCHW or NHWC layout

    eps : float
        lower bound applied to the sum of squares before the square root

    axis : list of int
        axes over which the sum of squares is taken

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim lrn"

    squared = topi.cpp.pow(data, 2.0)
    squared_sum = topi.sum(squared, axis=axis, keepdims=True)
    # Broadcast the reduced sum back so it can be indexed like ``data``.
    expand_sum = topi.broadcast_to(squared_sum, data.shape)

    def _floor_at_eps(i, j, k, l):
        # Clamp from below so the denominator never drops under sqrt(eps).
        return tvm.max(expand_sum[i, j, k, l], eps)

    clamped = tvm.compute(expand_sum.shape, _floor_at_eps, tag='l2norm')
    return topi.broadcast_div(data, topi.sqrt(clamped))
    def check_cuda(dtype, m=32, n=32):
        """Check a fused elementwise + full-sum reduction on CUDA against numpy.

        Computes ``sum((a + b) + (a * b))`` for two ``(m, n)`` inputs of
        ``dtype``. Skips (with a message) when CUDA is unavailable or when
        ``dtype`` is float16 and the GPU lacks fp16 support.
        """
        cuda_ready = tvm.gpu(0).exist and tvm.runtime.enabled("cuda")
        if not cuda_ready:
            print("skip because cuda is not enabled..")
            return
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        lhs = te.placeholder((m, n), name="a", dtype=dtype)
        rhs = te.placeholder((m, n), name="b", dtype=dtype)
        added = lhs + rhs
        multiplied = lhs * rhs
        combined = topi.elemwise_sum([added, multiplied])
        total = topi.sum(combined)

        with tvm.target.cuda():
            reduce_sched = topi.cuda.schedule_reduce(total)
            ctx = tvm.gpu(0)
            func = tvm.build(reduce_sched, [lhs, rhs, total], 'cuda')

            # Random inputs and the numpy reference result.
            lhs_np = np.random.uniform(size=(m, n)).astype(lhs.dtype)
            rhs_np = np.random.uniform(size=(m, n)).astype(rhs.dtype)
            expected = np.sum(np.add(lhs_np * rhs_np, lhs_np + rhs_np))

            lhs_nd = tvm.nd.array(lhs_np, ctx)
            rhs_nd = tvm.nd.array(rhs_np, ctx)
            out_nd = tvm.nd.array(
                np.zeros(expected.shape, dtype=expected.dtype), ctx)

            func(lhs_nd, rhs_nd, out_nd)
            tvm.testing.assert_allclose(out_nd.asnumpy(), expected, rtol=1e-3)
Ejemplo n.º 6
0
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    """Verify a TOPI reduction fused with an elementwise map against numpy.

    The input is mapped through ``sqrt(exp(x))`` and then reduced with the
    operation named by ``type`` ("sum", "max", "min", "argmax" or "argmin").
    The check runs on opencl, cuda, metal and rocm, skipping devices that are
    not enabled. Any other ``type`` raises ``NotImplementedError``.
    """
    value_reductions = ("sum", "max", "min")
    index_reductions = ("argmax", "argmin")

    # Build the logic and compile the function
    dat_dtype = "float32"
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
    A1 = topi.sqrt(topi.exp(A))
    if type not in value_reductions + index_reductions:
        raise NotImplementedError
    B = getattr(topi, type)(A1, axis=axis, keepdims=keepdims)
    # Index-producing reductions return int32; the others keep float32.
    out_dtype = "int32" if type in index_reductions else "float32"

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.create(device):
            sched = topi.generic.schedule_reduce(B)
        ctx = tvm.context(device, 0)
        compiled = tvm.build(sched, [A, B], device, name="sum")

        # Numpy reference for the same map + reduce.
        in_npy = np.random.uniform(size=in_shape).astype(np.float32)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
        if type in value_reductions:
            out_npy = getattr(in_npy_map, type)(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)

        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        compiled(data_tvm, out_tvm)
        np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    for device_name in ("opencl", "cuda", "metal", "rocm"):
        check_device(device_name)
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Build a kernel computing the mean softmax cross-entropy of a matrix.

    The softmax is written out by hand: each row is shifted by its maximum
    before exponentiation, then divided by the row sum. The loss is the
    negated sum of ``y_real * log(softmax(y))`` divided by ``shape[0]``.

    Hint: output shape should be (1,)

    The built function takes ``(y, y_real, mean_loss)``.
    """
    # --- softmax -----------------------------------------------------------
    logits = tvm.te.placeholder(shape, dtype=dtype, name="y")
    row_max = topi.max(logits, axis=1, keepdims=False)

    def _shifted_exp(i, j):
        # Subtracting the row maximum keeps the exponent argument <= 0.
        return tvm.te.exp(logits[i, j] - row_max[i])

    exps = tvm.te.compute(shape, _shifted_exp, name="exp_element")
    exp_row_sum = topi.sum(exps, axis=1, keepdims=False)

    def _normalise(i, j):
        return exps[i, j] / exp_row_sum[i]

    probs = tvm.te.compute(shape, _normalise, name="soft_max")

    # --- cross entropy -----------------------------------------------------
    labels = tvm.te.placeholder(shape, dtype=dtype, name="y_real")
    col = tvm.te.reduce_axis((0, shape[1]), name="j")

    def _row_loss(i):
        return tvm.te.sum(labels[i, col] * tvm.te.log(probs[i, col]), col)

    per_row = tvm.te.compute((shape[0], ), _row_loss, name="loss")

    total = topi.sum(per_row, axis=0, keepdims=True)
    # Negate and average in one step; the result has shape (1,).
    mean_loss = tvm.te.compute((1, ), lambda *i: -1 * total(*i) / shape[0],
                               "mean_loss")

    sched = tvm.te.create_schedule(mean_loss.op)
    return tvm.build(sched, [logits, labels, mean_loss],
                     tgt,
                     target_host=tgt_host,
                     name=func_name)
Ejemplo n.º 8
0
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Build a kernel that sums a tensor of ``shape`` over axis 0.

    Parameters
    ----------
    shape : tuple of int
        Shape of the input tensor ``A``.
    tgt : str
        Build target. When it equals ``"cuda"`` the output axes are bound to
        GPU block/thread indices; otherwise the default schedule is used.
    tgt_host : str
        Host target passed through to ``tvm.build``.
    func_name : str
        Name given to the generated function.
    dtype : str
        Element type of the input.

    Returns
    -------
    The built module; call it with ``(A, C)``.
    """
    A = te.placeholder(shape, dtype=dtype, name="A")
    C = topi.sum(A, axis=0, keepdims=False)

    s = te.create_schedule(C.op)

    if tgt == "cuda":
        # Fuse every output axis before splitting. Indexing ``C.op.axis[1]``
        # directly fails when the output has a single axis (2-D input), and
        # leaves the remaining axes unbound for higher ranks.
        fused = s[C].fuse(*C.op.axis)
        bx, tx = s[C].split(fused, factor=32)
        s[C].bind(bx, te.thread_axis("blockIdx.x"))
        s[C].bind(tx, te.thread_axis("threadIdx.x"))

    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f
Ejemplo n.º 9
0
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name,
                              dtype="float32"):
    """Build an axis-0 sum kernel, bind it to GPU threads and export it.

    The first output axis is bound to ``threadIdx.x``. The built module is
    handed to ``_export_module`` and that call's result is returned.

    NOTE(review): ``remote`` is not a parameter or local of this function;
    presumably it is a module-level object defined elsewhere -- confirm.
    """
    source = tvm.placeholder(shape, dtype=dtype, name="A")
    reduced = topi.sum(source, axis=0, keepdims=False)

    sched = tvm.create_schedule(reduced.op)

    # The block axis is created but currently not bound to anything.
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")

    print(reduced.op.axis, reduced)
    first_axis = reduced.op.axis[0]
    sched[reduced].bind(first_axis, thread_x)

    built = tvm.build(sched, [source, reduced], tgt,
                      target_host=tgt_host, name=func_name)
    return _export_module(built, func_name, remote)
Ejemplo n.º 10
0
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    """Verify a TOPI sum/max/min reduction against numpy on several devices.

    Parameters
    ----------
    in_shape : tuple of int
        Shape of the input placeholder.
    axis : int, tuple of int or None
        Axis argument forwarded to both the TOPI and the numpy reduction.
    keepdims : bool
        Whether the reduced axes are kept with size one.
    type : str
        One of "sum", "max" or "min".

    Raises
    ------
    NotImplementedError
        If ``type`` is not one of the supported reductions.
    """
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        # Resolve the context from the device name. The previous
        # "cuda else OpenCL" choice handed an OpenCL context to the metal run.
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B], device, name="sum")

        # Test
        in_npy = np.random.normal(size=in_shape).astype(np.float32)
        if type == "sum":
            out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy.min(axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError

        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx)
        foo(data_tvm, out_tvm)
        np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    """Build a CUDA sum/max/min reduction and compare it with numpy.

    Sets the module-level ``TASK`` to a name derived from ``type`` and
    ``test_id`` before building. Raises ``NotImplementedError`` for any
    other ``type``.
    """
    global TASK
    task_templates = {
        "sum": "sum_map_id%d",
        "max": "max_map_id%d",
        "min": "min_map_id%d",
    }

    # Build the logic and compile the function
    A = te.placeholder(shape=in_shape, name="A")
    if type not in task_templates:
        raise NotImplementedError
    TASK = task_templates[type] % test_id
    B = getattr(topi, type)(A, axis=axis, keepdims=keepdims)

    sched = topi.cuda.schedule_reduce(B)
    with tvm.target.build_config(auto_unroll_max_step=16,
                                 auto_unroll_min_depth=0):
        fcuda = tvm.build(sched, [A, B], "cuda", name="sum")

    # Numpy reference using the reduction of the same name.
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    out_npy = getattr(in_npy, type)(axis=axis, keepdims=keepdims)

    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())

    # The kernel is launched twice before the result is checked.
    for _run in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy,
                                rtol=4e-4, atol=4e-4)
Ejemplo n.º 12
0
    def check(device, dtype, m=32, n=32):
        """Check a full sum whose first reduce axis is bound to threadIdx.x.

        Skips (with a message) when ``device`` is unavailable or when
        ``dtype`` is float16 and the device reports no fp16 support.
        """
        ctx = tvm.context(device, 0)
        if not (ctx.exist and tvm.runtime.enabled(device)):
            print("skip because", device, "is not enabled..")
            return
        if dtype == "float16" and not have_fp16(ctx.compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        src = tvm.te.placeholder((m, n), name="a", dtype=dtype)
        total = topi.sum(src)
        with tvm.target.create(device):
            sched = tvm.te.create_schedule(total.op)
            # Two reduce axes are expected; only the first one is bound.
            outer_axis, _ = total.op.reduce_axis
            sched[total].bind(outer_axis, tvm.te.thread_axis("threadIdx.x"))
            func = tvm.build(sched, [src, total], device)

            src_np = np.random.uniform(size=(m, n)).astype(src.dtype)
            expected = np.sum(src_np)
            src_nd = tvm.nd.array(src_np, ctx)
            out_nd = tvm.nd.array(
                np.zeros(expected.shape, dtype=expected.dtype), ctx)

            func(src_nd, out_nd)
            tvm.testing.assert_allclose(out_nd.asnumpy(), expected, rtol=1e-3)
Ejemplo n.º 13
0
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    """Compile a sum/max/min reduction for CUDA and validate it with numpy.

    The module-level ``TASK`` is set to a per-test name before the build.
    ``NotImplementedError`` is raised for unsupported ``type`` values.
    """
    global TASK
    task_name_formats = {
        "sum": "sum_map_id%d",
        "max": "max_map_id%d",
        "min": "min_map_id%d",
    }

    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type not in task_name_formats:
        raise NotImplementedError
    TASK = task_name_formats[type] % test_id
    reduce_op = getattr(topi, type)
    B = reduce_op(A, axis=axis, keepdims=keepdims)

    reduce_schedule = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        fcuda = tvm.build(reduce_schedule, [A, B], "cuda", name="sum")

    # Reference result from the numpy method with the same name.
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    out_npy = getattr(in_npy, type)(axis=axis, keepdims=keepdims)

    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())

    # Run the kernel twice, then compare the final output.
    for _attempt in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(
        out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
Ejemplo n.º 14
0
# Row-wise sum written by hand: declare the reduce axis `k` over the second
# dimension and sum A[i, k] along it.
# NOTE(review): `n` and `m` are not defined in this excerpt; presumably they
# are declared earlier in the tutorial -- confirm.
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)

######################################################################
# and to examine the IR code in human readable format, we can do
#
print(tvm.lower(s, [A], simple_mode=True))

######################################################################
# However, for such a common operation we had to define the reduce axis ourselves as well as the explicit computation with
# :code:`tvm.compute`. Imagine how many details we would need to provide for more complicated operations.
# Fortunately, we can replace those two lines with a simple :code:`topi.sum`, much like :code:`numpy.sum`
#
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))

######################################################################
# Numpy-style operator overloading
# --------------------------------
# We can add two tensors using :code:`topi.broadcast_add` that have correct (broadcastable with specific) shapes.
# Even shorter, TOPI provides operator overloading for such common operations. For example,
#
x, y = 100, 10
a = tvm.placeholder((x, y, y), name="a")
b = tvm.placeholder((y, y), name="b")
c = a + b  # same as topi.broadcast_add
d = a * b  # same as topi.broadcast_mul
Ejemplo n.º 15
0
# Row-wise sum written by hand: declare the reduce axis `k` over the second
# dimension and sum A[i, k] along it.
# NOTE(review): `n` and `m` are not defined in this excerpt; presumably they
# are declared earlier in the tutorial -- confirm.
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)

######################################################################
# and to examine the IR code in human readable format, we can do
#
print(tvm.lower(s, [A], simple_mode=True))

######################################################################
# However, for such a common operation we had to define the reduce axis ourselves as well as the explicit computation with
# :code:`tvm.compute`. Imagine how many details we would need to provide for more complicated operations.
# Fortunately, we can replace those two lines with a simple :code:`topi.sum`, much like :code:`numpy.sum`
#
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))

######################################################################
# Numpy-style operator overloading
# --------------------------------
# We can add two tensors using :code:`topi.broadcast_add` that have correct (broadcastable with specific) shapes.
# Even shorter, TOPI provides operator overloading for such common operations. For example,
#
x, y = 100, 10
a = tvm.placeholder((x, y, y), name="a")
b = tvm.placeholder((y, y), name="b")
c = a + b  # same as topi.broadcast_add
d = a * b  # same as topi.broadcast_mul
Ejemplo n.º 16
0
def verify_reduce_map_ele(in_shape,
                          axis,
                          keepdims,
                          type="sum",
                          dtype="float32"):
    """Verify a TOPI reduction against numpy on every available backend.

    For the numeric reductions the input is first mapped through
    ``sqrt(exp(x))``; the logical reductions ("all", "any") are applied to
    the raw input instead.

    Parameters
    ----------
    in_shape : tuple of int
        Shape of the input placeholder.
    axis : int or None
        Axis argument forwarded to the reductions. The argmax/argmin check
        below slices ``in_shape`` with it, so it must be an int or None there.
    keepdims : bool
        Whether reduced axes are kept with size one.
    type : str
        One of "sum", "all", "any", "max", "min", "argmax", "argmin".
    dtype : str
        Input dtype; also the output dtype except for argmax/argmin (int32).

    Raises
    ------
    NotImplementedError
        For an unsupported ``type``, or for "all"/"any" with a non-bool
        ``dtype`` (raised when the numpy reference is selected).
    """
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "all":
        # Logical reductions use the unmapped input A.
        B = topi.all(A, axis=axis, keepdims=keepdims)
    elif type == "any":
        B = topi.any(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        # Build for `device`, run once, and compare with the numpy reference.
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        if dtype == 'bool':
            # Boolean inputs are used as-is: no sqrt(exp(.)) mapping.
            in_npy_map = in_npy = np.random.choice([True, False],
                                                   size=in_shape)
        else:
            in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
            in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)

        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "all" and dtype == 'bool':
            out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
        elif type == "any" and dtype == "bool":
            out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            # For index reductions the values selected by TVM's indices are
            # compared with numpy's max/min rather than comparing indices
            # directly (presumably to tolerate ties -- confirm).
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                # Drop the size-one reduced axis so the indices line up with
                # the non-reduced dimensions.
                out_tvm_indices = np.take(out_tvm_indices,
                                          indices=0,
                                          axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                # Index grids for every non-reduced dimension, with the TVM
                # indices spliced in at position `axis`.
                other_indices = tuple(
                    np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
                sel_indices = other_indices[0:axis] + (
                    out_tvm_indices, ) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                tvm.testing.assert_allclose(out_tvm_val,
                                            in_npy_map.max(axis=axis), 1E-3,
                                            1E-3)
            elif type == "argmin":
                tvm.testing.assert_allclose(out_tvm_val,
                                            in_npy_map.min(axis=axis), 1E-3,
                                            1E-3)
        else:
            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    for device in get_all_backend():
        check_device(device)
Ejemplo n.º 17
0
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    """Verify a TOPI reduction of ``sqrt(exp(x))`` against numpy.

    Parameters
    ----------
    in_shape : tuple of int
        Shape of the float32 input placeholder.
    axis : int or None
        Axis argument forwarded to the reductions. The argmax/argmin check
        below slices ``in_shape`` with it, so it must be an int or None there.
    keepdims : bool
        Whether reduced axes are kept with size one.
    type : str
        One of "sum", "max", "min", "argmax", "argmin".

    Raises
    ------
    NotImplementedError
        If ``type`` is not one of the supported reductions.
    """
    # Build the logic and compile the function
    dat_dtype = "float32"
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = "float32"
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        # Build for `device`, run once, and compare with the numpy reference.
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        in_npy = np.random.uniform(size=in_shape).astype(np.float32)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            # For index reductions the values selected by TVM's indices are
            # compared with numpy's max/min rather than comparing indices
            # directly (presumably to tolerate ties -- confirm).
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                # Drop the size-one reduced axis before using the indices.
                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                # Index grids for every non-reduced dimension, with the TVM
                # indices spliced in at position `axis`.
                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis+1):]))
                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
            elif type == "argmin":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
        else:
            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan"]:
        check_device(device)
Ejemplo n.º 18
0
# Unroll the recurrent cell over `num_timesteps` steps.
# NOTE(review): xs, new_h, new_s, weights, num_timesteps, batch_size, lr, x, y,
# s, h and parallel_schedule are defined outside this excerpt. The gate layout
# (one tanh candidate plus three sigmoid gates) looks like an LSTM cell --
# confirm against the surrounding code.
for i in range(num_timesteps):
    # Each gate sees the current input concatenated with the previous hidden
    # state along axis 1.
    inp = topi.concatenate([xs[i], new_h], 1)
    g = topi.tanh(topi.matmul(inp, weights[0]) + weights[1])
    j = topi.sigmoid(topi.matmul(inp, weights[2]) + weights[3])
    f = topi.sigmoid(topi.matmul(inp, weights[4]) + weights[5])
    o = topi.sigmoid(topi.matmul(inp, weights[6]) + weights[7])

    # State update: scale the old state by f, add the candidate scaled by j.
    new_s = new_s * f + g * j
    new_h = topi.tanh(new_s) * o

# Output projection applied to the last hidden state.
logits = topi.matmul(new_h, weights[8]) + weights[9]

# compute accuracy
pred = topi.nn.softmax(logits)
correct_pred = topi.equal(topi.argmax(y, 1), topi.argmax(pred, 1))
accuracy = topi.sum(correct_pred.astype('float32')) / batch_size

# Define loss and optimizer
loss = topi.sum(-topi.sum(y *
                          topi.nn.log_softmax(logits), axis=1)) / batch_size

# Differentiate the loss (reshaped to (1,)) w.r.t. every weight, using a head
# of ones, then take one plain gradient-descent step with rate `lr`.
head = topi.full((1, ), 'float32', 1.0)
gradients = list(tvm.differentiate(topi.reshape(loss, (1, )), weights, head))
new_weights = [w - lr * g for (w, g) in zip(weights, gradients)]

# Define model
# A single schedule covers the loss, the accuracy and all updated weights, so
# one built function produces all of them.
sched = tvm.create_schedule([loss.op, accuracy.op] +
                            [x.op for x in new_weights])
parallel_schedule(sched)
train_model = tvm.build(sched,
                        [x, y, s, h, loss, accuracy, *weights, *new_weights])
Ejemplo n.º 19
0
def demo_conv2d():
    """Build, differentiate and train a small conv net on MNIST-shaped data.

    The network is two conv + bias + max-pool + relu blocks followed by two
    dense layers. The loss is differentiated with respect to all eight
    parameter tensors via ``tvm.ir_pass.JacobianRecursive``, a single module
    producing the loss and all gradients is built, and ``nbatches`` plain
    gradient-descent steps are run with numpy-side parameter updates.

    Relies on ``get_shape``, ``mnist_img`` and ``mnist_cls_oh`` being
    available in the enclosing module. Prints progress; returns nothing.
    """
    lrate = 0.1
    nbatches = 100  # batches to train

    num_classes = 10
    batch_size = 10
    img_h = 28
    img_w = 28
    img_c = 1

    # Channel counts of the two conv layers and width of the first dense layer.
    f1_c = 4
    f2_c = 5
    f3_units = 16

    x = tvm.placeholder((batch_size, img_h, img_w, img_c), name='x')
    y = tvm.placeholder((batch_size, num_classes), name='y')

    print('Block1')
    w1 = tvm.placeholder((3, 3, img_c, f1_c), name='w1')
    b1 = tvm.placeholder((f1_c, ), name='b1')
    t = topi.nn.conv2d(x, w1, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b1, (batch_size, 1, 1, f1_c))
    print('Block1: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block1: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block1: after-relu shape is', get_shape(t))

    print('Block2')
    w2 = tvm.placeholder((3, 3, f1_c, f2_c), name='w2')
    b2 = tvm.placeholder((f2_c, ), name='b2')
    t = topi.nn.conv2d(t, w2, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b2, (batch_size, 1, 1, f2_c))
    print('Block2: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block2: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block2: after-relu shape is', get_shape(t))
    t = topi.nn.flatten(t)
    print('Block2: after-flattern shape is', get_shape(t))

    print('Block3')
    # Dense weights are (out_units, in_features); in_features is read from the
    # flattened activation.
    w3 = tvm.placeholder((f3_units, get_shape(t)[1]))
    b3 = tvm.placeholder((f3_units, ))
    t = topi.nn.dense(t, w3, b3)
    print('Block3: after-dense shape is', get_shape(t))

    print('Block4')
    w4 = tvm.placeholder((num_classes, get_shape(t)[1]))
    b4 = tvm.placeholder((num_classes, ))
    t = topi.nn.dense(t, w4, b4)
    print('Block4: after-dense shape is', get_shape(t))
    t = topi.nn.relu(t)

    p = topi.argmax(t, axis=1)
    # TODO: check the correctness of the log_softmax expression
    # TODO: figure out the difference between it and standard cross-entropy loss
    l = -topi.sum(y * topi.nn.log_softmax(t)) / batch_size

    print('Block4: loss shape is', get_shape(l))

    # Head gradient: ones shaped like the loss.
    ones = topi.full_like(l, 1.0)
    #[dl_dw1,dl_db1,dl_dw2,dl_db2,dl_dw3,dl_db3,dl_dw4,dl_db4]
    params = [w1, b1, w2, b2, w3, b3, w4, b4]

    dl = list(tvm.ir_pass.JacobianRecursive(l, params, ones))
    assert len(params) == len(dl)
    print('dl_dw1 weight is', get_shape(params[0]))

    # One module computes the loss and every gradient in a single call.
    sdl = tvm.create_schedule([p.op for p in [x, y, l] + params + dl])
    mdl = tvm.build(sdl, [x, y, l] + params + dl)
    print('Train+Inference module', mdl)

    # sl = tvm.create_schedule([l.op])
    # ml = tvm.build(sdl, [x,y] + params + [l])
    # print('Inference module',ml)

    # Parameter values, keyed by placeholder and initialised uniformly in
    # [-1, 1). Note: `p` is rebound here and no longer refers to the argmax.
    state = {}
    for p in params:
        state.update({
            p:
            tvm.nd.array(
                np.random.uniform(-1.0, 1.0,
                                  size=get_shape(p)).astype(np.float32))
        })

    # Output buffers for the gradients, keyed by the matching parameter.
    grads = {}
    for p, g in zip(params, dl):
        grads.update({p: tvm.nd.empty(get_shape(g))})

    for ib in range(nbatches):
        # Sample indices belonging to batch `ib`.
        b = range(ib * batch_size, (ib + 1) * batch_size)
        tx = tvm.nd.array(mnist_img(b))
        ty = tvm.nd.array(mnist_cls_oh(b))
        tl = tvm.nd.empty(shape=(), dtype=tvm.float32)

        print('Entering')
        # Both dicts were filled by iterating `params`, so their values() are
        # in the same order as the argument list given to tvm.build (relies on
        # dict insertion order).
        mdl(*([tx, ty, tl] + list(state.values()) + list(grads.values())))
        print('Done', 'loss', tl.asnumpy())

        # Gradient-descent step done on the host with numpy, then wrapped back
        # into TVM arrays.
        state2 = {}
        for p in params:
            state2.update({
                p:
                tvm.nd.array(state[p].asnumpy() - lrate * grads[p].asnumpy())
            })

        state = state2
Ejemplo n.º 20
0
def compute_cross_entropy(attrs, inputs, out_dtype):
    """Return ``[-sum(log(x) * y) / x.shape[0]]`` for ``x, y = inputs``.

    ``attrs`` and ``out_dtype`` are accepted but not used. Presumably ``x``
    holds predicted probabilities and ``y`` the targets -- confirm against
    the operator registration.
    """
    predictions, targets = inputs
    weighted_log = topi.log(predictions) * targets
    loss = -topi.sum(weighted_log) / predictions.shape[0]
    return [loss]
Ejemplo n.º 21
0
def compute_cross_entropy_with_logits(attrs, inputs, out_dtype):
    """Return ``[-sum(x * y) / x.shape[0]]`` for ``x, y = inputs``.

    ``attrs`` and ``out_dtype` are accepted but not used. No logarithm is
    taken here, so ``x`` is presumably already in log space -- confirm
    against the operator registration.
    """
    scores, targets = inputs
    weighted = scores * targets
    loss = -topi.sum(weighted) / scores.shape[0]
    return [loss]
import tvm
import topi

# Two stacked conv2d layers on a (32, 3, 28, 28) input; `y` is the sum of all
# elements of the second layer's output.
x = tvm.te.placeholder((32, 3, 28, 28), name='x')
w1 = tvm.te.placeholder((10, 3, 3, 3), name='w1')
w2 = tvm.te.placeholder((10, 10, 3, 3), name='w2')
z1 = topi.nn.conv2d(x, w1, 1, 1, 1)
z2 = topi.nn.conv2d(z1, w2, 1, 1, 1)
y = topi.sum(z2)

# produce gradients
[dw1] = tvm.te.gradient(y, [w1])

print(type(dw1))

# produce Jacobians
# (no head adjoint is passed for the non-scalar output z2)
[jw1, jw2] = tvm.te.gradient(z2, [w1, w2])

# produce gradients, the head adjoint for z2 is provided manually
[dw1, dw2] = tvm.te.gradient(z2, [w1, w2], topi.full_like(z2, 1.0))
Ejemplo n.º 23
0
from __future__ import absolute_import, print_function
import tvm
import topi
import numpy as np

if __name__ == '__main__':
    # Average of a broadcast add and a broadcast multiply, reduced to a single
    # sum and lowered with the generic reduce schedule for CUDA.
    outer, inner = 100, 10
    lhs = tvm.placeholder((outer, inner, inner), name='a')
    rhs = tvm.placeholder((inner, inner), name='b')

    halved = topi.elemwise_sum([lhs + rhs, lhs * rhs]) / 2.0
    total = topi.sum(halved)

    with tvm.target.cuda():
        reduce_sched = topi.generic.schedule_reduce(total)
        print(tvm.lower(reduce_sched, [lhs, rhs], simple_mode=True))
Ejemplo n.º 24
0
# https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation

import tvm
import topi
import numpy

# Forward computation, built step by step: z6 = 1 / (1 + exp(-sum(x * w))).
x = tvm.te.placeholder((3, ), name='x')
w = tvm.te.placeholder((3, ), name='w')
z1 = topi.multiply(x, w)
z2 = topi.sum(z1)
z3 = topi.multiply(z2, -1)
z4 = topi.exp(z3)
z5 = topi.add(z4, 1)
z6 = topi.divide(1, z5)

# Gradient of z6 with respect to w, compiled into a function taking
# (x, w, dw).
[dw] = tvm.te.gradient(z6, w)
s = tvm.te.create_schedule(dw.op)
g = tvm.build(s, [x, w, dw])

# The default tensor type in tvm
dtype = "float32"
target = 'llvm'
ctx = tvm.context(target, 0)

# Fixed input tensors for testing
x1 = tvm.nd.array(numpy.array([1, 3, 2]).astype(dtype), ctx)
w1 = tvm.nd.array(numpy.array([2, 1, -2]).astype(dtype), ctx)
dw1 = tvm.nd.empty(shape=(3, ), dtype='float32', ctx=ctx)
g(x1, w1, dw1)
print("ret=", dw1)