Example #1
def log_softmax(x, axis=-1):
    """Perform log softmax activation on the data

    Parameters
    ----------
    x : tvm.te.Tensor
        2-D input data

    axis : int, optional
        The softmax axis; only the last axis is supported.

    Returns
    -------
    output : tvm.te.Tensor
        2-D output with same shape
    """
    assert len(x.shape) == 2, "only support 2-dim log softmax"
    # pylint: disable=R1714
    assert axis == -1 or axis == len(
        x.shape) - 1, "only support last axis log softmax"
    m, n = x.shape
    k = te.reduce_axis((0, n), name="k")
    max_elem = te.compute((m, ), lambda i: tvm.te.max(x[i, k], axis=k))
    k = te.reduce_axis((0, n), name="k")
    expsum = te.compute(
        (m, ), lambda i: te.sum(te.exp(x[i, k] - max_elem[i]), axis=k))
    return te.compute(x.shape,
                      lambda i, j: x[i, j] - max_elem[i] - te.log(expsum[i]))
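
A minimal usage sketch (not part of the original snippet; it assumes `import numpy as np`, `import tvm`, `from tvm import te`, and a recent TVM where `NDArray.numpy()` exists): build the compute above for the llvm target and compare against a NumPy reference.

A = te.placeholder((4, 8), name="A")
Out = log_softmax(A)
s = te.create_schedule(Out.op)
f = tvm.build(s, [A, Out], "llvm")
a_np = np.random.uniform(size=(4, 8)).astype("float32")
a = tvm.nd.array(a_np)
out = tvm.nd.array(np.zeros((4, 8), dtype="float32"))
f(a, out)
# NumPy reference: x - max - log(sum(exp(x - max)))
shifted = a_np - a_np.max(axis=1, keepdims=True)
ref = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
np.testing.assert_allclose(out.numpy(), ref, rtol=1e-5)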
Example #2
def test_exp():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n, ), name="A")
    B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B")
    s = te.create_schedule(B.op)
    # create iter var and assign them tags.
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))

    # one line to build the function.
    def check_device(device, host="stackvm"):
        if not tvm.testing.device_enabled(host):
            return
        if not tvm.testing.device_enabled(device):
            return
        ctx = tvm.context(device, 0)
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        tvm.testing.assert_allclose(b.asnumpy(),
                                    np.exp(a.asnumpy()),
                                    rtol=1e-5)

    check_device("opencl -device=intel_graphics")
    check_device("cuda", "llvm")
    check_device("vulkan")
Example #3
def test_exp():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n, ), name="A")
    B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B")
    s = te.create_schedule(B.op)
    # create iter var and assign them tags.
    px, x = s[B].split(B.op.axis[0], nparts=1)
    s[B].bind(px, te.thread_axis("pipeline"))

    # one line to build the function.
    def check_device(device, host="llvm"):
        if not tvm.testing.device_enabled(device):
            return
        dev = tvm.device(device, 0)
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
        fexp(a, b)
        tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5)

    check_device("sdaccel")
    if "AWS_PLATFORM" in os.environ:
        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))

    check_device("aocl_sw_emu")
Example #4
def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh):
    """Bounding box regression function"""
    bbox_w = x2 - x1 + 1.0
    bbox_h = y2 - y1 + 1.0
    ctr_x = x1 + 0.5 * (bbox_w - 1.0)
    ctr_y = y1 + 0.5 * (bbox_h - 1.0)

    pred_ctr_x = dx * bbox_w + ctr_x
    pred_ctr_y = dy * bbox_h + ctr_y
    pred_w = te.exp(dw) * bbox_w
    pred_h = te.exp(dh) * bbox_h

    pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0)
    pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0)
    pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0)
    pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0)
    return pred_x1, pred_y1, pred_x2, pred_y2
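
A hedged usage sketch (the tensor names `boxes` and `deltas` are hypothetical, not from the original code): reg_bbox operates on scalar expressions, so it can be called inside a te.compute body and the returned tuple indexed per coordinate.

boxes = te.placeholder((128, 4), name="boxes")    # rows of (x1, y1, x2, y2)
deltas = te.placeholder((128, 4), name="deltas")  # rows of (dx, dy, dw, dh)
pred_x1 = te.compute(
    (128,),
    lambda i: reg_bbox(boxes[i, 0], boxes[i, 1], boxes[i, 2], boxes[i, 3],
                       deltas[i, 0], deltas[i, 1], deltas[i, 2], deltas[i, 3])[0],
    name="pred_x1")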
Example #5
def test_inline2():
    m = te.size_var('m')
    A = te.placeholder((m, ), name='A')
    T = te.compute((m, ), lambda i, : A[i] + 10, name='T')
    stmt = tvm.tir.Evaluate(te.exp(T[10]) + 11 * T[100])
    stmt = tvm.tir.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis],
                                  T.op.body[0])

    def check(op):
        if isinstance(op, tvm.tir.Call):
            assert op.func != T.op

    tvm.tir.ir_pass.PostOrderVisit(stmt, check)
Example #6
def exp(x):
    """Take exponential of input x.

    Parameters
    ----------
    x : tvm.te.Tensor
        Input argument.

    Returns
    -------
    y : tvm.te.Tensor
        The result.
    """
    return te.compute(x.shape, lambda *i: te.exp(x(*i)))
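
A short usage sketch (the placeholder, schedule, and llvm target are assumptions for illustration, not part of the original wrapper):

A = te.placeholder((1024,), name="A")
B = exp(A)                      # element-wise exponential of A
s = te.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm")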
Example #7
def test_exp():
    """Test scheduling and running exponent."""
    # graph
    arr_length = 1024
    arr_length_tvm = tvm.runtime.convert(arr_length)
    placeholder_a = te.placeholder((arr_length_tvm, ), name="A")
    placeholder_b = te.compute(placeholder_a.shape,
                               lambda *i: te.exp(placeholder_a(*i)),
                               name="B")
    schedule = te.create_schedule(placeholder_b.op)
    # create iter var and assign them tags.
    num_thread = 8
    axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0],
                                                 factor=num_thread)
    schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x"))
    schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x"))

    # one line to build the function.
    def check_device(device, host="stackvm"):
        if not tvm.testing.device_enabled(host):
            return
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fexp = tvm.build(schedule, [placeholder_a, placeholder_b],
                         device,
                         host,
                         name="myexp")
        # launch the kernel.
        buff_a = tvm.nd.array(
            np.random.uniform(size=arr_length).astype(placeholder_a.dtype),
            dev)
        buff_b = tvm.nd.array(np.zeros(arr_length, dtype=placeholder_b.dtype),
                              dev)
        fexp(buff_a, buff_b)
        tvm.testing.assert_allclose(buff_b.numpy(),
                                    np.exp(buff_a.numpy()),
                                    rtol=1e-5)

    check_device("opencl -device=intel_graphics")
    check_device("cuda", "llvm")
    check_device("vulkan")
Example #8
def test_exp():
    """Test scheduling and running exp function."""
    # graph
    arr_length = 1024
    arr_length_tvm = tvm.runtime.convert(arr_length)
    placeholder_b = te.placeholder((arr_length_tvm, ), name="A")
    result_b = te.compute(placeholder_b.shape,
                          lambda *i: te.exp(placeholder_b(*i)),
                          name="B")
    schedule = te.create_schedule(result_b.op)
    # create iter var and assign them tags.
    axis1, _ = schedule[result_b].split(result_b.op.axis[0], nparts=1)
    schedule[result_b].bind(axis1, te.thread_axis("pipeline"))

    # one line to build the function.
    def check_device(device, host="llvm"):
        if not tvm.testing.device_enabled(device):
            return
        dev = tvm.device(device, 0)
        fexp = tvm.build(schedule, [placeholder_b, result_b],
                         device,
                         host,
                         name="myexp")
        # launch the kernel.
        buff_a = tvm.nd.array(
            np.random.uniform(size=arr_length).astype(placeholder_b.dtype),
            dev)
        buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev)
        fexp(buff_a, buff_b)
        tvm.testing.assert_allclose(buff_b.numpy(),
                                    np.exp(buff_a.numpy()),
                                    rtol=1e-5)

    check_device("sdaccel")
    if "AWS_PLATFORM" in os.environ:
        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))

    check_device("aocl_sw_emu")
Example #9
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)

    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
    m = te.var("m")
    n = te.var("n")
    val = te.placeholder((m, n), name="val", dtype="float32")
    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
    k = te.reduce_axis((0, n), "k")
    T_idx, T_val = te.compute((m, ),
                              lambda i: argmax((k.var, val2[i, k]), axis=k),
                              name="T")
    s = te.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
Example #10
def _compute_exp(max_elem, *indices):
    non_reduce_indices = get_non_reduce_indices(indices)
    return te.exp(x[indices] - max_elem[non_reduce_indices])
Example #11
def _compute_expsum(max_elem, *indices):
    eval_range = insert_reduce_index(indices, k2)
    return te.sum(te.exp(x[eval_range] - max_elem[indices]), axis=k2)
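
Examples #10 and #11 are fragments of TOPI's generic softmax; the names `x`, `k2`, and `get_non_reduce_indices`/`insert_reduce_index` come from that surrounding code and are not defined here. A self-contained sketch of the same max-subtract/exp/sum pattern, specialised to the last axis of a 2-D tensor:

x = te.placeholder((4, 8), name="x")
k = te.reduce_axis((0, 8), name="k")
max_elem = te.compute((4,), lambda i: te.max(x[i, k], axis=k), name="max_elem")
k2 = te.reduce_axis((0, 8), name="k2")
expsum = te.compute((4,),
                    lambda i: te.sum(te.exp(x[i, k2] - max_elem[i]), axis=k2),
                    name="expsum")
softmax = te.compute((4, 8),
                     lambda i, j: te.exp(x[i, j] - max_elem[i]) / expsum[i],
                     name="softmax")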
Example #12
def func2():
    return te.exp(tvm.tir.truncdiv((x + y + 1) * y, 4))
Example #13
def test_basic_operation():
    np.random.seed(0)
    shape = (10, 10)
    x = te.var("x", dtype='float32')
    k = te.reduce_axis((0, 10), name="k")
    l = te.reduce_axis((0, 10), name="l")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    B = te.compute(shape, lambda i, j: A0[i, j], name='B')
    check_grad(B, [A0])

    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(
        shape,
        lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))),
        name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(0.1, 10))

    B = te.compute(shape,
                   lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(-4, 4))

    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute((10, ),
                   lambda i: te.sum(A0[i, k] * A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]),
                   name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape,
                   lambda i, j: te.sum(
                       A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k),
                   name='B')
    check_grad(B, A0)

    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    B = te.compute((10, 10),
                   lambda i, j: prod(A0[i, k] + A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    X = te.placeholder((10, ), name='X')
    A = te.compute((10, ), lambda i: X[i] + X[9 - i])
    B = te.compute((10, ), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
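
check_grad comes from TVM's autodiff test utilities and is not defined here. A hedged sketch of what it exercises, calling te.gradient directly (assumes a TVM build where tvm.te.gradient is available):

A0 = te.placeholder((10, 10), name="A0")
B = te.compute((10, 10), lambda i, j: te.exp(A0[i, j]), name="B")
[dA0] = te.gradient(B, [A0])   # Jacobian of B w.r.t. A0, shape (10, 10, 10, 10)
s = te.create_schedule(dA0.op)
print(tvm.lower(s, [A0, dA0], simple_mode=True))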
Example #14
######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, that approach only works for the CUDA target with the float type.
# Ideally, we want to write the same code for any device and any data type.
#
# TVM intrinsics provide the user a mechanism to achieve this, and this
# is the recommended way to solve the problem.
# The following code uses te.exp instead, which creates an intrinsic call
# :py:func:`tvm.te.exp` to compute the exponential.
#
n = te.var("n")
A = te.placeholder((n, ), name="A")
B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B")
s = te.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())
######################################################################
# We can see that the same code works for both CUDA and OpenCL.
# The same te.exp can also be used for float64 data types; a sketch follows
# the OpenCL build below.
#
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
print(fopencl.imported_modules[0].get_source())
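
######################################################################
# A hedged sketch (not part of the original tutorial): the same schedule
# pattern with a float64 placeholder, showing that te.exp dispatches on
# the data type of its argument.
#
C = te.placeholder((n,), dtype="float64", name="C")
D = te.compute(C.shape, lambda i: te.exp(C[i]), name="D")
s64 = te.create_schedule(D.op)
bx64, tx64 = s64[D].split(D.op.axis[0], factor=num_thread)
s64[D].bind(bx64, te.thread_axis("blockIdx.x"))
s64[D].bind(tx64, te.thread_axis("threadIdx.x"))
fcuda64 = tvm.build(s64, [C, D], "cuda", name="myexp_f64")
print(fcuda64.imported_modules[0].get_source())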

######################################################################
Example #15
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    A_ = te.placeholder(shape, dtype=dtype, name="A_")
    A = te.placeholder(shape, dtype=dtype, name="A")

    # numerically stable log softmax of the logits A
    k = te.reduce_axis((0, A.shape[1]), name="k")
    A_max = te.compute((A.shape[0],), lambda i: te.max(A[i, k], axis=k))
    A_ex = te.compute(shape, lambda i, j: te.exp(A[i, j] - A_max[i]))
    k1 = te.reduce_axis((0, A.shape[1]), name="k1")
    A_ex_sum = te.compute((A.shape[0],), lambda i: te.sum(A_ex[i, k1], axis=k1))
    A_logsoftmax = te.compute(shape, lambda i, j: te.log(A_ex[i, j] / A_ex_sum[i]))

    # cross entropy against the one-hot targets A_, averaged over the batch
    k2 = te.reduce_axis((0, shape[1]), name="k2")
    A_logsoftmax_sum = te.compute(
        (shape[0],),
        lambda i: te.sum(A_logsoftmax[i, k2] * A_[i, k2], axis=k2))
    k3 = te.reduce_axis((0, shape[0]), name="k3")
    B = te.compute((1,), lambda i: te.sum(-A_logsoftmax_sum[k3], axis=k3))
    B1 = te.compute((1,), lambda i: B[i] / shape[0])

    s = te.create_schedule(B1.op)
    if tgt == "cuda":
        # I don't know why this doesn't work.

        num_thread = 64
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        s[A_ex].bind(A_ex.op.axis[0], block_x)
        s[A_max].bind(A_max.op.axis[0], block_x)

        k_ex_sum = A_ex_sum.op.reduce_axis[0]
        ko, ki = s[A_ex_sum].split(k_ex_sum, factor=num_thread)
        EF = s.rfactor(A_ex_sum, ki)
        s[A_ex_sum].bind(s[A_ex_sum].op.axis[0], block_x)
        s[A_ex_sum].bind(s[A_ex_sum].op.reduce_axis[0], thread_x)
        s[EF].compute_at(s[A_ex_sum], s[A_ex_sum].op.reduce_axis[0])
        s[A_ex_sum].set_store_predicate(thread_x.var.equal(0))

        tx, xi = s[A_logsoftmax].split(A_logsoftmax.op.axis[1], nparts=num_thread)
        s[A_logsoftmax].bind(A_logsoftmax.op.axis[0], block_x)
        s[A_logsoftmax].bind(tx, thread_x)

        k_logsoftmax_sum = A_logsoftmax_sum.op.reduce_axis[0]
        klso, klsi = s[A_logsoftmax_sum].split(k_logsoftmax_sum, factor=num_thread)
        lsEF = s.rfactor(A_logsoftmax_sum, klsi)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.axis[0], block_x)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.reduce_axis[0], thread_x)
        s[lsEF].compute_at(s[A_logsoftmax_sum], s[A_logsoftmax_sum].op.reduce_axis[0])
        s[A_logsoftmax_sum].set_store_predicate(thread_x.var.equal(0))

        k_B = B.op.reduce_axis[0]
        kbo, kbi = s[B].split(k_B, factor=num_thread)
        bEF = s.rfactor(B, kbi)
        s[B].bind(s[B].op.reduce_axis[0], thread_x)
        s[bEF].compute_at(s[B], s[B].op.reduce_axis[0])
        s[B].set_store_predicate(block_x.var.equal(0))

        s[B1].set_store_predicate(block_x.var.equal(0))


        print(tvm.lower(s, [A, A_, B1], simple_mode=True))


    f = tvm.build(s, [A, A_, B1], tgt, tgt_host, name=func_name)
    return f
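
A hedged usage sketch (shapes and the llvm target are chosen arbitrarily; assumes `import numpy as np` and a recent TVM where `NDArray.numpy()` exists): build the function and compare against a NumPy reference.

shape = (4, 5)
f = make_matrix_softmax_cross_entropy(shape, "llvm", "llvm", "softmax_xent")
logits = np.random.uniform(size=shape).astype("float32")
targets = np.zeros(shape, dtype="float32")
targets[np.arange(shape[0]), np.random.randint(0, shape[1], shape[0])] = 1.0  # one-hot rows
out = tvm.nd.array(np.zeros((1,), dtype="float32"))
f(tvm.nd.array(logits), tvm.nd.array(targets), out)
# NumPy reference: mean over the batch of -sum(one_hot * log_softmax(logits))
shifted = logits - logits.max(axis=1, keepdims=True)
log_softmax_np = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
ref = np.mean(-np.sum(targets * log_softmax_np, axis=1))
np.testing.assert_allclose(out.numpy(), ref, rtol=1e-5)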