Example No. 1
def test_tensor_comm_reducer():
    m = te.size_var("m")
    n = te.size_var("n")
    A = te.placeholder((m, n), name="A")
    k = te.reduce_axis((0, n), "k")
    mysum = te.comm_reducer(lambda x, y: x + y, lambda t: tvm.tir.const(0, dtype=t))
    C = te.compute((m,), lambda i: mysum(A[i, k], axis=k))
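For reference, the hand-rolled mysum above behaves like the built-in sum reduction; a minimal equivalent line that could be appended inside the same function, assuming the placeholders and reduce axis defined above (the name C_builtin is purely illustrative):

    # equivalent built-in form of the mysum reduction above
    C_builtin = te.compute((m,), lambda i: te.sum(A[i, k], axis=k), name="C_builtin")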
Example No. 2
def test_rfactor_argmax():
    def fcombine(x, y):
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

    nn = 1027
    mm = 10
    n = tvm.runtime.convert(nn)
    m = tvm.runtime.convert(mm)
    A0 = te.placeholder((m, n), name="A0", dtype="int32")
    A1 = te.placeholder((m, n), name="A1", dtype="float32")
    k = te.reduce_axis((0, n))
    B0, B1 = te.compute((m, ),
                        lambda i: argmax((A0[i, k], A1[i, k]), axis=k),
                        name="B")

    # schedule
    s = te.create_schedule(B0.op)
    nthread = 16
    ko, kf = s[B0].split(k, factor=nthread)
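    # rfactor over kf: BF0/BF1 hold one partial argmax per value of kf, and B0 is
    # rewritten to reduce those partials, a reduction bound to threadIdx.x below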
    BF0, BF1 = s.rfactor(B0, kf)
    bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread)
    s[B0].bind(bx, te.thread_axis("blockIdx.x"))
    s[B0].bind(ty, te.thread_axis("threadIdx.y"))
    tx = s[B0].op.reduce_axis[0]
    thread_x = te.thread_axis("threadIdx.x")
    s[B0].bind(tx, thread_x)
    s[BF0.op].compute_at(s[B0], tx)
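    # only the lane with threadIdx.x == 0 stores the final (B0, B1) result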
    s[B0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(s, args=[A0, A1, B0, B1])
        fargmax = tvm.build(fapi, target=device, name="argmax")

        np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn),
                           mm,
                           axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target("cuda")
    check_target("vulkan")
    check_target("rocm")
Example No. 3
def test_warp_reduction2():
    def fcombine(x, y):
        return x[0] + y[0], x[1] * y[1]

    def fidentity(t0, t1):
        return tvm.tir.const(0, t0), tvm.tir.const(1, t1)

    add_mul_reducer = te.comm_reducer(fcombine,
                                      fidentity,
                                      name="add_mul_reducer")

    # compute
    m = 16
    n = 256
    A0 = te.placeholder((m, n), name="A0", dtype="float32")
    A1 = te.placeholder((m, n), name="A1", dtype="float32")
    k = te.reduce_axis((0, n), "k")
    T0, T1 = te.compute((m, ),
                        lambda i: add_mul_reducer(
                            (A0[i, k], A1[i, k]), axis=k),
                        name="T")

    nthdx, nthdy = 32, 2
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, nthdx), "threadIdx.x")
    thread_y = te.thread_axis((0, nthdy), "threadIdx.y")

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # schedule
        s = te.create_schedule(T0.op)
        ko, _ = s[T0].split(k, nparts=nthdx)
        xo, xi = s[T0].split(s[T0].op.axis[0], factor=nthdy)
        s[T0].bind(ko, thread_x)
        s[T0].bind(xi, thread_y)
        s[T0].bind(xo, block_x)

        # validation
        dev = tvm.device(device, 0)
        a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype)
        a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype)
        t0_np = np.zeros((m, ), dtype=A0.dtype)
        t1_np = np.zeros((m, ), dtype=A1.dtype)
        a0 = tvm.nd.array(a0_np, dev)
        a1 = tvm.nd.array(a1_np, dev)
        t0 = tvm.nd.array(t0_np, dev)
        t1 = tvm.nd.array(t1_np, dev)
        func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction")
        func(a0, a1, t0, t1)
        t0_np = np.sum(a0_np, axis=1)
        t1_np = np.prod(a1_np, axis=1)
        tvm.testing.assert_allclose(t0.numpy(), t0_np, rtol=1e-3, atol=1e-3)
        tvm.testing.assert_allclose(t1.numpy(), t1_np, rtol=1e-3, atol=1e-3)

    check_target("cuda")
    check_target("rocm")
Example No. 4
def common_reduce(name, args=(0,)):
  if not isinstance(args, tuple) and not isinstance(args, list):
    args = (args, )
  def reduce_op(x, y):
    assert x.dtype == y.dtype, "Reducing elements that don't have the same data type: %s vs. %s" % (x.dtype, y.dtype)
    return tir.call_pure_extern(x.dtype, name, x, y, *args[1:])
  return te.comm_reducer(reduce_op, lambda t: tir.const(args[0], dtype=t), name=name)
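A hedged usage sketch (not part of the original snippet): each combine step is forwarded to an extern function looked up by name at codegen time, and args[0] supplies the identity value. The extern name "fmaxf" and the shapes below are assumptions for illustration only, and the imports (from tvm import te, tir) are taken from the snippet above:

# hypothetical usage of common_reducer; "fmaxf" must exist for the chosen target
my_max = common_reducer("fmaxf", args=(-3.4e38,))  # identity: roughly the float32 minimum
A = te.placeholder((128, 64), name="A", dtype="float32")
k = te.reduce_axis((0, 64), "k")
B = te.compute((128,), lambda i: my_max(A[i, k], axis=k), name="B")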
Example No. 5
def f(n):
    rv = te.reduce_axis((0, n))
    init = lambda dtype: tvm.tir.Select(n > 1, tvm.tir.const(0, dtype),
                                        n.astype(dtype))
    sum = te.comm_reducer(
        lambda x, y: tvm.te.max(x + y, n.astype("float32")),
        init,
        name="sum")
    return sum(X[rv], axis=rv)
Example No. 6
def test_tensor_reduce_multiout_with_cond():
    def fcombine(x, y):
        return x[0] + y[0], x[1] + y[1]

    def fidentity(t0, t1):
        return tvm.tir.const(0, t0), tvm.tir.const(1, t1)

    mysum = te.comm_reducer(fcombine, fidentity, name="mysum")

    m = te.var("m")
    n = te.var("n")
    idx = te.placeholder((m, n), name="idx", dtype="int32")
    val = te.placeholder((m, n), name="val", dtype="int32")
    k = te.reduce_axis((0, n), "k")
    cond = te.floormod(k, 2) == 0
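    # where=cond limits the reduction to even values of k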
    T0, T1 = te.compute((m,), lambda i: mysum((idx[i, k], val[i, k]), axis=k, where=cond), name="T")
Example No. 7
def test_argmax():
    def fcombine(x, y):
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name='argmax')
    m = te.size_var('m')
    n = te.size_var('n')
    idx = te.placeholder((m, n), name='idx', dtype='int32')
    val = te.placeholder((m, n), name='val', dtype='float32')
    k = te.reduce_axis((0, n), 'k')
    T0, T1 = te.compute((m, ),
                        lambda i: argmax((idx[i, k], val[i, k]), axis=k),
                        name='T')
    s = te.create_schedule(T0.op)

    def check_target():
        device = 'cpu'
        if not tvm.runtime.enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        ctx = tvm.context(device, 0)
        fapi = tvm.lower(s, args=[idx, val, T0, T1])
        fargmax = tvm.build(fapi, target='llvm', name="argmax")

        mm = 12
        nn = 16
        np_idx = np.repeat(np.arange(nn, dtype='int32').reshape(1, nn),
                           mm,
                           axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype('float32')
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, ctx)
        nd_val = tvm.nd.array(np_val, ctx)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())

    check_target()
Example No. 8
def test_argmax():
    def fcombine(x, y):
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    m = te.size_var("m")
    n = te.size_var("n")
    idx = te.placeholder((m, n), name="idx", dtype="int32")
    val = te.placeholder((m, n), name="val", dtype="float32")
    k = te.reduce_axis((0, n), "k")
    T0, T1 = te.compute((m, ),
                        lambda i: argmax((idx[i, k], val[i, k]), axis=k),
                        name="T")
    s = te.create_schedule(T0.op)

    def check_target():
        device = "cpu"
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)
        fapi = tvm.lower(s, args=[idx, val, T0, T1])
        fargmax = tvm.build(fapi, target="llvm", name="argmax")

        mm = 12
        nn = 16
        np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn),
                           mm,
                           axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target()
Example No. 9
def test_argmax():
    """Test argmax."""

    def fcombine(tensor_x, tensor_y):
        lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0])
        rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1])
        return lhs, rhs

    def fidentity(tensor1, tensor2):
        return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
    size_var_m = te.size_var("m")
    size_var_n = te.size_var("n")
    idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32")
    val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32")
    axis_k = te.reduce_axis((0, size_var_n), "k")
    result_t0, result_t1 = te.compute(
        (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T"
    )
    schedule = te.create_schedule(result_t0.op)

    def check_target():
        device = "cpu"
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        dev = tvm.device(device, 0)
        fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1])
        fargmax = tvm.build(fapi, target="llvm", name="argmax")

        height = 12
        width = 16
        np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0)
        np_val = np.random.uniform(size=(height, width)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target()
Example No. 10
    def _pool(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype("int32")
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = (
            roi[1], roi[2], roi[3], roi[4])

        roi_start_h = te.round(roi_start_h * spatial_scale).astype("int32")
        roi_start_w = te.round(roi_start_w * spatial_scale).astype("int32")
        roi_end_h = te.round(roi_end_h * spatial_scale).astype("int32")
        roi_end_w = te.round(roi_end_w * spatial_scale).astype("int32")

        # force malformed ROIs to be 1x1
        roi_h = tvm.te.max(roi_end_h - roi_start_h + 1,
                           tvm.tir.const(1, "int32"))
        roi_w = tvm.te.max(roi_end_w - roi_start_w + 1,
                           tvm.tir.const(1, "int32"))

        bin_h = roi_h.astype(dtype) / pooled_size_h
        bin_w = roi_w.astype(dtype) / pooled_size_w

        # use epsilon to prevent floating point precision loss in floor/ceil
        epsilon = tvm.tir.const(0.00001, dtype)
        hstart = te.floor(ph * bin_h + epsilon).astype("int32")
        wstart = te.floor(pw * bin_w + epsilon).astype("int32")
        hend = te.ceil((ph + 1) * bin_h - epsilon).astype("int32")
        wend = te.ceil((pw + 1) * bin_w - epsilon).astype("int32")
        hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height)
        wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width)
        hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height)
        wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width)

        non_empty = tvm.tir.all(hstart < hend, wstart < wend)
        min_value = lambda dtype: tvm.tir.if_then_else(
            non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype))
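        # the identity depends on non_empty: empty bins reduce to 0, while
        # non-empty bins start from the dtype minimum so max works as usual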
        # pylint: disable=unnecessary-lambda
        _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y),
                               min_value,
                               name="max")
        rh = te.reduce_axis((0, hend - hstart), "rh")
        rw = te.reduce_axis((0, wend - wstart), "rw")
        return _max(data[batch_index, c, hstart + rh, wstart + rw],
                    axis=[rh, rw])
Example No. 11
def te_argmax_val_idx():
    def f_combine(x, y):
        lhs = tvm.tir.Select((x[0] >= y[0]), x[0], y[0])
        rhs = tvm.tir.Select((x[0] >= y[0]), x[1], y[1])
        return lhs, rhs

    def f_identity(dtype0: tvm.DataType, dtype1: tvm.DataType):
        return tvm.te.min_value(dtype0), tvm.tir.const(-1, dtype1)

    argmax = te.comm_reducer(f_combine, f_identity, name="argmax")

    m = te.var("m")
    n = te.var("n")
    val = te.placeholder((m, n), name="val", dtype="float32")
    idx = te.placeholder((m, n), name="idx", dtype="int32")
    k = te.reduce_axis((0, n), "k")
    max_val, max_idx = te.compute(
        (m,), lambda i: argmax((val[i, k], idx[i, k]), axis=k), name="argmax"
    )
    return [val, idx, max_val, max_idx]
Example No. 12
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)

    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
    m = te.var("m")
    n = te.var("n")
    val = te.placeholder((m, n), name="val", dtype="float32")
    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
    k = te.reduce_axis((0, n), "k")
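    # the reduction index itself (k.var) serves as the argmax index value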
    T_idx, T_val = te.compute((m, ),
                              lambda i: argmax((k.var, val2[i, k]), axis=k),
                              name="T")
    s = te.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
Example No. 13
def test_tensor_comm_reducer_overload():
    m = te.size_var("m")
    n = te.size_var("n")
    mysum = te.comm_reducer(lambda x, y: x + y,
                            lambda t: tvm.tir.const(0, dtype=t))
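    # called on two scalar expressions with no reduce axis, the reducer
    # degenerates to its combine function, so sum_res is simply m + n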
    sum_res = mysum(m, n)
Example No. 14
def test_warp_reduction2():
    """Test warp reductions."""

    def fcombine(tensor1, tensor2):
        return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1]

    def fidentity(tensor1, tensor2):
        return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2)

    add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer")

    # compute
    num_m = 16
    num_n = 256
    placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32")
    placeholder_a1 = te.placeholder((num_m, num_n), name="A1", dtype="float32")
    axis_k = te.reduce_axis((0, num_n), "k")
    result0, result1 = te.compute(
        (num_m,),
        lambda i: add_mul_reducer(
            (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k
        ),
        name="T",
    )

    nthdx, nthdy = 32, 2
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, nthdx), "threadIdx.x")
    thread_y = te.thread_axis((0, nthdy), "threadIdx.y")

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # schedule
        schedule = te.create_schedule(result0.op)
        axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx)
        axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy)
        schedule[result0].bind(axis_ko, thread_x)
        schedule[result0].bind(axis_xi, thread_y)
        schedule[result0].bind(axis_xo, block_x)

        # validation
        dev = tvm.device(device, 0)
        a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype)
        a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype)
        t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype)
        t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype)
        buff_a0 = tvm.nd.array(a0_np, dev)
        buff_a1 = tvm.nd.array(a1_np, dev)
        buff_t0 = tvm.nd.array(t0_np, dev)
        buff_t1 = tvm.nd.array(t1_np, dev)
        func = tvm.build(
            schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction"
        )
        func(buff_a0, buff_a1, buff_t0, buff_t1)
        t0_np = np.sum(a0_np, axis=1)
        t1_np = np.prod(a1_np, axis=1)
        tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3)
        tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3)

    check_target("cuda")
    check_target("rocm")
Example No. 15
def test_basic_operation():
    np.random.seed(0)
    shape = (10, 10)
    x = te.var("x", dtype='float32')
    k = te.reduce_axis((0, 10), name="k")
    l = te.reduce_axis((0, 10), name="l")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    B = te.compute(shape, lambda i, j: A0[i, j], name='B')
    check_grad(B, [A0])

    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(
        shape,
        lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))),
        name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(0.1, 10))

    B = te.compute(shape,
                   lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(-4, 4))

    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute((10, ),
                   lambda i: te.sum(A0[i, k] * A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]),
                   name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape,
                   lambda i, j: te.sum(
                       A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k),
                   name='B')
    check_grad(B, A0)

    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    B = te.compute((10, 10),
                   lambda i, j: prod(A0[i, k] + A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    X = te.placeholder((10, ), name='X')
    A = te.compute((10, ), lambda i: X[i] + X[9 - i])
    B = te.compute((10, ), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
Example No. 16
B0, B1 = te.compute((m, n), lambda i, j: (A0[i,j]+2, A1[i,j]*3), name='B')
s = te.create_schedule(B0.op)
print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))

# x and y are the operands of the reduction; each of them is a tuple of
# (index, value).
def fcombine(x, y):
    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
    return lhs, rhs


# our identity element also needs to be a tuple, so `fidentity` accepts
# two dtypes as inputs.
def fidentity(t0, t1):
    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)


argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

# describe the reduction computation
m = te.var("m")
n = te.var("n")
idx = te.placeholder((m, n), name="idx", dtype="int32")
val = te.placeholder((m, n), name="val", dtype="int32")
k = te.reduce_axis((0, n), "k")
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")

# the generated IR code would be:
s = te.create_schedule(T0.op)
print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))
Example No. 17
print(tvm.lower(s, [Input, Filter, Output], simple_mode=True))

######################################################################
# .. _general-reduction:
#
# Define General Commutative Reduction Operation
# ----------------------------------------------
# Besides the built-in reduction operations such as :any:`te.sum`,
# :any:`tvm.te.min` and :any:`tvm.te.max`, you can also define your own
# commutative reduction operation with :any:`te.comm_reducer`.
#

n = te.var("n")
m = te.var("m")
product = te.comm_reducer(lambda x, y: x * y,
                          lambda t: tvm.tir.const(1, dtype=t),
                          name="product")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), name="k")
B = te.compute((n, ), lambda i: product(A[i, k], axis=k), name="B")

######################################################################
# .. note::
#
#   Sometimes we would like to perform a reduction that involves multiple
#   values, such as :code:`argmax`, which can be done with tuple inputs.
#   See :ref:`reduction-with-tuple-inputs` for more detail; a brief sketch
#   follows below.

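# A minimal sketch of such a tuple-input reducer, added here for
# illustration only (the name "argmax_sketch" is not part of the tutorial):
argmax_sketch = te.comm_reducer(
    lambda x, y: (tvm.tir.Select(x[1] >= y[1], x[0], y[0]),
                  tvm.tir.Select(x[1] >= y[1], x[1], y[1])),
    lambda t0, t1: (tvm.tir.const(-1, t0), tvm.te.min_value(t1)),
    name="argmax_sketch",
)
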
######################################################################
# Summary
# -------
Example No. 18
def test_rfactor_argmax():
    """Test rfactor argmax"""

    def fcombine(tensor0, tensor1):
        lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0])
        rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1])
        return lhs, rhs

    def fidentity(tensor0, tensor1):
        return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1)

    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

    num_width = 1027
    num_height = 10
    width = tvm.runtime.convert(num_width)
    height = tvm.runtime.convert(num_height)
    placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32")
    placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32")
    axis_k = te.reduce_axis((0, width))
    result_b0, result_b1 = te.compute(
        (height,),
        lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k),
        name="B",
    )

    # schedule
    schedule = te.create_schedule(result_b0.op)
    nthread = 16
    _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread)
    rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf)
    axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread)
    schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x"))
    schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y"))
    axis_tx = schedule[result_b0].op.reduce_axis[0]
    thread_x = te.thread_axis("threadIdx.x")
    schedule[result_b0].bind(axis_tx, thread_x)
    schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx)
    schedule[result_b0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1])
        fargmax = tvm.build(fapi, target=device, name="argmax")

        np_idx = np.repeat(
            np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0
        )
        np_val = np.random.uniform(size=(num_height, num_width)).astype("float32")
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, dev)
        nd_val = tvm.nd.array(np_val, dev)
        nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev)
        nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.numpy())

    check_target("cuda")
    check_target("vulkan")
    check_target("rocm")
Example No. 19
def measure_bandwidth_sum(
    total_item,
    item_per_thread,
    stride,
    base_type,
    bits,
    lanes,
    target,
    target_host,
    remote,
    dev,
    n_times,
):
    """measure memory bandwidth of gpu by product reduction for a given type

    The IR for measurement is

    for each thread
        for i in 1..item_per_thread:
            y[global_id] = y[global_id] * x[base + i * stride]

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of elements each thread accumulates
    stride: int
        stride in memory access
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    remote: tvm.rpc.RPCSession
        remote rpc session
    dev: Device
        the device of the array
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GBPS: float
        gigabytes per second
    """
    target, target_host = Target.check_and_update_host_consist(target, target_host)

    n, m = total_item, item_per_thread
    n //= lanes

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    k = te.reduce_axis((0, m), name="k")

    x = te.placeholder((n,), dtype=dtype, name="x")
    op = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="sum")
    y = te.compute(
        (n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)
    )
    s = te.create_schedule(y.op)

    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
    s[y].bind(yo, te.thread_axis("blockIdx.x"))
    s[y].bind(yi, te.thread_axis("threadIdx.x"))
    s[y].unroll(k)

    try:
        func = tvm.build(s, [x, y], target)

        x = tvm.nd.empty((n,), dtype=dtype, device=dev)
        y = tvm.nd.empty((n // m,), dtype=dtype, device=dev)

        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
        time = time_f(x, y).mean
    except tvm._ffi.base.TVMError:
        # build error (occur when device does not support half)
        return -1

    return 1.0 * (total_item * bits / 8) / 1e9 / time
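As a quick sanity check of the returned figure, with purely hypothetical numbers rather than measured data:

total_item, bits, time = 2 ** 26, 32, 2e-3  # 2**26 float32 elements, a 2 ms kernel
gbps = 1.0 * (total_item * bits / 8) / 1e9 / time
print(gbps)  # about 134.2 GB/s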