Example #1
import numpy as np
import tvm


def test_llvm_persist_parallel():
    n = 128
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=8)
    xo1, xo2 = s[C].split(xo, nparts=1)
    s[B].compute_at(s[C], xo1)
    s[B].parallel(s[B].op.axis[0])
    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
    s[C].parallel(xi)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xi, "parallel_stride_pattern")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                    rtol=1e-5)

    check_llvm()
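To see what these pragmas actually do, one option (a sketch that reuses s, A and C from the schedule above; it is not part of the original test) is to print the lowered IR before building:

# Sketch: inspect the lowered statement to see the single parallel launch
# point, the persistent parallel loops, and the barrier emitted after B.
print(tvm.lower(s, [A, C], simple_mode=True))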
Example #2
import tvm


def sqrt(x):
    """Take square root of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    return tvm.compute(x.shape, lambda *i: tvm.sqrt(x(*i)))
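A quick way to exercise this op end to end (a minimal sketch assuming the same legacy tvm API; the size n and the LLVM target are illustrative choices, not part of the original file):

import numpy as np
import tvm

n = 16
x = tvm.placeholder((n,), name='x')
y = sqrt(x)
s = tvm.create_schedule(y.op)
f = tvm.build(s, [x, y], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=n).astype(x.dtype), ctx)
b = tvm.nd.array(np.zeros(n, dtype=y.dtype), ctx)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), np.sqrt(a.asnumpy()), rtol=1e-5)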
Example #3
import tvm


def sort_ir(data, index, output, axis, is_descend):
    """Low-level IR that sorts on the GPU; same usage as tvm.contrib.sort.argsort on the CPU.

    Parameters
    ----------
    data : Buffer
        2D buffer of input box scores with shape [batch_size, num_anchors].

    index : Buffer
        Buffer holding the number of valid boxes.

    output : Buffer
        Output buffer of indices into the sorted tensor.

    axis : int
        The axis used for sorting.

    is_descend : bool
        Whether to sort in descending order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """

    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_index = ib.buffer_ptr(index)
    p_out = ib.buffer_ptr(output)
    ndim = len(data.shape)
    assert data.dtype == "float32", "Currently only supports input dtype to be float32"
    assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim

    axis_mul_before = 1
    axis_mul_after = 1
    if axis < 0:
        axis = ndim + axis
    for i in range(0, ndim):
        if i < axis:
            axis_mul_before *= data.shape[i]
        elif i > axis:
            axis_mul_after *= data.shape[i]

    dshape = 0
    for i in range(0, len(index.shape)):
        dshape += index.shape[i]
    dshape = tvm.select(dshape > axis_mul_before * axis_mul_after, dshape,
                        axis_mul_before * axis_mul_after)

    sizes_temp = ib.allocate("int32",
                             dshape,
                             name="sizes_temp",
                             scope="global")
    sizes = ib.allocate("int32", dshape, name="sizes", scope="global")
    temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local")
    temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local")
    data_new = ib.allocate("float32", dshape, name="data_new", scope="global")
    index_new = ib.allocate("int32", dshape, name="index_new", scope="global")
    nthread_tx = max_threads
    nthread_bx = dshape // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    # Copy the per-segment valid counts into the working buffers.
    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
        sizes[tid] = p_index[tid]
        sizes_temp[tid] = p_index[tid]

    # Inclusive prefix sum of the valid counts, giving each segment's start offset.
    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
        with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \
             .astype("float32"))) + 1, name="k") as k:
            with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0):
                with ib.if_scope(k % 2 == 0):
                    sizes[tid] += sizes_temp[tid -
                                             (tvm.const(1, "int32") << k)]
                    sizes_temp[tid] = sizes[tid]
                with ib.else_scope():
                    sizes_temp[tid] += sizes[tid -
                                             (tvm.const(1, "int32") << k)]
                    sizes[tid] = sizes_temp[tid]

    # Gather each segment's scores and running indices into contiguous scratch buffers.
    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
        i = tid / axis_mul_after
        j = tid % axis_mul_after
        current_sort_num = p_index[tid]
        base_idx = i * data.shape[axis] * axis_mul_after + j
        with ib.for_range(0, current_sort_num, name="k") as k:
            full_idx = base_idx + k * axis_mul_after
            with ib.if_scope(tid == 0):
                start = 0
            with ib.else_scope():
                start = sizes[tid - 1]
            index_new[start + k] = k
            data_new[start + k] = p_data[full_idx]

    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
        with ib.if_scope(tid == 0):
            start = 0
        with ib.else_scope():
            start = sizes[tid - 1]
        # OddEvenTransposeSort
        with ib.for_range(0, p_index[tid], name="k") as k:
            with ib.for_range(0, p_index[tid] - 1, name="i") as i:
                with ib.if_scope(i % 2 == (k & 1)):
                    with ib.if_scope(
                        ((data_new[i + start] < data_new[i + start + 1])
                         ^ is_descend) == False):
                        temp_data[tid] = data_new[i + start]
                        data_new[i + start] = data_new[i + start + 1]
                        data_new[i + start + 1] = temp_data[tid]
                        temp_index[tid] = index_new[i + start]
                        index_new[i + start] = index_new[i + start + 1]
                        index_new[i + start + 1] = temp_index[tid]

    # Write the sorted indices back; positions past the valid count keep their original index.
    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
        i = tid / axis_mul_after
        j = tid % axis_mul_after
        current_sort_num = p_index[tid]
        base_idx = i * data.shape[axis] * axis_mul_after + j
        with ib.for_range(0, data.shape[axis], name="k") as k:
            with ib.if_scope(tid == 0):
                start = 0
            with ib.else_scope():
                start = sizes[tid - 1]
            p_out[base_idx + k * axis_mul_after] = tvm.select(
                k < current_sort_num, index_new[k + start], k)
    body = ib.get()
    return body
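In topi, an IR routine like this is usually wrapped into an operator through tvm.extern, which supplies the input and output Buffers the builder expects. A rough sketch of that wiring (the tensor names scores and valid_count and the op name "sort_gpu" are illustrative, not the exact topi code):

# Sketch: wrap sort_ir in an extern op so the sort can be scheduled and built.
# `scores` and `valid_count` are assumed to be existing tvm.Tensor objects of
# shape [batch_size, num_anchors] and [batch_size] respectively.
sorted_index = tvm.extern(
    scores.shape,
    [scores, valid_count],
    lambda ins, outs: sort_ir(ins[0], ins[1], outs[0], axis=1, is_descend=True),
    dtype="int32",
    name="sort_gpu")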
Example #4
def BatchNorm(device="llvm",
              lib_path="./",
              ndim=None,
              dtype=None,
              optype=False,
              axis=None):
    '''
    batchnorm
    Args:
        device:
        lib_path:
        ndim:
        dtype:
        optype:
        axis:

    Returns:
    '''
    if axis >= ndim:
        return
    shape = [tvm.var("n" + str(i)) for i in range(ndim)]
    channel = shape[axis]
    eps = tvm.var("epsilon", dtype="float32")
    opname = optype + ("_ndim%d_%s_axis%d" % (ndim, dtype, axis))
    print(opname)

    # define compute
    in_tensor = tvm.placeholder(shape, dtype=dtype, name='in_tensor')
    mean = tvm.placeholder((channel, ), dtype=dtype, name='mean')
    variance = tvm.placeholder((channel, ), dtype=dtype, name='var')
    scale = tvm.placeholder((channel, ), dtype=dtype, name='scale')
    offset = tvm.placeholder((channel, ), dtype=dtype, name='offset')

    variance_sqrt = tvm.compute(
        (channel, ), lambda i: tvm.sqrt(variance[i] + eps.astype(dtype)))
    if optype == "TFBatchNorm":
        out_tensor = tvm.compute(shape, lambda *idx: ((in_tensor[idx] - mean[idx[axis]]) / variance_sqrt[idx[axis]]) *\
                                                     scale[idx[axis]] + offset[idx[axis]])
        tensor_list = [
            eps, in_tensor, scale, offset, mean, variance, out_tensor
        ]
    elif optype == "CaffeBatchNorm":
        out_tensor = tvm.compute(
            shape, lambda *idx:
            (in_tensor[idx] - mean[idx[axis]]) / variance_sqrt[idx[axis]])
        tensor_list = [eps, in_tensor, mean, variance, out_tensor]
    elif optype == "CaffeScale":
        out_tensor = tvm.compute(
            shape,
            lambda *idx: in_tensor[idx] * scale[idx[axis]] + offset[idx[axis]])
        tensor_list = [in_tensor, scale, offset, out_tensor]
    elif optype == "TFBiasAdd":
        out_tensor = tvm.compute(
            shape, lambda *idx: in_tensor[idx] + offset[idx[axis]])
        tensor_list = [in_tensor, offset, out_tensor]
    else:
        raise RuntimeError("no support for {}".format(optype))

    # define schedule & generate lib
    s = tvm.create_schedule(out_tensor.op)
    Genlib(s, tensor_list, device, opname, lib_path)
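For reference, the "TFBatchNorm" branch computes the standard inference-time normalization (x - mean) / sqrt(var + eps) * scale + offset along the channel axis. A small NumPy sketch of the same arithmetic (the function and argument names are illustrative):

import numpy as np

def batchnorm_reference(x, mean, var, scale, offset, axis, eps):
    # Reshape the per-channel statistics so they broadcast along `axis`.
    bshape = [1] * x.ndim
    bshape[axis] = -1
    return ((x - mean.reshape(bshape)) / np.sqrt(var.reshape(bshape) + eps)
            * scale.reshape(bshape) + offset.reshape(bshape))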