Example #1
def verify_log_softmax(m, n):
    A = tvm.placeholder((m, n), name='A')
    B = topi.nn.log_softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)
    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.log_softmax_python(a_np)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_softmax(B)
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        foo = tvm.build(s, [A, B], device, name="log_softmax")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ["opengl"]:
        check_device(device)
Example #2
def verify_softmax(m, n):
    A = tvm.placeholder((m, n), name='A')
    B = topi.cpp.nn.softmax(A, 1)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.softmax_python(a_np)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        target = topi.cpp.TEST_create_target(device)
        if device == "llvm":
            s = topi.cpp.generic.default_schedule(target, [B], False)
        else:
            s = topi.cpp.cuda.schedule_softmax(target, [B])
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        foo = tvm.build(s, [A, B], device, name="softmax")
        foo(a, b)
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ['cuda', 'opencl', 'metal', 'rocm']:
        check_device(device)
Example #3
def verify_softmax(m, n, dtype="float32"):
    A = tvm.placeholder((m, n), dtype=dtype, name='A')
    B = topi.nn.softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.softmax_python(a_np)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_softmax(B)

        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        foo = tvm.build(s, [A, B], device, name="softmax")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
        check_device(device)
Example #4
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # Wrap the state updates into a scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Example #5
def test_loop_dependent_allocate():
    N = tvm.var("N")
    A = tvm.placeholder((2*N,), "float32", "A")
    C = tvm.compute((N, ), lambda i: A[2*i] + A[i+1], name='C')
    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "local", [C])
    s[AA].compute_at(s[C], s[C].op.axis[0])
    # this line should fail because IRUseDefAnalysis sees an allocate statement
    # referencing an undefined variable
    tvm.lower(s, [A,C])
Example #6
def verify_log_softmax(m, n, dtype="float32"):
    A = tvm.placeholder((m, n), dtype=dtype, name='A')
    B = topi.nn.log_softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)
    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.log_softmax_python(a_np)

    for device in get_all_backend():
        check_device(A, B, a_np, b_np, device, "log_softmax")
Example #7
def verify_softmax(m, n, dtype="float32"):
    A = tvm.placeholder((m, n), dtype=dtype, name='A')
    B = topi.nn.softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.softmax_python(a_np)

    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
        check_device(A, B, a_np, b_np, device, "softmax")
Example #8
def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
    """
    Runs the inference and checks the functional correctness between
    compute and schedule outputs
    """
    (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
                                                    out_filter, k_h, k_w, hpad, wpad,
                                                    hstride, wstride, out_dtype)

    # Create TVM placeholders
    data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
    kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == 'float32':
        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))

    # c_orig will be used for the declaration output
    # c_sch will be used for the scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)


    with tvm.target.create(TARGET_NAME):
        conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride,
                                    padding=hpad, layout='NCHWc',
                                    out_layout='NCHWc', out_dtype=out_dtype)
        out = topi.nn.relu(conv)
        sch = tvm.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == 'uint8':
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
Example #9
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n+1) // 2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
        
    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Example #10
    def check_device(device, target_device):
        if not tvm.module.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4,)

        # Create module for add whose target is the device.
        tensor_a = tvm.placeholder(shape, name="A")
        tensor_b = tvm.placeholder(shape, name="B")
        elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i)
                                   + tensor_b(*i), name="elemwise_add")
        target = topi.cpp.TEST_create_target(device)
        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
                              name="elemwise_add")

        # Insert copy. Neither compute nor schedule is required for the copy
        # node. The compute will be performed at runtime which is just data
        # copy from the input to the output.
        tensor_copy = tvm.placeholder(shape, name="__copy")

        # Create module for sub whose target is the host.
        tensor_c = tvm.placeholder(shape, name="C")
        elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i)
                                   - tensor_c(*i), name="elemwise_sub")
        schedule_sub = tvm.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c,
                                             elemwise_sub],
                              name="elemwise_sub")

        target_flist = {target_device: [lower_add], target_host: [lower_sub]}
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        mod = graph_runtime.create(graph, mhost, ctx)
        params = {}
        params["A"] = tensor_a = np.random.uniform(
            size=shape).astype(tensor_a.dtype)
        params["B"] = tensor_b = np.random.uniform(
            size=shape).astype(tensor_b.dtype)
        params["C"] = tensor_c = np.random.uniform(
            size=shape).astype(tensor_c.dtype)
        mod.set_input(**params)
        mod.run()
        out = mod.get_output(0, tvm.nd.empty(shape))
        np.testing.assert_equal(
            out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
Example #11
def _lower(sch, inputs, func_name, graph):
    import traceback
    # pylint: disable=broad-except
    try:
        f = tvm.lower(sch, inputs, name=func_name)
        logging.debug("lower function %s", func_name)
        logging.debug("%s", tvm.lower(sch, inputs, simple_mode=True))
    except Exception:
        msg = traceback.format_exc()
        msg += "Error during compile graph\n"
        msg += "--------------------------\n"
        msg += graph.ir(join_entry_attrs=["shape"])
        raise RuntimeError(msg)
    return f if isinstance(
        f, (tvm.container.Array, tuple, list)) else [f]
Example #12
def lower(*args, **kwargs):
    """Thin wrapper of tvm.lower

    This wrapper automatically applies VTA's build_config
    if there is no user specified build_config in context.

    See Also
    --------
    tvm.lower : The original TVM's lower function
    """
    cfg = tvm.build_module.current_build_config()
    if not cfg.add_lower_pass:
        with build_config():
            return tvm.lower(*args, **kwargs)
    return tvm.lower(*args, **kwargs)
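
# A minimal usage sketch for this wrapper, assuming it is exported as
# ``vta.lower`` (as it is in the VTA package); it drops in wherever
# tvm.lower would be called:
import tvm
import vta

A = tvm.placeholder((16,), name='A')
B = tvm.compute((16,), lambda i: A[i] + 1, name='B')
s = tvm.create_schedule(B.op)
# Same signature as tvm.lower; VTA's build_config is applied automatically
# unless the caller has installed a build_config of its own.
print(vta.lower(s, [A, B], simple_mode=True))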
Example #13
def test_local_gemm():
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A', dtype='int32')
    B = tvm.placeholder((m, l), name='B', dtype='int32')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')

    s = tvm.create_schedule(C.op)
    s[C].opengl()
    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opengl", name="gemm")
    print("------opengl code------")
    print(f.imported_modules[0].get_source(fmt="gl"))

    ctx = tvm.opengl()
    n, m, l = nn, nn, nn
    a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)

    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
Example #14
def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print(lowered_func.body)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)
Example #15
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #16
def test_upstream():
    @tvm.hybrid.script
    def upstream(a):
        b = output_tensor((20, ), 'float32')
        for i in range(20):
            b[i] = a[i] * i
        return b

    a = tvm.placeholder((20, ), 'float32')
    b = tvm.placeholder((20, ), 'float32')
    c = tvm.compute((20, ), lambda x: a[x] + b[x])
    d = upstream(c)
    sch = tvm.create_schedule([c.op, d.op])
    ir = tvm.lower(sch, [a, b, d], simple_mode=True)
    func = tvm.build(sch, [a, b, d])
    assert(func)

    a = numpy.random.randn(20).astype('float32')
    b = numpy.random.randn(20).astype('float32')
    ref = numpy.zeros((20, ), 'float32')
    for i in range(20):
        ref[i] = (a[i] + b[i]) * i

    tvm_a = tvm.nd.array(a)
    tvm_b = tvm.nd.array(b)
    tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32'))

    func(tvm_a, tvm_b, tvm_d)
    tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
Example #17
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_llvm()
Example #18
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype="float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure it doesn't have any if statements left
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
Example #19
 def check_c():
     if not tvm.module.enabled("llvm"):
         return
     # Specifically allow offset to test codepath when offset is available
     Ab = tvm.decl_buffer(
         A.shape, A.dtype,
         elem_offset=tvm.var('Aoffset'),
         offset_factor=8,
         name='A')
     binds = {A : Ab}
     # BUILD and invoke the kernel.
     f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline")
     fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
     fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
     mhost = tvm.codegen.build_module(fsplits[0], "c")
     temp = util.tempdir()
     path_dso = temp.relpath("temp.so")
     mhost.export_library(path_dso)
     m = tvm.module.load(path_dso)
     fadd = m["fadd_pipeline"]
     ctx = tvm.cpu(0)
     # launch the kernel.
     n = nn
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a.asnumpy() + b.asnumpy())
Example #20
 def conv_normal(print_ir):
     print("----- CONV2D CPU End-to-End Test-------")
     s = topi.generic.schedule_conv2d_nchw([res])
     if print_ir:
         print(tvm.lower(s, [data, kernel, res], simple_mode=True))
     cost = verify(s, True)
     gops = (num_ops / cost.mean) / float(10 ** 9)
     print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
Example #21
        def run_schedule(load_inp,
                         load_wgt,
                         gemm,
                         alu,
                         store_out,
                         print_ir,
                         check_correctness):
            s = tvm.create_schedule(res.op)
            s[data_buf].set_scope(env.inp_scope)
            s[weight_buf].set_scope(env.wgt_scope)
            s[res_gem].set_scope(env.acc_scope)
            s[res_shf].set_scope(env.acc_scope)
            s[res_min].set_scope(env.acc_scope)
            s[res_max].set_scope(env.acc_scope)

            if block:
                bblock = block // env.BATCH
                iblock = block // env.BLOCK_IN
                oblock = block // env.BLOCK_OUT
                xbo, xco, xbi, xci = s[res].op.axis
                xb1, xco1, xb2, xco2 = s[res].tile(xbo, xco, bblock, oblock)
                store_pt = xb2

                s[res_gem].compute_at(s[res], xco1)
                s[res_shf].compute_at(s[res], xco1)
                s[res_min].compute_at(s[res], xco1)
                s[res_max].compute_at(s[res], xco1)

                xbo, xco, xbi, xci = s[res_gem].op.axis
                # Compute one line at a time
                ko1, ko2 = s[res_gem].split(ko, iblock)
                s[res_gem].reorder(ko1, ko2, xbo, xco, xbi, xci, ki)
                s[data_buf].compute_at(s[res_gem], ko1)
                s[weight_buf].compute_at(s[res_gem], ko1)
                # Use VTA instructions
                s[data_buf].pragma(s[data_buf].op.axis[0], load_inp)
                s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt)
                s[res_gem].tensorize(xbi, gemm)
                s[res_shf].pragma(s[res_shf].op.axis[0], alu)
                s[res_min].pragma(s[res_min].op.axis[0], alu)
                s[res_max].pragma(s[res_max].op.axis[0], alu)
                s[res].pragma(store_pt, store_out)
            else:
                xbo, xco, xbi, xci = s[res_gem].op.axis
                s[res_gem].reorder(ko, xbo, xco, xbi, xci, ki)
                # Use VTA instructions
                s[data_buf].pragma(s[data_buf].op.axis[0], load_inp)
                s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt)
                s[res_gem].tensorize(xbi, gemm)
                s[res_shf].pragma(s[res_shf].op.axis[0], alu)
                s[res_min].pragma(s[res_min].op.axis[0], alu)
                s[res_max].pragma(s[res_max].op.axis[0], alu)
                s[res].pragma(s[res].op.axis[0], store_out)


            if print_ir:
                print(tvm.lower(s, [data, weight, res], simple_mode=True))
            return verify(s, check_correctness)
Example #22
def main():
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    s[C].parallel(s[C].op.axis[0])
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
Example #23
 def check(factor):
     s = tvm.create_schedule(z.op)
     xo, xi = s[z].split(z.op.axis[0], factor=factor)
     vadd = intrin_vadd(factor)
     s[z].tensorize(xi, vadd)
     s = s.normalize()
     dom_map = tvm.schedule.InferBound(s)
     finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
     out_dom, in_dom = finfer(s[z], dom_map)
     assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].extent, factor)
     assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].min, xo * factor)
     assert tvm.ir_pass.Equal(in_dom.items()[0][1][0].extent, factor)
     fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
     body = fmatch(s[z], out_dom, in_dom, vadd)
     assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                              tvm.ir_pass.CanonicalSimplify(vadd.op.body[0]))
     stmt = tvm.schedule.ScheduleOps(s, dom_map)
     tvm.lower(s, [x, y, z])
Example #24
 def check(factor):
     s = tvm.create_schedule(C.op)
     x, y = C.op.axis
     yo, yi = s[C].split(y, factor=factor)
     gemv = intrin_gemv(factor, l)
     s[C].tensorize(yi, gemv)
     s = s.normalize()
     dom_map = tvm.schedule.InferBound(s)
     finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
     out_dom, in_dom = finfer(s[C], dom_map)
     assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
     assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
     assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
     fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
     body = fmatch(s[C], out_dom, in_dom, gemv)
     assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                              tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
     stmt = tvm.schedule.ScheduleOps(s, dom_map)
     tvm.lower(s, [A, B, C])
Example #25
def prepare_test_libs(base_path):
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    s = tvm.create_schedule(B.op)
    s[B].parallel(s[B].op.axis[0])
    print(tvm.lower(s, [A, B], simple_mode=True))

    # Compile library in system library mode
    fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib', name='addonesys')
    syslib_path = osp.join(base_path, 'test_addone_sys.o')
    fadd_syslib.save(syslib_path)
Example #26
 def check_llvm():
     if not tvm.module.enabled("llvm"):
         return
     # build two functions
     f2 = tvm.lower(s, [A, B, C], name="fadd1")
     f1 = tvm.lower(s, [A, B, C], name="fadd2")
     m = tvm.build([f1, f2], "llvm")
     fadd1 = m['fadd1']
     fadd2 = m['fadd2']
     ctx = tvm.cpu(0)
     # launch the kernel.
     n = nn
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd1(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a.asnumpy() + b.asnumpy())
     fadd2(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a.asnumpy() + b.asnumpy())
Example #27
 def check_rfactor_no_reset_multi_reduction(factor, rfactor):
     s = tvm.create_schedule(C.op)
     x, y = C.op.axis
     rk = C.op.reduce_axis[0]
     yo, yi = s[C].split(y, factor=factor)
     ro, ri = s[C].split(rk, factor=rfactor)
     roo, roi = s[C].split(ro, factor=2)
     s[C].reorder(yo, roo, roi, yi, ri)
     gemv = intrin_gemv_no_reset(factor, rfactor)
     s[C].tensorize(yi, gemv)
     s = s.normalize()
     dom_map = tvm.schedule.InferBound(s)
     finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
     out_dom, in_dom = finfer(s[C], dom_map)
     assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
     assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
     assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
     fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
     body = fmatch(s[C], out_dom, in_dom, gemv)
     assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                              tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
     stmt = tvm.schedule.ScheduleOps(s, dom_map)
     tvm.lower(s, [A, B, C])
Example #28
def _lower(sch, inputs, func_name, graph):
    import traceback
    # pylint: disable=broad-except
    try:
        f = tvm.lower(sch, inputs, name=func_name)
        if "quantized_conv2d" in func_name:
            logging.info(graph.ir(join_entry_attrs=["shape"]))
    except Exception:
        msg = traceback.format_exc()
        msg += "Error during compile graph\n"
        msg += "--------------------------\n"
        msg += graph.ir(join_entry_attrs=["shape"])
        raise RuntimeError(msg)
    return f if isinstance(
        f, (tvm.container.Array, tuple, list)) else [f]
Example #29
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_pool(B)
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
        print(tvm.lower(s, [A, B], simple_mode=True))

        f = tvm.build(s, [A, B], device)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example #30
def test_lower_rfactor():
    n = tvm.var("n")
    m = tvm.var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B.op].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B.op].bind(xi, tvm.thread_axis("threadIdx.y"))
    s[B].bind(s[B].op.reduce_axis[0], tvm.thread_axis("threadIdx.x"))
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    fapi = tvm.lower(s, [A, B])
Example #31
def test_tensor_intrin_scalar_params():
    n = te.size_var("n")
    x = te.placeholder((n, ), name="x")
    v = te.size_var("v")
    w = te.size_var("w")
    z = te.compute((n, ), lambda i: x[i] * v + w, name="z")

    def intrin_func(ins, outs, sp):
        assert isinstance(ins[0], tvm.te.schedule.Buffer)
        assert ins[0].shape[0] == n
        assert sp[0] == v
        assert sp[1] == w
        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0],
                                   sp[1])

    intrin = te.decl_tensor_intrin(z.op,
                                   intrin_func,
                                   scalar_params=[v, w],
                                   default_buffer_params={"offset_factor": 1})
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert intrin.buffers[0].shape[0] == n
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = te.placeholder((10, 10), name="A")
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = te.compute((10, 10),
                   lambda i, j: intrin(i * i, A[i, j], i + j),
                   name="C")
    s = te.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C])["main"].body
    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.value.args) == 5
    assert str(stmt.body.body.value.args[3]) == "(i: int32*i)"
    assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
Example #32
    def mod(self, target, load_type, store_type, indirect_indices):
        target = tvm.target.Target(target)

        n = 4
        dtype = "int32"
        A = te.placeholder((n, ), dtype=dtype, name="A")
        R = te.placeholder((n, ), dtype=dtype, name="R")

        def do_compute(ins, outs):
            ib = tvm.tir.ir_builder.create()
            A, R = map(ib.buffer_ptr, ins)
            B = ib.buffer_ptr(outs[0])

            if "gpu" in target.keys:
                ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)

            index_map = {
                "ramp": tvm.tir.Ramp(0, 1, 4),
                "broadcast": tvm.tir.Broadcast(0, 4),
            }

            load_index = index_map[load_type]
            store_index = index_map[store_type]

            if indirect_indices:
                load_index = tvm.tir.expr.Load("int32x4", R, load_index)

            transfer = tvm.tir.expr.Load("int32x4", A, load_index)
            ib.emit(tvm.tir.stmt.Store(B, transfer, store_index))

            return ib.get()

        B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
        s = te.create_schedule(B.op)

        return tvm.lower(s, [A, R, B])
Example #33
def test_lower_warp_memory_same_thread():
    m = n = 128
    A = te.placeholder((m, n), name="A")
    k = te.reduce_axis((0, n), name="k")
    B = te.compute((m,), lambda i: te.sum(A[i, k], axis=[k]))

    s = te.create_schedule(B.op)
    BB = s.cache_write(B, "warp")
    tx = te.thread_axis("threadIdx.x")
    xo, xi = s[B].split(B.op.axis[0], factor=32)
    s[B].bind(xi, tx)
    s[B].bind(xo, te.thread_axis("blockIdx.x"))
    s[BB].compute_at(s[B], xo)
    xo, xi = s[BB].split(s[BB].op.axis[0], factor=32)
    s[BB].bind(xi, tx)

    cuda_target = tvm.target.Target("cuda")
    assert cuda_target.thread_warp_size == 32
    mod = tvm.lower(s, [A, B], name="f")
    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", cuda_target))(mod)
    fdevice = tvm.tir.transform.SplitHostDevice()(mod)["f_kernel0"]
    mod = tvm.IRModule.from_expr(fdevice)
    fdevice = tvm.tir.transform.LowerWarpMemory()(mod)["f_kernel0"]
    assert "tvm_warp_shuffle" not in fdevice.astext()
Example #34
def test_multilevel_splitting_with_indivisble_factors():
    from tvm import topi

    A = te.placeholder((130, ), dtype="float32")
    B = topi.nn.relu(A)
    s = te.create_schedule(B.op)
    (y, ) = s[B].op.axis
    (yo, yi) = s[B].split(y, factor=8)
    (yoo, yoi) = s[B].split(yo, factor=16)
    s[B].reorder(yoo, yoi, yi)
    s[B].unroll(yi)

    ## But this does the right thing.
    with tvm.transform.PassContext(
            config={"tir.LoopPartition": {
                "partition_const_loop": True
            }}):
        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body

        def visit_stmt(op):
            return isinstance(op, tvm.tir.Max)

        num_max = collect_visit(lowered_body, visit_stmt)
        assert num_max.count(True) == 10
Example #35
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = te.placeholder((n, ), name='A', dtype="float32x%d" % lanes)
    B = te.compute((n, ), lambda i: A[i], name='B')
    C = te.compute((n, ), lambda i: B[i] + tvm.tir.const(1, A.dtype), name='C')
    s = te.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n, ),
                     A.dtype).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n, ), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #36
def test_tensor_intrin_scalar_params():
    n = tvm.size_var("n")
    x = tvm.placeholder((n, ), name='x')
    v = tvm.size_var("v")
    w = tvm.size_var("w")
    z = tvm.compute((n, ), lambda i: x[i] * v + w, name='z')

    def intrin_func(ins, outs, sp):
        assert (isinstance(ins[0], tvm.schedule.Buffer))
        assert (ins[0].shape[0] == n)
        assert (sp[0] == v)
        assert (sp[1] == w)
        return tvm.call_packed("hw_func", ins[0].data, outs[0].data, sp[0],
                               sp[1])

    with tvm.build_config(offset_factor=1):
        intrin = tvm.decl_tensor_intrin(z.op,
                                        intrin_func,
                                        scalar_params=[v, w])
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert (intrin.buffers[0].shape[0] == n)
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = tvm.placeholder((10, 10), name='A')
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = tvm.compute((10, 10),
                    lambda i, j: intrin(i * i, A[i, j], i + j),
                    name="C")
    s = tvm.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C], simple_mode=True)
    assert isinstance(stmt.body.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.body.value.args) == 5
    assert str(stmt.body.body.body.value.args[3]) == "(i*i)"
    assert str(stmt.body.body.body.value.args[4]) == "(i + j)"
Example #37
def test_local_gemm():
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A', dtype='int32')
    B = tvm.placeholder((m, l), name='B', dtype='int32')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')

    s = tvm.create_schedule(C.op)
    s[C].opengl()
    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opengl", name="gemm")
    print("------opengl code------")
    print(f.imported_modules[0].get_source(fmt="gl"))

    ctx = tvm.opengl()
    n, m, l = nn, nn, nn
    a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)

    np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
Example #38
def test_large_input():
    @te.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), "int32")
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name="a", dtype="int32")
    b = te.placeholder(shape, name="b", dtype="int32")
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c])["main"].body

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
Example #39
def test_scan_inline2():
    m = te.var("m")
    n = te.var("n")
    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
    s_state1 = te.placeholder((m, n))
    s_state2 = te.placeholder((m, n))
    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
    s_xx = te.compute((m, n),
                      lambda t, i: s_state1[t - 1, i] + x[t, i],
                      name="xx")
    s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1")
    s_x2 = te.compute((m, n),
                      lambda t, i: s_xx[t, i] + s_state2[t - 1, 2],
                      name="x2")
    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2],
                             [s_state1, s_state2])
    s = te.create_schedule(res1.op)
    s[s_xx].compute_inline()
    s[s_x1].compute_inline()
    s[s_x2].compute_inline()
    stmt = tvm.lower(s, [x, res1, res2])
Example #40
def try_yolo_conv(batch_size, config):
    global __COUNTER__
    __COUNTER__ += 1
    # get the compute
    yolo_conv = YoloConvLayer17()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight, config)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 0
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    print("%d. config is:\n %s" % (__COUNTER__, str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv17 use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
Example #41
def _get_gaussian_map_sum_tvm_mod():
    rows, cols = tvm.var('rows'), tvm.var('cols')  # the shape of output
    n = tvm.var('n')  # the number of samples
    data = tvm.placeholder((n, 3), name='data')
    ni = tvm.reduce_axis((0, n), name='ni')
    pi = tvm.const(np.pi)

    def _gaussian_map_sum(i, j):
        # i is row, j is col
        x, y = data[ni, 0], data[ni, 1]
        sigma = data[ni, 2]
        sigma2 = sigma * sigma
        v = tvm.if_then_else(
            tvm.all(x >= 0, x < cols, y >= 0, y < rows),
            tvm.exp(-(topi.power((x - j), 2) + topi.power(
                (y - i), 2)) / (2 * sigma2)) / (2 * pi * sigma2), 0)
        return tvm.sum(v, axis=ni)

    out = tvm.compute((rows, cols), _gaussian_map_sum, name='out')
    s = tvm.create_schedule(out.op)
    out_i = s[out].fuse(*out.op.axis)
    s[out].parallel(out_i)
    print(tvm.lower(s, [data], simple_mode=True))
    return tvm.build(s, [data, out])
Example #42
 def check_device(target):
     num_step = n_num_step
     print(tvm.lower(s, [Xi2h, Wh2h, scan_h, scan_c], simple_mode=True))
     flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
     ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
     # launch the kernel.
     scan_h_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     scan_c_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     Xi2h_np = np.random.normal(size=(num_step, batch_size, 4,
                                      num_hidden)).astype("float32")
     Wh2h_np = np.random.normal(size=(4, num_hidden,
                                      num_hidden)).astype("float32")
     scan_h_a = tvm.nd.array(scan_h_np, ctx)
     scan_c_a = tvm.nd.array(scan_c_np, ctx)
     Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
     Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
     flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     ctx.sync()
     # measure time cost of second step.
     evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
     eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     print("Time cost=%g" % eval_result.mean)
Example #43
def schedule_gpu1_1(four):
    neuron_i, synapse, temp, neuron_n = four
    sch = tvm.create_schedule(neuron_n.op)

    b, n = sch[neuron_n].op.axis
    print(sch[neuron_n].op.axis)
    no, ni = sch[neuron_n].split(n, nparts=49)
    noo, noi = sch[neuron_n].split(no, nparts=7)
    nio, nii = sch[neuron_n].split(ni, nparts=16)
    sch[temp].compute_at(sch[neuron_n], nii)

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    block_z = tvm.thread_axis("blockIdx.z")
    thread_x = tvm.thread_axis((0, 8), "threadIdx.x")
    thread_y = tvm.thread_axis((0, 8), "threadIdx.y")
    thread_z = tvm.thread_axis((0, 8), "threadIdx.z")

    ro, ri = sch[temp].split(temp.op.reduce_axis[0], 4)
    roo, roi = sch[temp].split(ro, 4)
    sch[temp].vectorize(ri)
    sch[temp].unroll(roi)

    sch[neuron_n].bind(noo, block_y)
    sch[neuron_n].bind(noi, block_x)
    sch[neuron_n].bind(nio, thread_y)
    sch[neuron_n].bind(nii, thread_x)

    #sch[neuron_n].reorder(y, x, ky, kx, i, n, b)

    print(tvm.lower(sch, four, simple_mode=True))
    func = tvm.build(sch, [neuron_i, synapse, neuron_n], target='cuda')
    assert func
    print('GPU compilation done...')

    return func
Example #44
def run_and_check(func, args, outs, var_dict={}, target='llvm'):
    def tvm_val_2_py_val(val):
        val = tvm.ir_pass.Substitute(val, var_dict)
        val = tvm.ir_pass.Simplify(val)
        assert isinstance(val, (tvm.expr.IntImm, tvm.expr.UIntImm))
        return val.value

    ctx = tvm.context(target, 0)

    emu_args = []
    nd_args = []
    to_check = []
    for i in args:
        if isinstance(i, tvm.tensor.Tensor):
            shape = [tvm_val_2_py_val(j) for j in i.shape]
            if i in outs:
                emu_args.append(numpy.zeros(shape).astype(i.dtype))
                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
                to_check.append((nd_args[-1], emu_args[-1]))
            else:
                emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
        else:
            assert isinstance(i, tvm.expr.Var)
            emu_args.append(tvm_val_2_py_val(i))
            nd_args.append(emu_args[-1])

    func(*emu_args)

    lowered_func = tvm.lower(func(*args), args)
    module = tvm.build(lowered_func, target=target)
    assert module
    module(*nd_args)

    for nd, np in to_check:
        numpy.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
Example #45
def test_large_input():
    @tvm.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), 'int32')
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name='a', dtype='int32')
    b = te.placeholder(shape, name='b', dtype='int32')
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c], simple_mode=True)

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
Example #46
def test_ib():
    print('aaaa')
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, name='a')
    w = shape[0]
    e = 16

    def build_nms_ir(ten_in, ten_out):
        ib = tvm.ir_builder.create()
        imm_value = 10
        ib.scope_attr(env.nnpu_axis, "coproc_scope", 0)
        p_in = ib.buffer_ptr(ten_in[0])
        p_out = ib.buffer_ptr(ten_out[0])
        #with ib.for_range(0,w, name="k") as k:
        with ib.for_range(0, w / e, name="i") as i:
            ib.emit(
                make_intrin_call(
                    "void", 'VAddI', ten_out[0].access_ptr("w", 'uint32') +
                    i * dtype_bytes(dtype_w),
                    ten_in[0].access_ptr("r", 'uint32') +
                    i * dtype_bytes(dtype_w), tvm.const(imm_value, 'float64'),
                    env.cfg['vector_unit']['size'], 3))
        stmt = ib.get()
        return stmt

    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    sph.MarkScope(a_buf)
    out = tvm.extern(a_buf.shape, [a_buf],
                     build_nms_ir,
                     in_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     out_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     dtype=dtype_w,
                     name="test_ir")
    sph.MarkScope(out)
    out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph)
    s = tvm.create_schedule([out_host.op])
    sph.Transform(s)
    print(tvm.lower(s, [a, out_host], simple_mode=True))
    print(nnpu.lower(s, [a, out_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test')
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=0, high=127)
    a_nd = tvm.nd.array(a_np, ctx)

    b_nd = tvm.nd.array(np.zeros(16, ).astype(out_host.dtype), ctx)

    func(a_nd, b_nd)

    print('a = ')
    print(a_np)
    print('xjb sum = ')
    print(b_nd.asnumpy())
    return
Example #47
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)

################################################################################
# Inspecting the Optimized Schedule
# ---------------------------------
# We can lower the schedule to see the IR after auto-scheduling.  The
# auto-scheduler correctly performs optimizations including multi-level tiling,
# layout transformation, parallelization, vectorization, unrolling, and
# operator fusion.

print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))

################################################################################
# Check correctness and evaluate performance
# ------------------------------------------
# We build the binary and check its correctness and performance.

func = tvm.build(sch, args, target)
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = a_np.dot(b_np) + c_np

dev = tvm.cpu()
a_tvm = tvm.nd.array(a_np, device=dev)
b_tvm = tvm.nd.array(b_np, device=dev)
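
# The excerpt stops here; a minimal continuation sketch, assuming the task
# computes out = A.dot(B) + C (as out_np above suggests), so func takes
# four buffers:
c_tvm = tvm.nd.array(c_np, device=dev)
out_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm, out_tvm)
# Check the result against the numpy reference computed above.
np.testing.assert_allclose(out_tvm.numpy(), out_np, rtol=1e-3)
# Measure execution time with the module's time evaluator.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print("Execution time: %.3f ms"
      % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000))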
Example #48
import tvm
from tvm import te
import numpy as np

######################################################################
# We first write a very simple vector add and build it with the default schedule. Then, we use
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#

n = tvm.tir.const(128, "int32")
a = te.placeholder((n, ), name="a")
b = te.placeholder((n, ), name="b")
c = te.compute((n, ), lambda i: a[i] + b[i], name='c')

sch = te.create_schedule(c.op)
ir = tvm.lower(sch, [a, b, c], simple_mode=True)
print(ir)

######################################################################
# Writing a Pass
# --------------
# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
# Thus, we define this vectorize function and implement it step by step.
#
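
######################################################################
# Before that, a minimal self-contained sketch of the shape such a
# function takes (a toy transform of our own, not the tutorial's
# vectorizer): it walks the statement with
# ``tvm.tir.stmt_functor.ir_transform`` and keeps every For node as-is.
#

def toy_pass(stmt):
    def keep_for(op):
        # Returning None keeps the node unchanged; returning a new Stmt
        # would replace it.
        return None
    return tvm.tir.stmt_functor.ir_transform(stmt, None, keep_for, ["tir.For"])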

######################################################################
# TVM already provides two classes for users to both analyze and transform the IR.
#
# IR Visitor
# ~~~~~~~~~~
# We can use ``tvm.tir.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR.
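
######################################################################
# A minimal sketch of such a visit (recent TVM exposes it as
# ``tvm.tir.stmt_functor.post_order_visit``; older releases used the
# ``ir_pass`` spelling above). Assuming ``ir`` is the IRModule returned
# by ``tvm.lower`` earlier, this collects the extent of every loop in
# the vector-add body:
#

loop_extents = []

def collect_extent(op):
    if isinstance(op, tvm.tir.For):
        loop_extents.append(op.extent)

tvm.tir.stmt_functor.post_order_visit(ir["main"].body, collect_extent)
print(loop_extents)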
Example #49
import tvm

n = 1024
m = 1024
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, n), name='k')
l = tvm.reduce_axis((0, m), name = 'l')

B = tvm.compute((n,), lambda i: tvm.sum(A[i, l], axis=l), name='B')

s = tvm.create_schedule(B.op)

ko, ki = s[B].split(B.op.reduce_axis[0], factor=4)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

s[B].pragma(ki, "unroll")

print(tvm.lower(s, [A, B], simple_mode=True))
Example #50
#b_shared = sch.cache_read(b, 'shared', [c_acc])
b_shared = b
b_frag = sch.cache_read(b_shared, 'wmma.matrix_b', [c_acc])
sch[b_frag].compute_at(sch[c_acc], c_rio)
bxo, bxi = sch[b_frag].split(sch[b_frag].op.axis[0], 16)
byo, byi = sch[b_frag].split(sch[b_frag].op.axis[1], 16)
sch[b_frag].reorder(bxo, byo, bxi, byi)
sch[b_frag].pragma(bxo, 'tensorize', 'tensorcore.load_b')
#sch[b_shared].compute_at(sch[c_acc], c_roi)

import tensorizer
with tvm.transform.PassContext(
        opt_level=4, config={'tir.add_lower_pass': [(1, tensorizer.rewrite)]}):
    #with tvm.transform.PassContext(opt_level=4):
    ir = tvm.lower(sch, [a, b, c])
    module = tvm.build(sch, [a, b, c], 'nvptx')
    print(ir)

#print(module.imported_modules[0].get_source())
np_a = np.random.randn(n, k).astype('float16')
np_b = np.random.randn(k, m).astype('float16')
np_c = np.random.randn(n, m).astype('float32')

#np_a = np.ones((n, k)).astype('float16')
#np_b = np.ones((k, m)).astype('float16')
#np_c = np.ones((n, m)).astype('float32')

#np_a = np.array(np.array(list(range(k)) * n) % 3).astype('float16')
#np_a.shape = (n, k)
#np_b = np.array(np.array(list(range(k)) * m) % 3).astype('float16')
Example #51
from __future__ import absolute_import, print_function
import tvm
import topi
import numpy as np

if __name__ == '__main__':
    x, y = 100, 10
    a = tvm.placeholder((x, y, y), name='a')
    b = tvm.placeholder((y, y), name='b')
    c = a + b
    d = a * b

    e = topi.elemwise_sum([c, d])
    f = e / 2.0
    g = topi.sum(f)
    with tvm.target.cuda():
        sg = topi.generic.schedule_reduce(g)
        print(tvm.lower(sg, [a, b], simple_mode=True))
Beispiel #52
# -------------
# Let's revisit the sum-of-rows operation (equivalent to :code:`B = numpy.sum(A, axis=1)`).
# To compute the sum of rows of a two-dimensional TVM tensor A, we should
# specify the symbolic operation as well as the schedule, as follows
#
n = tvm.var("n")
m = tvm.var("m")
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)

######################################################################
# and to examine the IR code in human-readable format, we can do
#
print(tvm.lower(s, [A], simple_mode=True))

######################################################################
# However, for such a common operation we had to define the reduce axis ourselves,
# as well as the explicit computation, with :code:`tvm.compute`. Imagine how much
# detail we would need to provide for more complicated operations.
# Fortunately, we can replace those two lines with a simple :code:`topi.sum`, much like :code:`numpy.sum`
#
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))

######################################################################
# Numpy-style operator overloading
# --------------------------------
# We can add two tensors with broadcastable shapes using :code:`topi.broadcast_add`.
# Even shorter, TOPI provides operator overloading for such common operations; for example,
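# (a sketch; the operands below are defined fresh for illustration, reusing
# the symbolic sizes ``n`` and ``m`` from above)

x = tvm.placeholder((n, m), name='x')
y = tvm.placeholder((m,), name='y')
c = x + y  # equivalent to topi.broadcast_add(x, y)
d = x * y  # equivalent to topi.broadcast_mul(x, y)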
Beispiel #53
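# Assumed context from the enclosing script of this example: ``target``,
# ``proxy_host``, ``proxy_port``, ``key`` and ``evaluate`` are defined
# elsewhere; the RPC and packaging helpers used below come from:
from tvm import rpc
from tvm.contrib import ndk, util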
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert (bn <= N)
    assert (num_thread * num_thread * 16 <= N)
    assert (num_block * num_block * 2 <= N)
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')

    # Pack the columns of B into bn-wide contiguous chunks
    packedB = tvm.compute((N, N // bn, bn),
                          lambda x, y, z: B[x, y * bn + z],
                          name='B')

    C = tvm.compute((N, N),
                    lambda ii, jj: tvm.sum(
                        A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k),
                    name='C')

    s = tvm.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")

    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
Beispiel #54
if USE_TUNED_LOG:  # hypothetical flag: choose the tuning log over the pretuned config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
else:
    config = task.config_space.get(PRETUNED_INDEX)
    dispatch_context = autotvm.task.ApplyConfig(config)
    print("Using pretuned config:")
    print(config)

with dispatch_context:
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d(N, H, W, CO, CI, KH, KW, strides, padding,
                             scaling_factor)
        print(tvm.lower(s, arg_bufs, simple_mode=True))
        func = tvm.build(s, arg_bufs)
        print(func.imported_modules[0].get_source())

# check correctness
a_np = np.random.randint(size=(N, CI // BI, H, W, BI),
                         low=-128,
                         high=127,
                         dtype='int8')
w_np = np.random.randint(size=(CO // BO, CI // BI, KH, KW, BO, BI),
                         low=-128,
                         high=127,
                         dtype='int8')
a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W)
w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW)
#c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8')
Beispiel #55
        output = op.output(0)
        conv_out = op.input_tensors[0]

        kernel_pack = conv_out.op.input_tensors[1]
        kernel = kernel_pack.op.input_tensors[0]

        data_vec = conv_out.op.input_tensors[0]
        data = data_vec.op.input_tensors[0]
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.name:
            data_pad = data
            data = data_pad.op.input_tensors[0]

        s = _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack,
                           conv_out, output, output)
        print(tvm.lower(s, [A, W, Conv], simple_mode=True))
        conv_unpack = tvm.nd.array(
            np.zeros(get_const_tuple(Conv.shape), dtype=dtype), ctx)
        func = tvm.build(s, [A, W, Conv], device)
        time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
        cost_unpack = time_f(tvm.nd.array(a_np), tvm.nd.array(w_np),
                             conv_unpack).mean
        print('conv: %g ms/op' % (cost_unpack * 1000.0))

# W0
# batch_size, in_channel, in_size, num_filter, kernel_size, stride, padding = 1, 3, 224, 64, 7, 2, 3
# ic_bn, oc_bn, ur_w = 3, 16, 28
# verify(1, 3, 224, 64, 7, 2, 3)

# W1
# batch_size, in_channel, in_size, num_filter, kernel_size, stride, padding = 1, 64, 56, 64, 3, 1, 1
Beispiel #56
def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width,
                  in_filter, out_filter, k_h, k_w, hpad, wpad, hstride,
                  wstride):
    """
    Runs the inference and checks the functional correctness between
    compute and schedule outputs
    """
    (data_shape, kernel_shape,
     o_shape) = get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w,
                          hpad, wpad, hstride, wstride, out_dtype)

    # Create TVM placeholders
    data = te.placeholder(data_shape, name='data', dtype=data_dtype)
    kernel = te.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == 'float32':
        data_array = tvm.nd.array(
            np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(
            np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(
            np.random.randint(100, size=data_shape).astype(data_dtype), CTX)
        kernel_array = tvm.nd.array(
            np.random.randint(100, size=kernel_shape).astype(kernel_dtype), CTX)

    # c_orig will be used for the declaration output
    # c_sch will be used for the scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)

    with tvm.target.Target(TARGET_NAME):
        conv = topi.nn.conv2d_NCHWc(data,
                                    kernel,
                                    stride=hstride,
                                    padding=hpad,
                                    dilation=(1, 1),
                                    layout='NCHWc',
                                    out_layout='NCHWc',
                                    out_dtype=out_dtype)
        out = topi.nn.relu(conv)
        sch = te.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out],
                         target=TARGET_NAME,
                         name='out')
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        func = tvm.build(sconv, [data, kernel, out],
                         target=TARGET_NAME,
                         name='conv')
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == 'uint8':
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
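
# A hypothetical invocation (all sizes illustrative; assumes the module-level
# CTX, TARGET_NAME, LOGGER and get_shape helpers of the original script):
#
#   cost = run_inference('uint8', 'int8', 'int32', 56, 56, 64, 64, 3, 3,
#                        1, 1, 1, 1)
#   print('conv2d_NCHWc + relu: %g s/op' % cost)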
Beispiel #57
                        type=str,
                        default=None,
                        dest='cuda_arch',
                        help='The cuda arch for compiling kernels for')
    arguments = parser.parse_args()

    func_list_llvm = []
    func_list_cuda = []

    # TODO: attach instruction features to the library, e.g., avx-512, etc.
    for operator_def in __OP_DEF__:
        for sch, args, name in operator_def.invoke_all():
            if tvm.module.enabled(get_target(operator_def.target)):
                func_list = func_list_llvm if operator_def.target == "cpu" else func_list_cuda
                func_lower = tvm.lower(sch,
                                       args,
                                       name=name,
                                       binds=operator_def.get_binds(args))
                func_list.append(func_lower)

    lowered_funcs = {get_target("cpu"): func_list_llvm}
    if len(func_list_cuda) > 0:
        lowered_funcs[get_target("cuda")] = func_list_cuda
        cuda_arch = get_cuda_arch(arguments.cuda_arch)
        if cuda_arch is None:
            logging.info(
                'No cuda arch specified. TVM will try to detect it from the build platform.'
            )
        else:
            logging.info(
                'Cuda arch {} set for compiling TVM operator kernels.'.format(
                    cuda_arch))
Beispiel #58
                 name="res")

######################################################################
# Scheduling the Computation
# --------------------------
# We'll look at a set of schedule transformations necessary to map the
# matrix multiplications onto VTA in an efficient fashion.
# Those include:
#
# - Computation blocking
# - Lowering to VTA hardware intrinsics

# Create TVM schedule
s = te.create_schedule(res.op)
# Let's look at the default TVM schedule
print(tvm.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# Blocking the Computation
# ~~~~~~~~~~~~~~~~~~~~~~~~
# The matrix multiplication is by default too large for activations or weights
# to fit on VTA's on-chip buffers all at once.
# We block the (1, 1024) by (1024, 1024) matrix multiplication into
# smaller (1, 256) by (256, 256) matrix multiplications so the intermediate
# tensors can fit on the accelerator's on-chip SRAM.
# This approach is similar to blocking techniques applied to CPUs and GPUs in
# order to increase cache hit rate.
#
# We perform blocking along each axis (the batch axis being untouched since
# we are performing single-batch inference).
# We also leave the inner-most tensorization axes as-is in order to allow
# TVM to pattern-match tensorization. A self-contained sketch of the
# blocking idea follows.
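
######################################################################
# A self-contained sketch of that blocking step (shapes mirror the text;
# names and factors are illustrative and independent of the VTA-specific
# schedule and its hardware intrinsics):

import tvm
from tvm import te

K, M = 1024, 1024
A_blk = te.placeholder((1, K), name="A_blk")
W_blk = te.placeholder((K, M), name="W_blk")
k_blk = te.reduce_axis((0, K), name="k_blk")
R_blk = te.compute(
    (1, M),
    lambda i, j: te.sum(A_blk[i, k_blk] * W_blk[k_blk, j], axis=k_blk),
    name="R_blk")

s_blk = te.create_schedule(R_blk.op)
# Split the output and reduction axes into 256-wide tiles, turning the
# (1, 1024) x (1024, 1024) product into (1, 256) x (256, 256) blocks
jo, ji = s_blk[R_blk].split(R_blk.op.axis[1], factor=256)
ko, ki = s_blk[R_blk].split(k_blk, factor=256)
s_blk[R_blk].reorder(jo, ko, ji, ki)
print(tvm.lower(s_blk, [A_blk, W_blk, R_blk], simple_mode=True))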
Beispiel #59
func = tvm.build(sg, [a, b, g], 'cuda')
ctx = tvm.gpu(0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)

a_nd = tvm.nd.array(a_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
func(a_nd, b_nd, g_nd)
np.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)

# common neural nets
tarray = tvm.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.cuda():
    sst = topi.generic.schedule_softmax(softmax_topi)
    # print(tvm.lower(sst, [tarray], simple_mode=True))

# fusing conv
# fuse topi.nn.conv2d and topi.nn.relu together
data = tvm.placeholder((1, 3, 224, 224))
kernel = tvm.placeholder((10, 3, 5, 5))
conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
out = topi.nn.relu(conv)
with tvm.target.create('cuda'):
    # does every kind of operation really have its own dedicated schedule?
    sconv = topi.generic.nn.schedule_conv2d_nchw(out)
    print(tvm.lower(sconv, [data, kernel], simple_mode=True))
Beispiel #60
# Name this reduction axis distinctly: the schedule below rebinds ``s``,
# which would otherwise shadow the axis variable.
s_r = tvm.te.reduce_axis((0, S), "s")

D = tvm.te.compute(
    [P, Q],
    lambda i, j: tvm.te.sum(A[i * R + r, j * S + s_r] * C[i * R + r, j * S + s_r],
                            axis=[r, s_r]),
    name="D",
    requires_grad=True)

E = mse_loss(D, label)

dA, = tvm.te.mygradient(E, [A])  # autodiff hook from this example's TVM fork (mainline exposes tvm.te.gradient)

s = tvm.te.create_schedule([E.op, dA.op])

print(tvm.lower(s, [A, label, E, dA], simple_mode=True))

func = tvm.build(s, [A, label, E, dA], target="llvm")

A_np = np.random.uniform(-10, 10, [H, W]).astype("float32")
label_np = np.random.uniform(-10, 10, [P, Q]).astype("float32")
E_np = np.zeros([1]).astype("float32")
dA_np = np.zeros([H, W]).astype("float32")

ctx = tvm.context("llvm", 0)
A_tvm = tvm.nd.array(A_np, ctx)
label_tvm = tvm.nd.array(label_np, ctx)
E_tvm = tvm.nd.array(E_np, ctx)
dA_tvm = tvm.nd.array(dA_np, ctx)

func(A_tvm, label_tvm, E_tvm, dA_tvm)
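
# As a quick sanity check (a sketch; the probed index and ``eps`` are
# arbitrary), compare dA[0, 0] against a one-sided finite difference of E:
eps = 1e-2
E_base = E_tvm.asnumpy()[0]
A_pert_np = A_np.copy()
A_pert_np[0, 0] += eps
E_pert_tvm = tvm.nd.array(np.zeros([1]).astype("float32"), ctx)
dA_scratch = tvm.nd.array(np.zeros([H, W]).astype("float32"), ctx)
func(tvm.nd.array(A_pert_np, ctx), label_tvm, E_pert_tvm, dA_scratch)
fd = (E_pert_tvm.asnumpy()[0] - E_base) / eps
print("dA[0, 0] = %g, finite difference = %g" % (dA_tvm.asnumpy()[0, 0], fd))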