Code example #1
File: test_pass_annotation.py Project: bddppq/tvm
def test_annotate_none():
    ctx1 = tvm.context(1)
    ctx2 = tvm.context(2)
    x = relay.var("x", shape=(3,))
    y = relay.var("y", shape=(3,))
    z = relay.var("z", shape=(3,))

    def annotated():
        add = relay.add(x, y)
        sub = relay.subtract(add, z)
        func = relay.Function([x, y, z], sub)
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func,
                                                   ctx1.device_type)
        return func

    def expected():
        add = relay.add(x, y)
        sub = relay.subtract(add, z)
        func = relay.Function([x, y, z], sub)
        return func

    annotated_func = relay.ir_pass.infer_type(annotated())
    expected_func = relay.ir_pass.infer_type(expected())
    assert relay.ir_pass.alpha_equal(annotated_func, expected_func)
Code example #2
File: test_pass_annotation.py Project: bddppq/tvm
def test_annotate_all():
    ctx1 = tvm.context(1)
    ctx2 = tvm.context(2)
    x = relay.var("x", shape=(3,))
    y = relay.var("y", shape=(3,))
    z = relay.var("z", shape=(3,))

    def annotated():
        add = relay.add(x, y)
        _add = relay.annotation.on_device(add, ctx2)
        sub = relay.subtract(add, z)
        _sub = relay.annotation.on_device(sub, ctx2)

        func = relay.Function([x, y, z],
                              relay.Tuple(tvm.convert([_add, _sub,
                                                       sub])))
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func,
                                                   ctx1.device_type)
        func = relay.ir_pass.infer_type(func)
        return relay.Function(relay.ir_pass.free_vars(func.body[2]),
                              func.body[2])

    def expected():
        add = relay.add(x, y)
        sub = relay.subtract(add, z)
        func = relay.Function([x, y, z], sub)
        return func

    annotated_func = relay.ir_pass.infer_type(annotated())
    expected_func = relay.ir_pass.infer_type(expected())
    assert relay.ir_pass.alpha_equal(annotated_func, expected_func)
Code example #3
def test_compile_engine():
    engine = relay.backend.compile_engine.get()
    def get_func(shape):
        x = relay.var("x", shape=shape)
        y = relay.add(x, x)
        z = relay.add(y, x)
        f = relay.ir_pass.infer_type(relay.Function([x], z))
        return f
    z1 = engine.lower(get_func((10,)), "llvm")
    z2 = engine.lower(get_func((10,)), "llvm")
    z3 = engine.lower(get_func(()), "llvm")
    assert z1.same_as(z2)
    assert not z3.same_as(z1)
    if tvm.context("cuda").exist:
        z4 = engine.lower(get_func(()), "cuda")
        assert not z3.same_as(z4)

    # Test JIT target
    for target in ["llvm"]:
        ctx = tvm.context(target)
        if ctx.exist:
            f = engine.jit(get_func((10,)), target)
            x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx)
            y = tvm.nd.empty((10,), ctx=ctx)
            f(x, y)
            tvm.testing.assert_allclose(
                y.asnumpy(), x.asnumpy() * 3)
    engine.dump()
Code example #4
File: test_pass_annotation.py Project: bddppq/tvm
    def test_fuse_all(device, tgt):
        """Fuse all operators."""
        fallback_device = tvm.context("cpu")
        target = {"cpu": "llvm", device: tgt}
        cpu_ctx = fallback_device
        dev_ctx = tvm.context(device)

        def annotated():
            add = relay.add(x, y)
            _add = relay.annotation.on_device(add, dev_ctx)
            sqrt = relay.sqrt(add)
            _sqrt = relay.annotation.on_device(sqrt, dev_ctx)
            log = relay.log(add)
            _log = relay.annotation.on_device(log, dev_ctx)
            subtract = relay.subtract(sqrt, log)
            _subtract = relay.annotation.on_device(subtract, dev_ctx)
            exp = relay.exp(subtract)
            _exp = relay.annotation.on_device(exp, dev_ctx)

            func = relay.Function([x, y],
                                  relay.Tuple(tvm.convert([_add, _sqrt, _log,
                                                           _subtract, _exp,
                                                           exp])))
            func = relay.ir_pass.infer_type(func)
            func = relay.ir_pass.rewrite_annotated_ops(func,
                                                       cpu_ctx.device_type)
            func = relay.ir_pass.infer_type(func)
            return relay.Function(relay.ir_pass.free_vars(func.body[5]),
                                  func.body[5])

        annotated_func = annotated()
        expected_func = get_func()
        check_annotated_graph(annotated_func, expected_func)
        test_runtime(target, device, annotated_func, fallback_device)
Code example #5
File: test_sparse.py Project: bddppq/tvm
def test_dynamic_tensor():
    dtype = 'float32'
    stype = 'csr'
    target = 'llvm'
    ctx = tvm.context(target, 0)
    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
    assert(A.stype == 'csr')
    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
    s = tvm.create_schedule(C.op)
    _nr, _nc = 3, 5
    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
    a = tvmsp.array(a, ctx)
    assert a.data.dtype == a.dtype
    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
    binds = {A.data: Ab.data, A.indices: Ab.indices}
    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data.shape[0], a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
Code example #6
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype="float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure it has no if statements left
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
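The helpers find_top_produce and collect_visit used above come from the surrounding test file. A minimal sketch of plausible definitions, assuming the old tvm.ir_pass visitor API (hedged, not necessarily the canonical versions):

def collect_visit(stmt, f):
    # Apply f to every node of the statement tree and collect the results.
    ret = []
    tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x)))
    return ret

def find_top_produce(stmt):
    # Gather ProducerConsumer nodes in post order; the last one visited
    # is the outermost produce.
    ret = []
    def visit(x):
        if isinstance(x, tvm.stmt.ProducerConsumer):
            ret.append(x)
    tvm.ir_pass.PostOrderVisit(stmt, visit)
    return ret[-1]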
Code example #7
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % device)
        with tvm.target.create(device):
            C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_group_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\
                (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \
            (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
Code example #8
File: test_topi_broadcast.py Project: gwli/tvm
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        target = topi.cpp.TEST_create_target(device)
        s = topi.cpp.cuda.schedule_injective(target, [C])
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        elif typ == "pow":
            out_npy = lhs_npy ** rhs_npy
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
Code example #9
File: test_topi_vision.py Project: bddppq/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            if device == 'llvm':
                out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False)
                indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk)
            else:
                out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False)
                indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk)
            s = topi.generic.schedule_nms(out)
            indices_s = topi.generic.schedule_nms(indices_out)

        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)

        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f = tvm.build(s, [data, valid_count, out], device)
        f(tvm_data, tvm_valid_count, tvm_out)
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)

        tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx)
        f = tvm.build(indices_s, [data, valid_count, indices_out], device)
        f(tvm_data, tvm_valid_count, tvm_indices_out)
        tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
Code example #10
File: test_topi_conv2d_NCHWc.py Project: bddppq/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
                                     (dilation, dilation),
                                     layout='NCHW%dc'%ic_block,
                                     out_layout="NCHW%dc"%oc_block,
                                     out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_NCHWc([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
Code example #11
def test_multiple_kernels():
    N = 1024

    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])

    s = tvm.create_schedule([C.op])

    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_shared_memory_per_block=0,
                                max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
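get_verify_pass is assumed to come from the surrounding test file (it is used again in code example #13 below). A plausible sketch, assuming the old tvm.ir_pass API: the keyword constraints are forwarded to VerifyGPUCode, and the result is recorded in the one-element list.

def get_verify_pass(valid, **kwargs):
    def verify_pass(stmt):
        # Record whether the lowered statement satisfies the GPU limits.
        valid[0] = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
        return stmt
    return verify_pass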
Code example #12
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_elemwise(B)

        k_ = 2
        foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ)
        a_npy = np.random.uniform(size=shape).astype(A.dtype)
        if typ == "add":
            b_npy = a_npy + k_
        elif typ == "sub":
            b_npy = a_npy - k_
        elif typ == "mul":
            b_npy = a_npy * k_
        elif typ == "div":
            b_npy = a_npy / k_
        else:
            raise NotImplementedError()

        a_nd = tvm.nd.array(a_npy, ctx)
        b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
        foo(a_nd, b_nd, k_, *shape)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
Code example #13
def test_local_memory():
    N = 1024
    M = 128

    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    AA = s.cache_read(A, "local", [B])
    o, i = s[B].split(s[B].op.axis[0], M)
    s[AA].compute_at(s[B], o)
    s[B].bind(o, tvm.thread_axis("blockIdx.x"))

    # local memory usage: M * 4B
    # thread usage: M

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_local_memory_per_block=4 * M - 1,
                                max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
            (2, get_verify_pass(valid,
                                max_local_memory_per_block=4 * M,
                                max_threads_per_block=1))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
Code example #14
File: cuda_gemm_square.py Project: LANHUIYING/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Device %s" % device)
        f = tvm.build(s, [A, B, C], device)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        for i in range(2):
            f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)

        num_flops = 2 * nn * nn * nn
        num_runs = 10
        timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
        t = timer_f(a, b, c).mean
        GFLOPS = num_flops / (t * 1e3) / 1e6
        print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
Code example #15
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)

        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        else:
            raise NotImplementedError()

        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(lhs_nd, rhs_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
Code example #16
File: test_module_load.py Project: gwli/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        temp = util.tempdir()
        name = "myadd_%s" % device
        if sys.platform == "darwin" or sys.platform.startswith('linux'):
            f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name)
        elif sys.platform == "win32":
            f = tvm.build(s, [A, B], device, "llvm", name=name)
        else:
            raise ValueError("Unsupported platform")

        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)

        f1 = tvm.module.load(path_dso)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        f1(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        if sys.platform != "win32":
            f2 = tvm.module.system_lib()
            f2[name](a, b)
            np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Code example #17
File: onnx.py Project: bddppq/tvm
    def _impl_v1(cls, inputs, attr, params):
        if 'shape' in attr:
            return _op.reshape(inputs[0], attr['shape'])

        if get_name(inputs[1]) in params:
            shape = tuple(params[inputs[1].name_hint].asnumpy())
            out = _op.reshape(inputs[0], shape)
        else:
            # Try to infer the shape by precompute pruning if possible.
            # TODO: it would be good to check that the inputs are in params;
            #       to be enhanced when relay supports the list_input_names API of NNVM.
            logging.warning("Inferring Reshape argument by precompute")
            func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1])
            with tvm.relay.build_config(opt_level=0):
                graph, lib, params = tvm.relay.build(func, target="llvm", params=params)
            ctx = tvm.context("llvm", 0)
            from tvm.contrib import graph_runtime
            m = graph_runtime.create(graph, lib, ctx)
            m.set_input(**params)
            m.run()
            params_new = m.get_output(0)
            inputs.pop(1)
            out = _op.reshape(inputs[0], tuple(params_new.asnumpy().astype('int32').flatten()))

        return out
Code example #18
File: test_topi_broadcast.py Project: LANHUIYING/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__)
        if lhs_shape is None:
            lhs_npy = float(np.random.uniform(low=lhs_min, high=lhs_max))
            if dtype.startswith('int'):
                lhs_npy = int(lhs_npy)
            lhs_nd = lhs_npy
        else:
            lhs_npy = np.random.uniform(low=lhs_min, high=lhs_max,
                                        size=lhs_shape).astype(A.dtype)
            lhs_nd = tvm.nd.array(lhs_npy, ctx)

        if rhs_shape is None:
            rhs_npy = float(np.random.uniform(low=rhs_min, high=rhs_max))
            if dtype.startswith('int'):
                rhs_npy = int(rhs_npy)
            rhs_nd = rhs_npy
        else:
            rhs_npy = np.random.uniform(low=rhs_min, high=rhs_max,
                                        size=rhs_shape).astype(A.dtype)
            rhs_nd = tvm.nd.array(rhs_npy, ctx)

        out_npy = fnumpy(lhs_npy, rhs_npy)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
        foo(lhs_nd, rhs_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
Code example #19
File: test_pass_annotation.py Project: bddppq/tvm
    def annotated():
        conv2d_1 = relay.nn.conv2d(
            data1,
            weight,
            channels=64,
            kernel_size=(3, 3),
            padding=(1, 1))
        _conv2d_1 = relay.annotation.on_device(conv2d_1, dev2)
        conv2d_2 = relay.nn.conv2d(
            data2,
            weight,
            channels=64,
            kernel_size=(3, 3),
            padding=(1, 1))
        _conv2d_2 = relay.annotation.on_device(conv2d_2, dev2)
        add = relay.add(conv2d_1, conv2d_2)
        _add = relay.annotation.on_device(add, dev1)
        conv2d_3 = relay.nn.conv2d(
            add,
            weight,
            channels=64,
            kernel_size=(3, 3),
            padding=(1, 1))
        _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2)

        func = relay.Function([data1, data2, weight],
                              relay.Tuple(tvm.convert([_conv2d_1, _conv2d_2,
                                                       _conv2d_3, _add,
                                                       conv2d_3])))
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func,
                                                   tvm.context(3).device_type)
        func = relay.ir_pass.infer_type(func)
        return relay.Function(relay.ir_pass.free_vars(func.body[4]),
                              func.body[4])
Code example #20
File: config.py Project: bddppq/tvm
def ctx_list():
    """Get context list for testcases"""
    device_list = os.environ.get("NNVM_TEST_TARGETS", "")
    device_list = (device_list.split(",") if device_list
                   else ["llvm", "cuda"])
    device_list = set(device_list)
    res = [(device, tvm.context(device, 0)) for device in device_list]
    return [x for x in res if x[1].exist]
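A hedged usage sketch: drive a test loop over whatever contexts survive the filter above. The target list and print format are illustrative, not part of the original file.

import os

os.environ["NNVM_TEST_TARGETS"] = "llvm,cuda"  # optional override, read by ctx_list()
for device, ctx in ctx_list():
    # Only (device, context) pairs that exist on this machine are returned.
    print("testing on %s (device_type=%d)" % (device, ctx.device_type))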
Code example #21
File: test_ewise.py Project: gwli/tvm
 def check_device(device, host="stackvm"):
     if not tvm.module.enabled(host):
         return
     ctx = tvm.context(device, 0)
     if not ctx.exist:
         return
     fexp = tvm.build(s, [A, B],
                      device, host,
                      name="myexp")
     ctx = tvm.context(device, 0)
     # launch the kernel.
     n = 1024
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
     fexp(a, b)
     np.testing.assert_allclose(
         b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
Code example #22
File: test_pass_annotation.py Project: bddppq/tvm
    def test_fuse_log_add(device, tgt):
        """ Only log and add are fused."""
        fallback_device = tvm.context("cpu")
        target = {"cpu": "llvm", device: tgt}
        cpu_ctx = fallback_device
        dev_ctx = tvm.context(device)

        def annotated():
            add = relay.add(x, y)
            sqrt = relay.sqrt(add)
            _sqrt = relay.annotation.on_device(sqrt, dev_ctx)
            log = relay.log(add)
            subtract = relay.subtract(sqrt, log)
            exp = relay.exp(subtract)
            _exp = relay.annotation.on_device(exp, dev_ctx)

            func = relay.Function([x, y],
                                  relay.Tuple(tvm.convert([_sqrt, _exp, exp])))
            func = relay.ir_pass.infer_type(func)
            func = relay.ir_pass.rewrite_annotated_ops(func,
                                                       cpu_ctx.device_type)
            func = relay.ir_pass.infer_type(func)
            return relay.Function(relay.ir_pass.free_vars(func.body[2]),
                                  func.body[2])

        def expected():
            add = relay.add(x, y)
            copy_add_sqrt = relay.device_copy(add, cpu_ctx, dev_ctx)
            sqrt = relay.sqrt(copy_add_sqrt)
            log = relay.log(add)
            copy_sqrt_subtract = relay.device_copy(sqrt, dev_ctx, cpu_ctx)
            subtract = relay.subtract(copy_sqrt_subtract, log)
            copy_sub_exp = relay.device_copy(subtract, cpu_ctx, dev_ctx)
            exp = relay.exp(copy_sub_exp)

            func = relay.Function([x, y], exp)
            return func

        annotated_func = annotated()
        expected_func = expected()
        ctx = tvm.context(device, 0)
        dev_idx = ctx.device_type
        expected_index = [1, 1, 1, dev_idx, dev_idx, 1, 1, dev_idx, dev_idx]
        check_annotated_graph(annotated_func, expected_func)
        test_runtime(target, device, annotated_func, fallback_device,
                     expected_index)
Code example #23
def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
    if mod is None:
        mod = relay.Module()

    ctx = tvm.context("llvm", 0)
    intrp = create_executor(mod=mod, ctx=ctx, target="llvm")

    result = intrp.evaluate(expr)(*args)
    np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
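A hedged usage sketch of check_eval (the values are illustrative): evaluate a trivial Relay addition and compare against the expected NumPy result.

import numpy as np
from tvm import relay

x = relay.var("x", shape=(2,), dtype="float32")
func = relay.Function([x], relay.add(x, x))
check_eval(func, [np.array([1.0, 2.0], dtype="float32")],
           np.array([2.0, 4.0], dtype="float32"))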
Code example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target', type=str, required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal'],
                        help="Compilation target.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=1000, help="Number of iterations during the benchmark.")
    parser.add_argument('--repeat', type=int, default=1, help="Number of times to repeat the measurement.")
    args = parser.parse_args()
    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    if args.target == "cuda":
        unroll = 1400
    else:
        unroll = 128
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=unroll,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid device overheating
        if i + 1 != args.repeat:
            time.sleep(45)
Code example #25
File: test_ewise.py Project: bddppq/tvm
 def check_device(device, host="stackvm"):
     if not tvm.module.enabled(host):
         return
     ctx = tvm.context(device, 0)
     if not ctx.exist:
         return
     func = tvm.build(s, [A0, A1, C],
                      device, host,
                      name="multiple_cache_write")
     ctx = tvm.context(device, 0)
     # launch the kernel.
     n = 1024
     a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), ctx)
     a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     func(a0, a1, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()),
         rtol=1e-5)
Code example #26
File: test_topi_transform.py Project: LANHUIYING/tvm
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        target = topi.cpp.TEST_create_target(device)
        if device == "llvm":
            s = topi.cpp.generic.schedule_injective(target, [B])
        else:
            s = topi.cpp.cuda.schedule_injective(target, [B])
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B], device, name="transpose")
        data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
        out_npy = data_npy.transpose(axes)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
        foo(data_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
Code example #27
File: test_ewise_fpga.py Project: LANHUIYING/tvm
 def check_device(device, host="llvm"):
     if not tvm.module.enabled(host):
         return
     ctx = tvm.context(device, 0)
     if not ctx.exist:
         return
     fadd = tvm.build(s, [A, B, C, D],
                      device, host,
                      name="myadd")
     ctx = tvm.context(device, 0)
     # launch the kernel.
     n = 1024
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx)
     d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx)
     fadd(a, b, c, d)
     tvm.testing.assert_allclose(
         d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
Code example #28
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            # declare
            DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter,
                                                             (stride_h, stride_w),
                                                             padding_args,
                                                             (dilation, dilation),
                                                             in_layout,
                                                             out_layout, dtype)
            # TODO: add a scale_shift implementation for NCHWc and add a test here
            Relu = topi.nn.relu(DepthwiseConv2d)
            # schedule
            s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
            s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu)
        # build the kernels
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Relu], device)

        # Prepare pod type for test data closure
        input_shape = (batch, in_channel, in_height, in_width)
        filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            # correctness with scipy
            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
                input_np, filter_np, stride, padding)
            relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
            return (_transform_data(input_np, ic_block),
                    _transform_kernel(filter_np, oc_block),
                    _transform_data(depthwise_conv2d_scipy, oc_block),
                    _transform_data(relu_scipy, oc_block))

        # Get the test data
        (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data()

        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                                                     dtype=DepthwiseConv2d.dtype), ctx)
        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
        # launch kernel 2 (depthwise_conv2d + relu)
        f2(input_tvm, filter_tvm, relu_tvm)
        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
Code example #29
def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
    # TODO(tqchen) add more types once the schedule register is fixed.
    for target in ["llvm"]:
        ctx = tvm.context(target, 0)
        if not ctx.exist:
            return
        intrp = create_executor(mod=mod, ctx=ctx, target=target)
        result = intrp.evaluate(expr)(*args)
        # use tvm.testing which also set atol
        tvm.testing.assert_allclose(
            result.asnumpy(), expected_result, rtol=rtol)
Code example #30
File: run_model.py Project: LANHUIYING/tvm
def main():
    ctx = tvm.context('cpu', 0)
    model = tvm.module.load(osp.join(CWD, 'build', 'enclave.signed.so'))
    inp = tvm.nd.array(np.ones((1, 3, 224, 224), dtype='float32'), ctx)
    out = tvm.nd.array(np.empty((1, 1000), dtype='float32'), ctx)
    model(inp, out)
    if abs(out.asnumpy().sum() - 1) < 0.001:
        print('It works!')
    else:
        print('It doesn\'t work!')
        exit(1)
Code example #31
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        if default_schedule:
            device += " -libs=cudnn"
        print("Running on target: %s" % device)

        with tvm.target.create(device):
            # declare
            # (algo = 0)
            DepthwiseConv2d = topi.cuda.depthwise_conv2d.depthwise_conv2d_cuda(
                autotvm.get_config(),
                Input,
                Filter, (stride_w, stride_h), (1, 1),
                dilation=1,
                algo=cudnn_algo
            ) if default_schedule else topi.nn.depthwise_conv2d_nhwc(
                Input,
                Filter,
                stride=[stride_h, stride_w],
                padding=padding,
                dilation=1)
            # ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
            # Relu = topi.nn.relu(ScaleShift)

            s1 = topi.cuda.depthwise_conv2d.schedule_depthwise_conv2d_nchw_cuda(
                autotvm.get_config(), [DepthwiseConv2d]
            ) if default_schedule else schedule_depthwise_conv2d_nhwc_reuse(
                [DepthwiseConv2d], Input)
            # s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
            # s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift)
            # s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu)
            # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu)
        # build the kernels
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d],
                       device,
                       name="DepthwiseConv2d_%d_%d" % (in_height, in_width))
        # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Prepare pod type for test data closure
        dtype = Input.dtype
        input_shape = get_const_tuple(Input.shape)
        filter_shape = get_const_tuple(Filter.shape)
        # scale_shape = get_const_tuple(Scale.shape)
        # shift_shape = get_const_tuple(Shift.shape)
        # scale_shift_shape = get_const_tuple(ScaleShift.shape)

        # prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        # scale_tvm = tvm.nd.array(scale_np, ctx)
        # shift_tvm = tvm.nd.array(shift_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=dtype), ctx)
        # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=dtype), ctx)
        # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # launch kernel 2 (depthwise_conv2d + scale_shift)
        # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10)
        # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean
        # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
        # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10)
        # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
        # relu_scipy = np.maximum(scale_shift_scipy, 0)
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   output_np,
                                   rtol=1e-5)
        # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
        # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print(
            "Depthwise convolution: average running time is {:.2f} us.".format(
                tcost_1 * 1e6))
Code example #32
File: test_Alexnet.py Project: CyanHillFox/tvm
def test_Alexnet():
    def Conv(data, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            data = nnvm.symbol.pad(data=data,
                                   pad_width=((0, 0), (pad[0], pad[0]),
                                              (pad[1], pad[1]), (0, 0)))
        datas = nnvm.symbol.conv2d(data=data,
                                   kernel_size=kernel_size,
                                   channels=filter_nums,
                                   strides=stride,
                                   layout='NHWC',
                                   kernel_layout='HWOI')
        datas = nnvm.symbol.relu(data=datas)
        return datas

    def get_symbol(datas, num_classes):
        conv1 = Conv(data=datas,
                     kernel_size=(11, 11),
                     filter_nums=96,
                     stride=(4, 4))
        pool1 = nnvm.symbol.max_pool2d(data=conv1,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        conv2 = Conv(data=pool1,
                     kernel_size=(5, 5),
                     filter_nums=256,
                     pad=(2, 2))
        pool2 = nnvm.symbol.max_pool2d(data=conv2,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        conv3 = Conv(data=pool2,
                     kernel_size=(3, 3),
                     filter_nums=384,
                     pad=(1, 1))
        conv4 = Conv(data=conv3,
                     kernel_size=(3, 3),
                     filter_nums=384,
                     pad=(1, 1))
        conv5 = Conv(data=conv4,
                     kernel_size=(3, 3),
                     filter_nums=256,
                     pad=(1, 1))
        pool3 = nnvm.symbol.max_pool2d(data=conv5,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        datas = nnvm.symbol.flatten(data=pool3)
        fc1 = nnvm.symbol.dense(data=datas, units=1024)
        relu1 = nnvm.symbol.relu(data=fc1)
        drop1 = nnvm.symbol.dropout(data=relu1, rate=0.5)
        fc2 = nnvm.symbol.dense(data=drop1, units=1024)
        relu2 = nnvm.symbol.relu(data=fc2)
        drop2 = nnvm.symbol.dropout(data=relu2, rate=0.5)
        fc3 = nnvm.symbol.dense(data=drop2, units=16)
        symbol = nnvm.symbol.softmax(fc3)
        return symbol

    input_shape = (1, 128, 128, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.randint(size=input_shape, low=-32, high=32)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        # module.run()
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer().mean * 10)
Code example #33
def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter,
                                 kernel, stride, padding, activation_bits,
                                 weight_bits, unipolar):
    in_height = in_width = in_size
    input_type = 'uint32'
    out_dtype = 'int16'

    device = 'llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon'
    with tvm.target.create(device):
        A = te.placeholder((batch, in_height, in_width, in_channel),
                           dtype=input_type,
                           name='A')
        W = te.placeholder((kernel, kernel, in_channel, num_filter),
                           dtype=input_type,
                           name='W')
        B = topi.arm_cpu.bitserial_conv2d_nhwc(A, W, stride, padding,
                                               activation_bits, weight_bits,
                                               'uint8', out_dtype, unipolar)
        s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B])

    func = tvm.build(s, [A, W, B], device)

    assembly = func.get_source('asm')
    matches = re.findall("vpadal", assembly)
    assert (len(matches) > 0)
    matches = re.findall("vcnt", assembly)
    assert (len(matches) > 0)
    matches = re.findall("vpadd", assembly)
    assert (len(matches) > 0)

    ctx = tvm.context(device, 0)
    if 'arm' not in os.uname()[4]:
        print("Skipped running code, not an arm device")
        return

    print("Running on target: %s" % device)

    def get_ref_data():
        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits,
                                     input_type)
        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits,
                                     input_type)
        if unipolar:
            w_ = np.copy(w_np).astype(out_dtype)
            for x in np.nditer(w_, op_flags=['readwrite']):
                x[...] = 1 if x == 1 else -1
            b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride,
                                                   padding).astype(out_dtype)
        else:
            b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride,
                                                   padding).astype(out_dtype)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    func = tvm.build(s, [A, W, B], device)

    func(a, w, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
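generate_quantized_np is assumed to come from the surrounding test file; a minimal sketch of one plausible definition, producing uniform random integers that fit in the requested bit width:

def generate_quantized_np(shape, bits, out_dtype):
    # Values lie in [0, 2**bits), i.e. representable with `bits` bits.
    min_val = 0
    max_val = 1 << bits
    return np.random.randint(min_val, max_val, size=shape).astype(out_dtype)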
Code example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target',
                        type=str,
                        required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal'],
                        help="Compilation target.")
    parser.add_argument('--opt-level',
                        type=int,
                        default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter',
                        type=int,
                        default=1000,
                        help="Number of iterations during the benchmark.")
    parser.add_argument('--repeat',
                        type=int,
                        default=1,
                        help="Number of times to repeat the measurement.")
    args = parser.parse_args()
    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(batch_size=1,
                                                       image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid device overheating
        if i + 1 != args.repeat:
            time.sleep(45)
Code example #35
import tvm
import nnvm
from nnvm import compiler
from nnvm.frontend import from_mxnet
from tvm.contrib.download import download
from tvm.contrib import graph_runtime
from mxnet.model import load_checkpoint
from utils import save_tvm_params, save_tvm_graph
from load_deploy_model import load_mxnet_model
from time import time

tgt_host = "llvm"

# Change it to the respective GPU target if a GPU is enabled, e.g. cuda, opencl

tgt = "llvm"

ctx = tvm.context(tgt, 0)

target = 'llvm'

shapes = dict()

shapes['cls_prob'] = (1, 21, 5186)
shapes['loc_preds'] = (1, 20744)
shapes['anchor_boxes'] = (1, 5186, 4)

# load_nms is assumed to be a project-local helper that loads the saved MXNet NMS symbol.
net = load_nms('model/deploy_ssd_inceptionv3_512-nms-symbol')

net, params = nnvm.frontend.from_mxnet(net)

print("[*] Compile...")
Code example #36
File: tune_network_cuda.py Project: jiajuns/tvm
#################################################################
# Compile and Evaluate
# --------------------
# After auto-tuning, we can compile the network with the best schedules we found.
# All measurement records are dumped into the log file during auto-tuning,
# so we can read the log file and load the best schedules.

# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=params)

# Create graph runtime
ctx = tvm.context(str(target), 0)
module = graph_runtime.GraphModule(lib["default"](ctx))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module.set_input("data", data_tvm)

# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(prof_res), np.std(prof_res)))

#################################################################
# Other Tips
# ----------
# 1. During the tuning, the auto-scheduler needs to compile many programs and
Code example #37
def test_db_filter():
    logging.info("test db filter ...")

    # Pick a GPU target because there are more likely to be failures/invalid configs
    task, target = get_sample_task()

    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning(
            "Skip this test because there is no supported device for test")
        return

    batch_size = 2

    measure_option = autotvm.measure_option(mode='local-nofork', timeout=2)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    ct = 0
    all_inputs = list()
    all_results = list()
    batches = list()
    tuner = autotvm.tuner.RandomTuner(task)
    while ct < TRIAL_LIMIT:
        inputs = list()
        for i in range(batch_size):
            cfg = tuner.next_batch(1)[0]
            inputs.append((MeasureInput(target, task, cfg)))
            all_inputs.append(inputs[-1])
        batches.append(inputs)
        results = measure_batch(inputs)
        all_results += results
        ct += 1

    del measure_batch

    db = database.DummyDatabase()
    db.flush()

    # First setting, memoize one input at a time, check that each is saved and replayed
    measure_option = autotvm.measure_option(mode='local-nofork',
                                            timeout=2,
                                            replay_db=db)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    for i in range(len(all_inputs) + 1):
        db.flush()
        for j in range(i):
            db.save(all_inputs[j], all_results[j])

        for k in range(len(batches)):
            batch = batches[k]
            batch_result = measure_batch(batch)
            for l in range(batch_size):
                all_idx = k * batch_size + l
                assert batch_result[l] is not None
                if all_idx < i:
                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
                else:
                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MISMATCH, GOT MATCH"

    del measure_batch
Code example #38
File: tensorize.py Project: x00353666/tvm
    ll_path = temp.relpath("temp.ll")
    # Create LLVM ir from c source code
    ll_code = clang.create_llvm(cc_code, output=ll_path)
    return ll_code


######################################################################
# Now we leverage the pragma attribute :code:`import_llvm` to import the LLVM asm inline.
# The import needs to happen before the tensorized GEMV is executed.
#
s[C].pragma(x, "import_llvm", gemv_impl())
func = tvm.build(s, [A, B, C], target="llvm", name="gemv")

from topi.util import get_const_tuple
dtype = A.dtype
ctx = tvm.context("cpu", 0)
a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)

######################################################################
# We compare the tensorize version against what :code:`numpy.dot` produces
# to ensure our implementation is correct.
#
# Reduce-update for Tensorize
# ---------------------------
# Let's now move one step forward.
# Assume our accelerator can only multiply a vector by a square matrix,
# in which the vector size needs to be no larger than 16.
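The excerpt ends here. As a hedged sketch of where the tutorial is heading, a reduce-update tensor intrinsic for such an accelerator could be declared roughly as follows, using the old tvm.decl_tensor_intrin API; gemv_update and gemv_reset are assumed extern kernels, not functions defined in this excerpt:

def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        aa, bb = ins
        cc = outs[0]

        def _body():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("int32", "gemv_update",
                                    cc.access_ptr("w"), aa.access_ptr("r"),
                                    bb.access_ptr("r"), m, l, bb.strides[0]))
            return ib.get()

        def _reduce_reset():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m))
            return ib.get()

        # full compute, reduction init, and update step
        return _body(), _reduce_reset(), _body()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})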
Code example #39
def test_densenet():
    def Conv(datas, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            datas = nnvm.symbol.pad(data=datas,
                                    pad_width=((0, 0), (pad[0], pad[0]),
                                               (pad[1], pad[1]), (0, 0)))
        conv = nnvm.symbol.conv2d(data=datas,
                                  kernel_size=kernel_size,
                                  channels=filter_nums,
                                  strides=stride,
                                  layout='NHWC',
                                  kernel_layout='HWOI')
        return conv

    def bottleneck_layer(datas, filters):
        bn1 = nnvm.symbol.batch_norm(data=datas, epsilon=2e-5, axis=3)
        relu1 = nnvm.symbol.relu(data=bn1)
        conv1 = Conv(datas=relu1, kernel_size=(1, 1), filter_nums=4 * filters)
        bn2 = nnvm.symbol.batch_norm(data=conv1, epsilon=2e-5, axis=3)
        relu2 = nnvm.symbol.relu(data=bn2)
        conv2 = Conv(datas=relu2,
                     kernel_size=(3, 3),
                     filter_nums=filters,
                     pad=(1, 1))
        return conv2

    def transition_layer(datas, filters):
        conv = Conv(datas=datas, kernel_size=(1, 1), filter_nums=filters)

        pool = nnvm.symbol.avg_pool2d(data=conv,
                                      pool_size=(2, 2),
                                      strides=(2, 2),
                                      layout='NHWC')
        return pool

    def dense_block(datas, filters, layers):
        layers_concat = []
        layers_concat.append(datas)
        b_l = bottleneck_layer(datas, filters)

        layers_concat.append(b_l)
        for i in range(layers - 1):
            x = nnvm.symbol.concatenate(*layers_concat, axis=3)
            x = bottleneck_layer(x, filters)
            layers_concat.append(x)
        return x

    def get_symbol(datas, num_classes=16):
        x = Conv(datas, kernel_size=(7, 7), filter_nums=32, stride=(2, 2))

        x = nnvm.symbol.max_pool2d(x,
                                   pool_size=(3, 3),
                                   strides=(2, 2),
                                   layout='NHWC')

        b1 = dense_block(x, 32, 6)

        l1 = transition_layer(b1, 32)

        b2 = dense_block(l1, 32, 12)
        l2 = transition_layer(b2, 32)
        b3 = dense_block(l2, 32, 48)
        l3 = transition_layer(b3, 32)
        b4 = dense_block(l3, 32, 32)
        x = nnvm.symbol.global_avg_pool2d(data=b4, layout='NHWC')
        x = nnvm.symbol.flatten(data=x)
        fc = nnvm.symbol.dense(data=x, units=16)
        symbol = nnvm.symbol.softmax(data=fc)
        return symbol

    input_shape = (1, 229, 229, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context(
            "llvm", 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=input_shape)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        module.run()

        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer().mean)
Code example #40
import numpy
import tvm


def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None):
    def tvm_val_2_py_val(val):
        val = tvm.ir_pass.Substitute(val, var_dict)
        val = tvm.ir_pass.Simplify(val)
        assert isinstance(val, (tvm.expr.IntImm, tvm.expr.UIntImm))
        return val.value

    ctx = tvm.context(target, 0)
    op = None

    if sch is None:
        outs = func(*tuple(
            tvm.convert(i) if isinstance(i, list) else i for i in args))
        op = outs[0].op if isinstance(outs, list) else outs.op
        sch = tvm.create_schedule(op)
    else:
        assert outs is not None
        assert isinstance(outs, list)
        op = outs[0].op

    emu_args = []
    nd_args = []
    for i in args:
        if isinstance(i, tvm.tensor.Tensor):
            shape = [tvm_val_2_py_val(j) for j in i.shape]
            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
            nd_args.append(tvm.nd.array(emu_args[-1], ctx))
        elif isinstance(i, tvm.expr.Var):
            emu_args.append(tvm_val_2_py_val(i))
            nd_args.append(emu_args[-1])
        else:
            assert isinstance(i, list)
            emu_args.append(numpy.array(i))

    compile_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))] + \
                   (outs if isinstance(outs, list) else [outs])
    module = tvm.build(sch, compile_args, target=target)
    assert module

    out_tensors = []
    for i in range(op.num_outputs):
        output = op.output(i)
        shape = [tvm_val_2_py_val(j) for j in output.shape]
        nd_args.append(
            tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx))
        out_tensors.append(nd_args[-1])

    ref_data = func(*emu_args)
    if isinstance(ref_data, numpy.ndarray):
        ref_data = [ref_data]

    module(*nd_args)

    for nd, np in zip(out_tensors, ref_data):
        tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)

    module_args = [
        i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))
    ]
    module_outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    h_module = tvm.hybrid.build(sch, module_args, module_outs)

    return h_module, module_args, module_outs
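
# A hedged usage sketch, patterned on TVM's hybrid script tests: the
# ``outer_product`` kernel below is illustrative (it assumes the pre-0.7
# tvm.hybrid API) and is not part of the helper above.
@tvm.hybrid.script
def outer_product(n, m, a, b):
    c = output_tensor((n, m), a.dtype)
    for i in range(n):
        for j in range(m):
            c[i, j] = a[i] * b[j]
    return c

n = tvm.var('n')
m = tvm.var('m')
a = tvm.placeholder((n,), name='a')
b = tvm.placeholder((m,), name='b')
# Bind concrete values to the shape variables so the checker can allocate.
run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})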
Code example #41
import numpy as np

import tvm
from tvm import topi
import tvm.topi.testing
from tvm.topi.util import get_const_int, get_const_tuple
from tvm.topi.nn.bitserial_util import bitpack, binary_op_multiplier
from tvm.topi.nn import bitserial_dense
from tvm.topi.x86 import schedule_reduce

M = 1024
K = 1024
N = 1024

dtype = "uint32"

target = 'llvm -mcpu=core-avx2'
# target = 'llvm'
ctx = tvm.context(target, 0)

_bitserial_dense_implement = {
    "generic":
    (topi.nn.bitserial_dense, topi.generic.schedule_bitserial_dense),
    "cpu": (topi.x86.bitserial_dense, topi.x86.schedule_bitserial_dense),
    "arm_cpu":
    (topi.arm_cpu.bitserial_dense, topi.arm_cpu.schedule_bitserial_dense),
}

a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx)
c = tvm.nd.array(np.zeros((M, N), dtype='int16'), ctx)

# numpy matrix multiply baseline
np_repeat = 100
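
# The excerpt is cut off here; a hedged sketch of the numpy baseline the
# comment above announces (patterned on TVM's GEMM tutorial; note the
# baseline uses float32 operands, while the bitserial kernel packs uint32):
import timeit
np_runtime = timeit.timeit(
    setup='import numpy as np\n'
          'M, K, N = 1024, 1024, 1024\n'
          'a = np.random.rand(M, K).astype("float32")\n'
          'b = np.random.rand(K, N).astype("float32")\n',
    stmt='answer = np.dot(a, b)',
    number=np_repeat)
print('numpy baseline: %f s per run' % (np_runtime / np_repeat))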
Code example #42
File: conv2d_nchw_rasp.py Project: yzhliu/topi-intel
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel_size, kernel_size),
                            name='W')

        out_dtype = 'float32'
        wkl = _get_workload(A, W, stride, padding, out_dtype)
        sch = Im2ColPack(7, 8, 1, 8, True)

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        # device = 'llvm'
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):

            B = _im2col_pack(wkl, sch, A, W, stride, padding, out_dtype)
            s = tvm.create_schedule(B.op)
            traverse(s, B.op)

            op = B.op
            output = op.output(0)
            conv_out = op.input_tensors[0]
            kernel_vec = conv_out.op.input_tensors[1]
            kernel = kernel_vec.op.input_tensors[0]
            data_vec = conv_out.op.input_tensors[0]
            data_col = data_vec.op.input_tensors[0]
            data = data_col.op.input_tensors[0]
            data_pad = None
            if isinstance(data.op,
                          tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            _schedule_im2col_conv2d(wkl, sch, s, data, data_pad, data_col,
                                    data_vec, kernel, kernel_vec, conv_out,
                                    output, B)

            print(tvm.lower(s, [A, W, B], simple_mode=True))

            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             ctx)
            func = tvm.build(s, [A, W, B], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a, w, b).mean
            print('conv: %g secs/op' % cost)

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            print(b_np.shape)
Code example #43
File: test_pass_annotation.py Project: xchani/tvm
def run_unpropagatable_graph(dev, tgt):
    R""" The network is as following:
            a     b  c     d
             \   /    \   /
              add      mul
                \      /
                subtract
    """

    a = relay.var("a", shape=(10, 10))
    b = relay.var("b", shape=(10, 10))
    c = relay.var("c", shape=(10, 10))
    d = relay.var("d", shape=(10, 10))
    a_data = np.random.rand(10, 10).astype('float32')
    b_data = np.random.rand(10, 10).astype('float32')
    c_data = np.random.rand(10, 10).astype('float32')
    d_data = np.random.rand(10, 10).astype('float32')
    tmp_add = a_data + b_data
    tmp_mul = np.multiply(c_data, d_data)
    ref_res = np.subtract(tmp_add, tmp_mul)

    fallback_device = tvm.context("cpu")
    target = {"cpu": "llvm", dev: tgt}
    cpu_ctx = fallback_device
    dev_ctx = tvm.context(dev)

    def annotated():
        add = relay.add(a, b)
        _add = relay.annotation.on_device(add, dev_ctx)
        mul = relay.multiply(c, d)
        _mul = relay.annotation.on_device(mul, cpu_ctx)
        sub = relay.subtract(_add, _mul)
        _sub = relay.annotation.on_device(sub, dev_ctx)
        func = relay.Function([a, b, c, d], _sub)
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type)
        return func

    def expected():
        add = relay.add(a, b)
        mul = relay.multiply(c, d)
        copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx)
        sub = relay.subtract(add, copy_mul_sub)
        func = relay.Function([a, b, c, d], sub)
        return func

    annotated_func = annotated()
    expected_func = expected()
    expected_index = [2, 2, 2, 1, 1, 1, 2, 2]
    check_annotated_graph(annotated_func, expected_func)
    params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data}
    config = {"opt_level": 0}
    config["fallback_device"] = fallback_device
    with relay.build_config(**config):
        graph, lib, params = relay.build(annotated_func, target, params=params)
        contexts = [tvm.cpu(0), tvm.context(dev)]
        graph_json = json.loads(graph)
        if "device_index" in graph_json["attrs"]:
            device_index = graph_json["attrs"]["device_index"][1]
            assert device_index == expected_index
        mod = graph_runtime.create(graph, lib, contexts)
        mod.set_input(**params)
        mod.run()
        res = mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
Code example #44
    def check_device(device, target_device):
        if not tvm.module.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4, )

        # Insert copy nodes for data transfer between the add and sub nodes.
        # Transfers data from gpu to cpu.
        copy_add_sub = tvm.placeholder(shape, name="__copy0")
        # Transfers data from cpu to gpu.
        copy_sub_add = tvm.placeholder(shape, name="__copy1")

        # Create a module containing adds on the device.
        tensor_a = tvm.placeholder(shape, name="A")
        tensor_b = tvm.placeholder(shape, name="B")
        tensor_d = tvm.placeholder(shape, name="D")
        elemwise_add0 = tvm.compute(shape,
                                    lambda *i: tensor_a(*i) + tensor_b(*i),
                                    name="elemwise_add0")
        elemwise_add1 = tvm.compute(shape,
                                    lambda *i: copy_sub_add(*i) + tensor_d(*i),
                                    name="elemwise_add1")
        target = topi.cpp.TEST_create_target(device)
        add_schedule0 = topi.cpp.cuda.schedule_injective(
            target, [elemwise_add0])
        lower_add0 = tvm.lower(add_schedule0,
                               [tensor_a, tensor_b, elemwise_add0],
                               name="elemwise_add0")
        add_schedule1 = topi.cpp.cuda.schedule_injective(
            target, [elemwise_add1])
        lower_add1 = tvm.lower(add_schedule1,
                               [tensor_d, copy_sub_add, elemwise_add1],
                               name="elemwise_add1")
        # Create module for sub whose target is the host.
        tensor_c = tvm.placeholder(shape, name="C")
        elemwise_sub = tvm.compute(shape,
                                   lambda *i: copy_add_sub(*i) - tensor_c(*i),
                                   name="elemwise_sub")
        sub_schedule = tvm.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(sub_schedule,
                              [copy_add_sub, tensor_c, elemwise_sub],
                              name="elemwise_sub")

        target_flist = {
            target_device: [lower_add0, lower_add1],
            target_host: [lower_sub]
        }
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        params = {}
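        # The placeholder names (tensor_a .. tensor_d) are rebound below to
        # the corresponding numpy input arrays; the placeholders themselves
        # are no longer needed once the modules have been built.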
        params["A"] = tensor_a = np.random.uniform(size=shape).astype(
            tensor_a.dtype)
        params["B"] = tensor_b = np.random.uniform(size=shape).astype(
            tensor_b.dtype)
        params["C"] = tensor_c = np.random.uniform(size=shape).astype(
            tensor_c.dtype)
        params["D"] = tensor_d = np.random.uniform(size=shape).astype(
            tensor_d.dtype)

        def check_verify():
            mod = graph_runtime.create(graph, mhost, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(),
                                    tensor_a + tensor_b - tensor_c + tensor_d)

        def check_load_module():
            temp = util.tempdir()
            path_lib = temp.relpath("deploy.so")
            mhost.export_library(path_lib)
            with open(temp.relpath("deploy.json"), "w") as out_file:
                out_file.write(graph)
            loaded_lib = tvm.module.load(path_lib)
            loaded_graph = open(temp.relpath("deploy.json")).read()
            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(),
                                    tensor_a + tensor_b - tensor_c + tensor_d)

        check_verify()
        check_load_module()
Code example #45
File: test_pass_annotation.py Project: xchani/tvm
def test_conv_network():
    R""" The network is as following:
             data1     data2
               |         |
             conv2d    conv2d
                \       /
                   add
                    |
                  conv2d
    """
    batch_size = 1
    dshape = (batch_size, 64, 56, 56)
    weight = relay.var("weight", shape=(64, 64, 3, 3))
    data1 = relay.var("data1", shape=dshape)
    data2 = relay.var("data2", shape=dshape)
    dev1 = tvm.context(1)
    dev2 = tvm.context(2)

    def original():
        conv2d_1 = relay.nn.conv2d(data1,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        conv2d_2 = relay.nn.conv2d(data2,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        add = relay.add(conv2d_1, conv2d_2)
        conv2d_3 = relay.nn.conv2d(add,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))

        func = relay.Function([data1, data2, weight], conv2d_3)
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func,
                                                   tvm.context(3).device_type)
        return func

    def annotated():
        conv2d_1 = relay.nn.conv2d(data1,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        _conv2d_1 = relay.annotation.on_device(conv2d_1, dev2)
        conv2d_2 = relay.nn.conv2d(data2,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        _conv2d_2 = relay.annotation.on_device(conv2d_2, dev2)
        add = relay.add(_conv2d_1, _conv2d_2)
        _add = relay.annotation.on_device(add, dev1)
        conv2d_3 = relay.nn.conv2d(_add,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2)

        func = relay.Function([data1, data2, weight], _conv2d_3)
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.rewrite_annotated_ops(func,
                                                   tvm.context(3).device_type)
        return func

    class ScheduleConv2d(ExprMutator):
        def __init__(self, device):
            self.device = device
            super().__init__()

        def visit_call(self, expr):
            visit = super().visit_call(expr)
            if expr.op == tvm.relay.op.get("nn.conv2d"):
                return relay.annotation.on_device(visit, self.device)
            else:
                return visit

    def annotate_with_visitor(func):
        sched = ScheduleConv2d(dev2)
        func = sched.visit(func)
        func = relay.ir_pass.rewrite_annotated_ops(func, dev1.device_type)
        return func

    def expected():
        conv2d_1 = relay.nn.conv2d(data1,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        device_copy1 = relay.device_copy(conv2d_1, dev2, dev1)
        conv2d_2 = relay.nn.conv2d(data2,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))
        device_copy2 = relay.device_copy(conv2d_2, dev2, dev1)
        add = relay.add(device_copy1, device_copy2)
        device_copy3 = relay.device_copy(add, dev1, dev2)
        conv2d_3 = relay.nn.conv2d(device_copy3,
                                   weight,
                                   channels=64,
                                   kernel_size=(3, 3),
                                   padding=(1, 1))

        func = relay.Function([data1, data2, weight], conv2d_3)
        return func

    def check_storage_and_device_types():
        func = annotated()
        func = relay.ir_pass.rewrite_annotated_ops(func, 3)
        func = relay.ir_pass.infer_type(func)
        func = relay.ir_pass.fuse_ops(func, opt_level=2)
        func = relay.ir_pass.infer_type(func)
        smap = relay.backend._backend.GraphPlanMemory(func)
        storage_ids = []
        device_types = []
        for _, storage_dev_type in smap.items():
            assert len(storage_dev_type) == 2
            for sid in storage_dev_type[0]:
                storage_ids.append(sid.value)
            for did in storage_dev_type[1]:
                device_types.append(did.value)
        assert len(storage_ids) == 10
        assert len(set(storage_ids)) == 8
        assert len(set(device_types)) == 2
        assert set(device_types) == {1, 2}

    def test_manual_annotation():
        annotated_func = annotated()
        expected_func = expected()
        check_annotated_graph(annotated_func, expected_func)
        check_storage_and_device_types()

    def test_visitor_annotation():
        annotated_func = annotate_with_visitor(original())
        expected_func = expected()
        check_annotated_graph(annotated_func, expected_func)

    test_manual_annotation()
    test_visitor_annotation()
Code example #46
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        task = autotvm.task.create(schedule_depthwise_conv2d_nhwc_reuse_auto,
                                   args=(batch, in_channel, in_size,
                                         channel_multiplier, kernel, stride),
                                   target="cuda")
        print(task)
        print(task.config_space)

        # logging config (for printing tuning log to the screen)
        logging.getLogger('autotvm').setLevel(logging.DEBUG)
        logging.getLogger('autotvm').addHandler(
            logging.StreamHandler(sys.stdout))

        # There are two steps for measuring a config: build and run.
        # By default, all CPU cores are used to compile the program; the
        # configs are then measured sequentially. Each config is measured
        # 10 times and averaged to reduce variance.
        measure_option = autotvm.measure_option(
            builder='local', runner=autotvm.LocalRunner(number=10))

        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(n_trial=25,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.log_to_file(
                           'depthwise_conv2d_nhwc_{}.log'.format(in_size))
                   ])

        with autotvm.apply_history_best(
                'depthwise_conv2d_nhwc_{}.log'.format(in_size)):
            with tvm.target.create(device):
                s1, [Input, Filter, DepthwiseConv2d
                     ] = schedule_depthwise_conv2d_nhwc_reuse_auto(
                         batch, in_channel, in_size, channel_multiplier,
                         kernel, stride)
                # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu)
                # build the kernels
                f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d],
                               device,
                               name="ddd%dddd" % in_size)
                # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
                # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Prepare pod type for test data closure
        dtype = Input.dtype
        input_shape = get_const_tuple(Input.shape)
        filter_shape = get_const_tuple(Filter.shape)
        # scale_shape = get_const_tuple(Scale.shape)
        # shift_shape = get_const_tuple(Shift.shape)
        # scale_shift_shape = get_const_tuple(ScaleShift.shape)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            # scale_np = np.random.uniform(size=scale_shape).astype(dtype)
            # shift_np = np.random.uniform(size=shift_shape).astype(dtype)
            # correctness with scipy
            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc(
                input_np,
                filter_np,
                stride=[stride_h, stride_w],
                padding=padding)
            # scale_shift_scipy = np.zeros(shape=scale_shift_shape)
            # for c in range(in_channel * channel_multiplier):
            #     scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c]
            #     relu_scipy = np.maximum(scale_shift_scipy, 0)
            # return (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy)
            return (input_np, filter_np, depthwise_conv2d_scipy)

        # Get the test data
        # (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data()
        (input_np, filter_np, depthwise_conv2d_scipy) = get_ref_data()

        # prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        # scale_tvm = tvm.nd.array(scale_np, ctx)
        # shift_tvm = tvm.nd.array(shift_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx)
        # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=10)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # launch kernel 2 (depthwise_conv2d + scale_shift)
        # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10)
        # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean
        # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
        # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10)
        # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
        # relu_scipy = np.maximum(scale_shift_scipy, 0)
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   depthwise_conv2d_scipy,
                                   rtol=1e-5)
        # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
        # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print(
            "Depthwise convolution: average running time is {:.2f} us.".format(
                tcost_1 * 1e6))
Code example #47
import numpy as np

import tvm
from tvm.relay import create_executor


def check_eval(expr, expected_result, mod=None, rtol=1e-07):
    ctx = tvm.context("llvm", 0)
    intrp = create_executor(mod=mod, ctx=ctx, target="llvm")

    result = intrp.evaluate(expr)
    np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol)
Code example #48
    check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32")

    for in_dtype in ["float32", "float64"]:
        data = np.random.randn(10, 10).astype(in_dtype)
        check_cumsum(np.cumsum(data), data)
        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
        check_cumsum(np.cumsum(data, axis=1), data, axis=1)

        data = np.random.randn(10, 5, 10).astype(in_dtype)
        check_cumsum(np.cumsum(data), data)
        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
        check_cumsum(np.cumsum(data, axis=1), data, axis=1)
        check_cumsum(np.cumsum(data, axis=-1), data, axis=-1)

    for in_dtype in ["int32", "int64"]:
        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
        check_cumsum(np.cumsum(data, dtype=in_dtype), data)
        check_cumsum(np.cumsum(data), data, dtype="int64")
        check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0)
        check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1)

        data = np.random.randint(1 << 30, (1 << 31) - 1,
                                 size=(100)).astype(in_dtype)
        check_cumsum(np.cumsum(data), data, dtype="int64")


if __name__ == "__main__":
    test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
    test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
    test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
Code example #49
File: test_pass_annotation.py Project: jiajuns/tvm
def test_propogation():
    R""" The network and device type is as following:
                  x           1
                  |
                 log          1
                /   \
              log2 log10      2
                \   /
                 add          2
                  |
                 tan          1
    """
    ctx1 = tvm.context(1)
    ctx2 = tvm.context(2)

    expected_dev_type = {
        "log": ctx1,
        "log2": ctx2,
        "log10": ctx2,
        "add": ctx2,
        "tan": ctx1
    }

    x = relay.var("x", shape=(3, ))

    def annotated():
        log = relay.log(x)
        _log = relay.annotation.on_device(log, expected_dev_type["log"])
        log2 = relay.log2(_log)
        _log2 = relay.annotation.on_device(log2, expected_dev_type["log2"])
        log10 = relay.log10(_log)
        _log10 = relay.annotation.on_device(log10, expected_dev_type["log10"])
        add = relay.add(_log2, _log10)
        _add = relay.annotation.on_device(add, expected_dev_type["add"])
        tan = relay.tan(_add)
        _tan = relay.annotation.on_device(tan, expected_dev_type["tan"])

        func = run_opt_pass(_tan,
                            transform.RewriteAnnotatedOps(ctx1.device_type))
        return func

    def expected():
        log = relay.log(x)
        _log_left = relay.device_copy(log, ctx1, ctx2)
        _log_right = relay.device_copy(log, ctx1, ctx2)
        log2 = relay.log2(_log_left)
        log10 = relay.log10(_log_right)
        add = relay.add(log2, log10)
        _add = relay.device_copy(add, ctx2, ctx1)
        tan = relay.tan(_add)

        func = run_opt_pass(tan, transform.InferType())
        return func

    annotated_expr = annotated()
    expected_expr = expected()
    assert tvm.ir.structural_equal(annotated_expr, expected_expr)

    smap = relay.backend._backend.GraphPlanMemory(annotated_expr)
    for expr, storage_dev_type in smap.items():
        # x resides on ctx1 because the output is on ctx1
        if isinstance(expr, tvm.relay.expr.Var):
            assert storage_dev_type[1][0] == ctx1.device_type
        else:
            # a device_copy op's storage should be on its dst_dev_type
            if isinstance(expr.attrs, tvm.relay.op.op_attrs.DeviceCopyAttrs):
                assert storage_dev_type[1][0] == expr.attrs.dst_dev_type
            else:
                assert storage_dev_type[1][0] == expected_dev_type[
                    expr.op.name].device_type
Code example #50
        A = tvm.placeholder((rA, cA), dtype='float16')
        B = tvm.placeholder((rB, cB), dtype='float16')
        C = tvm.placeholder((rA, rB), dtype='float32')
        #C = tvm.placeholder((rA,rB),dtype = 'float16')
        assert (cA == cB)
        D = tvm.compute(
            (rA // (16 * block_tiles), rB // (16 * block_tiles),
             num_thread * block_warp, 1),
            lambda i, j, w, warp: (
                A[i * 16 * block_tiles +
                  (w // num_thread // block_row_warp) * 16 * warp_col_tile, 0] +
                B[j * 16 * block_tiles +
                  (w // num_thread % block_row_warp) * 16 * warp_row_tile, 0] +
                C[i * 16 * block_tiles +
                  (w // num_thread // block_row_warp) * 16 * warp_col_tile,
                  j * 16 * block_tiles +
                  (w // num_thread % block_row_warp) * 16 * warp_row_tile
                  ].astype('float16')))
        s = schedule_gemm_fp16()
        print(tvm.lower(s, [A, B, C], name="matrix_dot", simple_mode=True))

        f = tvm.build(s, [A, B, C], target='cuda', name='gemm_fp16')
        print("build finished")

        ctx = tvm.context('cuda', 4)

        a_np = np.float16(np.random.uniform(0., 1., size=(rA, cA)))
        b_np = np.float16(np.random.uniform(0., 1., size=(rB, cB)))
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((rA, rB), dtype=C.dtype), ctx)
        #ci = tvm.nd.array(np.ones((rA//16, rB//16,16,16), dtype=C.dtype), ctx)
        f(a, b, c)
        ss = c.asnumpy()


        np.testing.assert_allclose(c.asnumpy(),
                                   np.dot(np.float32(a_np),
                                          np.float32(np.transpose(b_np))),
                                   rtol=1e-3)
Code example #51
    def verify(target="llvm -mcpu=skylake-avx512"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return

        ctx = tvm.context(target, 0)
        X = tvm.placeholder((m, k), name='X', dtype="uint8")
        W = tvm.placeholder((n, k), name='W', dtype="int8")
        pc = dot_16x1x16_int8_int8_int16()
        ak = tvm.reduce_axis((0, k), name='k')

        packedW = tvm.placeholder((n // 128, 128 * (k // 2), 2),
                                  name='packedW',
                                  dtype="int8")
        t_fc = tvm.compute(
            (m, n),
            lambda i, j: tvm.sum(X[i, ak].astype("int16") * packedW[j // 128, (
                ak // 2) * 128 + j % 128, ak % 2].astype("int16"),
                                 axis=ak),
            name="F")

        t_sch = tvm.create_schedule(t_fc.op)
        a_x, a_y = t_fc.op.axis
        a_k, = t_fc.op.reduce_axis

        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128)
        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2)

        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128)
        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32)
        t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki)

        t_sch[t_fc].tensorize(a_yi, pc)
        # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True))
        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
        t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10)

        # generate the plain data
        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")

        packW = np.random.uniform(1, 10, size=(n // 128, 128 * (k // 2),
                                               2)).astype("int8")
        # This occurs in the pre_compute stage
        for r_idx in range(n // 128):
            for s_idx in range(128 * (k // 2)):
                for t_idx in range(2):
                    packW[r_idx][s_idx][t_idx] = b_[r_idx * 128 + s_idx %
                                                    128][s_idx // 128 * 2 +
                                                         t_idx]

        x = tvm.nd.array(a_, ctx)
        w = tvm.nd.array(packW, ctx)
        y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx)

        result = t_evaluator(x, w, y)
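        # gops_per_mm and peak are defined outside this excerpt; gops_per_mm
        # is presumably the GEMM op count (2 * m * n * k multiply-adds).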
        gops_per_sec = gops_per_mm / result.mean / 1e9
        tvm.testing.assert_allclose(y.asnumpy(), np.dot(a_, b_.T), rtol=1e-5)
        print(
            'Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, efficiency: {:.2f}.'
            .format(result.mean * 1000, gops_per_sec, gops_per_sec / peak))
        t_func.export_library("gemm_tensorize.o")
Code example #52
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        task = autotvm.task.create(schedule_conv2d_nhwc_auto,
                                   args=(batch, in_channel, in_size,
                                         num_filter, kernel, stride),
                                   target="cuda")
        print(task.config_space)

        # logging config (for printing tuning log to the screen)
        logging.getLogger('autotvm').setLevel(logging.DEBUG)
        logging.getLogger('autotvm').addHandler(
            logging.StreamHandler(sys.stdout))

        # There are two steps for measuring a config: build and run.
        # By default, all CPU cores are used to compile the program; the
        # configs are then measured sequentially. Each config is measured
        # 10 times and averaged to reduce variance.
        measure_option = autotvm.measure_option(
            builder='local', runner=autotvm.LocalRunner(number=10))

        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(n_trial=25,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.log_to_file(
                           'conv2d_nhwc_{}.log'.format(in_size))
                   ])

        with autotvm.apply_history_best('conv2d_nhwc_{}.log'.format(in_size)):
            with tvm.target.create(device):
                s, [A, W,
                    B] = schedule_conv2d_nhwc_auto(batch, in_channel, in_size,
                                                   num_filter, kernel, stride)
                func = tvm.build(s, [A, W, B],
                                 device,
                                 name=("ddd%dddd" % in_size))

        @memoize("verify_nhwc")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding)
            return a_np, w_np, b_np

        a_np, w_np, b_np = get_ref_data()

        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(b_np.shape), dtype=dtype),
                         ctx)

        func(a, w, b)
        timer_1 = func.time_evaluator(func.entry_name, ctx, number=10)
        tcost_1 = timer_1(a, w, b).mean
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        print("1x1 convolution: average running time is {:.2f} us.".format(
            tcost_1 * 1e6))
Code example #53
import tvm
import tvm.testing
from tvm import relay


def check_mod(mod, x_np, res_np):
    target = "vulkan"
    ctx = tvm.context(target, 0)
    ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target)
    res = ex.evaluate()(x_np).asnumpy()
    tvm.testing.assert_allclose(res, res_np, atol=1e-5)
Code example #54
def test_simplex_data_transferring():
    r"""
    Test the heterogeneous execution of a simple network where data
    is transferred from the target device to the host processor at runtime.
    The host processor is always assumed to be the CPU, and the device varies.
    """
    host = "cpu"
    target_host = "llvm"
    host_ctx = tvm.context(host)
    if not tvm.module.enabled(target_host):
        print("Skip test because llvm is not enabled.")
        return

    def check_device(device, target_device):
        if not tvm.module.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4, )

        # Create module for add whose target is the device.
        tensor_a = tvm.placeholder(shape, name="A")
        tensor_b = tvm.placeholder(shape, name="B")
        elemwise_add = tvm.compute(shape,
                                   lambda *i: tensor_a(*i) + tensor_b(*i),
                                   name="elemwise_add")
        target = topi.cpp.TEST_create_target(device)
        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
                              name="elemwise_add")

        # Insert copy. Neither compute nor schedule is required for the copy
        # node. The computation happens at runtime and is simply a data copy
        # from the input to the output.
        tensor_copy = tvm.placeholder(shape, name="__copy")

        # Create module for sub whose target is the host.
        tensor_c = tvm.placeholder(shape, name="C")
        elemwise_sub = tvm.compute(shape,
                                   lambda *i: tensor_copy(*i) - tensor_c(*i),
                                   name="elemwise_sub")
        schedule_sub = tvm.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(schedule_sub,
                              [tensor_copy, tensor_c, elemwise_sub],
                              name="elemwise_sub")

        target_flist = {target_device: [lower_add], target_host: [lower_sub]}
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        mod = graph_runtime.create(graph, mhost, ctx)
        params = {}
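        # As in the duplex test above, the placeholder names are rebound to
        # the corresponding numpy input arrays once the modules are built.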
        params["A"] = tensor_a = np.random.uniform(size=shape).astype(
            tensor_a.dtype)
        params["B"] = tensor_b = np.random.uniform(size=shape).astype(
            tensor_b.dtype)
        params["C"] = tensor_c = np.random.uniform(size=shape).astype(
            tensor_c.dtype)
        mod.set_input(**params)
        mod.run()
        out = mod.get_output(0, tvm.nd.empty(shape))
        np.testing.assert_equal(out.asnumpy(),
                                (tensor_a + tensor_b) - tensor_c)

    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
    for device, target in dev_tar.items():
        with tvm.target.create(device):
            check_device(device, target)
Code example #55
                        "--print_loss_val_each_epoch",
                        help="Print loss value at the end of each epoch",
                        action="store_true")
    args = parser.parse_args()

    models = []
    executor_ctx = None
    print_loss_val_each_epoch = False
    if args.model == "logreg":
        models = [mnist_logreg]
    elif args.model == "mlp":
        models = [mnist_mlp]
    elif args.model == "all":
        models = [mnist_logreg, mnist_mlp]

    if args.executor_context == "cpu":
        tgt = "llvm"
        tgt_host = "llvm"
    elif args.executor_context == "gpu":
        tgt = "cuda"
        tgt_host = "llvm"
        assert False, "cuda codegen not ready"
    # create context object
    executor_ctx = tvm.context(tgt, 0)

    print_loss_val_each_epoch = bool(args.print_loss_val_each_epoch)
    num_epochs = args.num_epoch
    for m in models:
        m(executor_ctx, num_epochs, print_loss_val_each_epoch)
Code example #56
def compare_tflite_with_tvm(in_data,
                            in_name,
                            input_tensors,
                            output_tensors,
                            init_global_variables=False,
                            out_names=None,
                            quantized=False):
    """Generic function to generate and compare TFLite and TVM output"""
    in_data = convert_to_list(in_data)
    in_name = convert_to_list(in_name)
    out_names = convert_to_list(out_names)
    in_node = [0] * len(in_name)
    for i in range(len(in_name)):
        in_node[i] = in_name[i].split(
            ':')[0] if ":" in in_name[i] else in_name[i]

    with tf.Session() as sess:
        if init_global_variables:
            sess.run(variables.global_variables_initializer())
        # convert to tflite model
        converter = interpreter_wrapper.TFLiteConverter.from_session(
            sess, input_tensors, output_tensors)

        if quantized:
            converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
            input_arrays = converter.get_input_arrays()
            input_stats = {}
            # hardcode the mean_values and std_dev_values (m,s) to be the same for all inputs
            # s = 255/(fmax-fmin);  m = -fmin*s (the zero point)
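            # For example, the hardcoded (m, s) = (128., 1.275) below implies
            # fmin = -m/s ≈ -100.4 and fmax = fmin + 255/s ≈ +99.6.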
            for i in input_arrays:
                input_stats[i] = (128., 1.275)
            converter.quantized_input_stats = input_stats

        tflite_model_buffer = converter.convert()
        tflite_output = run_tflite_graph(tflite_model_buffer, in_data)

        for device in ["llvm"]:
            ctx = tvm.context(device, 0)
            if not ctx.exist:
                print("Skip because %s is not enabled" % device)
                continue

            tvm_output = run_tvm_graph(tflite_model_buffer,
                                       in_data,
                                       in_node,
                                       target=device,
                                       num_output=len(out_names),
                                       out_names=out_names)
            if quantized:
                for i in range(len(tflite_output)):
                    # allow absolute tolerance of 1 in the quantized results
                    tvm.testing.assert_allclose(tflite_output[i],
                                                tvm_output[i],
                                                atol=1,
                                                rtol=1e-5)
            else:
                for i in range(len(tflite_output)):
                    tvm.testing.assert_allclose(tflite_output[i],
                                                tvm_output[i],
                                                atol=1e-5,
                                                rtol=1e-5)
Code example #57
def test_num_thread():
    N = 1024
    M = 128

    A = te.placeholder((N, ), name='A', dtype='float32')
    B = te.compute((N, ), lambda i: A[i], name='B')

    s = te.create_schedule([B.op])
    o, i = s[B].split(s[B].op.axis[0], M)

    s[B].bind(o, te.thread_axis('threadIdx.x'))
    s[B].bind(i, te.thread_axis("threadIdx.y"))

    # shared memory usage: 0
    # thread usage: N

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue

        valid = [None]
        with tvm.transform.PassContext(
                config={
                    "tir.add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N - 1))]
                }):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.transform.PassContext(
                config={
                    "tir.add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N))]
                }):
            tvm.build(s, [A, B], target)
        assert valid[0]

        with tvm.transform.PassContext(
                config={
                    "tir.add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N,
                                        max_thread_y=M - 1))]
                }):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.transform.PassContext(
                config={
                    "tir.add_lower_pass": [(
                        2,
                        get_verify_pass(valid,
                                        max_shared_memory_per_block=0,
                                        max_threads_per_block=N,
                                        max_thread_y=M))]
                }):
            tvm.build(s, [A, B], target)
        assert valid[0]
Code example #58
def search_common(
        task=None,
        target="llvm",
        search_policy="sketch",
        runner="local",
        num_measure_trials=100,
        cost_model=auto_scheduler.RandomModel(),
        init_search_callbacks=None,
):
    if task is None:
        task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test,
                                         args=(64, 64, 64),
                                         target=target)
    target = task.target

    print("Test search policy '%s' for '%s'" % (search_policy, target))

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        init_search_callbacks = init_search_callbacks or []
        init_search_callbacks.append(
            auto_scheduler.PreloadMeasuredStates(log_file))

        if search_policy == "empty":
            search_policy = auto_scheduler.EmptyPolicy(task)
        elif search_policy == "sketch":
            search_policy = auto_scheduler.SketchPolicy(
                task,
                program_cost_model=cost_model,
                init_search_callbacks=init_search_callbacks)
        else:
            raise ValueError("Invalid policy: " + search_policy)

        # Tune
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            num_measures_per_round=2,
            early_stopping=1,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
                CustomMeasureCallback()
            ],
        )
        task.tune(tuning_options=tuning_options, search_policy=search_policy)

        # Compile with the best schedule
        sch, args = task.apply_best(log_file)
        mod = tvm.build(sch, args, target)

        # Compile with naive schedule for correctness check
        sch, args = task.compute_dag.apply_steps_from_state(
            task.compute_dag.init_state)
        mod_ref = tvm.build(sch, args, "llvm")

        ctx = tvm.context(str(target), 0)
        np_arrays = [
            np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
            for x in args
        ]

        tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays]
        mod(*tvm_arrays)
        actual = [x.asnumpy() for x in tvm_arrays]

        tvm_arrays = [tvm.nd.array(x) for x in np_arrays]
        mod_ref(*tvm_arrays)
        expected = [x.asnumpy() for x in tvm_arrays]

        for x, y in zip(actual, expected):
            tvm.testing.assert_allclose(x, y, rtol=1e-5)
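
# A hedged usage sketch: the surrounding tests presumably drive
# search_common with small budgets, along these lines.
if __name__ == "__main__":
    search_common(num_measure_trials=4)
    search_common(search_policy="empty", num_measure_trials=2)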
Code example #59
File: conv2d_nchw_rasp.py Project: yzhliu/topi-intel
    def check_device():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')

        out_dtype = 'float32'
        wkl, sch_default = _spatial_get_sch(A, W, stride, padding, out_dtype)
        sch = sch_default if schedule is None else schedule

        a_shape = get_const_tuple(A.shape)
        w_shape = get_const_tuple(W.shape)

        dtype = A.dtype

        @memoize("topi.tests.test_topi_conv2d.verify_conv2d_nchw")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
            c_np = np.maximum(b_np, 0)
            return a_np, w_np, b_np, c_np

        a_np, w_np, b_np, c_np = get_ref_data()
        # device = 'llvm'
        device = 'llvm -mcpu=skylake-avx512'
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)

        with tvm.build_config(auto_unroll_max_step=1400,
                              unroll_explicit=(device != "cuda")):
            print('--- schedule data packing ---')
            A_vec, s = _spatial_pack_data_only(wkl, sch, A)
            print(A_vec.shape)
            a_vec_shape = get_const_tuple(A_vec.shape)
            a_vec = tvm.nd.array(np.zeros(a_vec_shape, dtype=dtype), ctx)
            print(tvm.lower(s, [A, A_vec], simple_mode=True))
            func = tvm.build(s, [A, A_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a, a_vec).mean
            print('data -> data_vec: %g secs/op' % cost)

            print('--- schedule kernel packing ---')
            W_vec, s = _spatial_pack_kernel_only(wkl, sch, W)
            print(W_vec.shape)
            w_vec_shape = get_const_tuple(W_vec.shape)
            w_vec = tvm.nd.array(np.zeros(w_vec_shape, dtype=dtype), ctx)
            # print(tvm.lower(s, [W, W_vec], simple_mode=True))
            func = tvm.build(s, [W, W_vec], device)
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(w, w_vec).mean
            print('kernel -> kernel_vec: %g secs/op' % cost)

            print('--- schedule conv & unpack ---')
            A_vec = tvm.placeholder(a_vec_shape, name='A_vec')
            W_vec = tvm.placeholder(w_vec_shape, name='W_vec')
            B, s = _spatial_conv_only(wkl, sch, A_vec, W_vec, out_dtype=dtype)
            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                             ctx)
            # print(tvm.lower(s, [A_vec, W_vec, B], simple_mode=True))
            func = tvm.build(s, [A_vec, W_vec, B], target=device)
            func.save('conv_unpack.asm')
            time_f = func.time_evaluator(func.entry_name, ctx, number=2000)
            cost = time_f(a_vec, w_vec, b).mean
            print('conv & unpack: %g secs/op' % cost)

            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            print(b_np.shape)
Code example #60
             (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
             (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
             (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
             (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
             (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
             (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
             (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
             (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
             (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
             (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
             (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
             (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1)]

TARGET_NAME = 'llvm -mcpu=skylake-avx512'
NUM_VEC_LANES = 16
CTX = tvm.context(TARGET_NAME, 0)


def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
              hstride, wstride, out_dtype):
    """
    Find the shapes of all the data structures
    """
    ## Find shapes
    data_shape = (1, in_filter // NUM_VEC_LANES, im_height, im_width,
                  NUM_VEC_LANES)

    if out_dtype == 'int32':
        kernel_shape = (out_filter // NUM_VEC_LANES,
                        in_filter // NUM_VEC_LANES, k_h, k_w,
                        NUM_VEC_LANES // 4, NUM_VEC_LANES, 4)