Example #1
def test_local_save_load():
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    n = tvm.var("n")
    A = tvm.placeholder((n,), name='A', dtype='int32')
    B = tvm.placeholder((n,), name='B', dtype='int32')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule(C.op)
    s[C].opengl()

    f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd")

    ctx = tvm.opengl(0)
    n = 10
    a = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros((n), dtype=C.dtype), ctx)
    f(a, b, c)

    temp = util.tempdir()
    path_so = temp.relpath("myadd.so")
    f.export_library(path_so)
    f1 = tvm.module.load(path_so)
    f1(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
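A minimal variant (not from the source, and untested for the OpenGL case): `export_library` can also pack the module as a tarball, deferring linking to whoever loads it; Example #22 below uses the same trick to let a remote compiler do the linking.

    path_tar = temp.relpath("myadd.tar")
    f.export_library(path_tar)      # pack objects; linking is deferred
    f2 = tvm.module.load(path_tar)  # re-links with the local toolchain on load
    f2(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())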
Example #2
 def check_c():
     if not tvm.module.enabled("llvm"):
         return
     # Explicitly declare an elem_offset to exercise the code path where an offset is present
     Ab = tvm.decl_buffer(
         A.shape, A.dtype,
         elem_offset=tvm.var('Aoffset'),
         offset_factor=8,
         name='A')
     binds = {A : Ab}
     # Build and invoke the kernel.
     f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline")
     fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
     fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
     mhost = tvm.codegen.build_module(fsplits[0], "c")
     temp = util.tempdir()
     path_dso = temp.relpath("temp.so")
     mhost.export_library(path_dso)
     m = tvm.module.load(path_dso)
     fadd = m["fadd_pipeline"]
     ctx = tvm.cpu(0)
     # launch the kernel.
     n = nn
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a.asnumpy() + b.asnumpy())
Example #3
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        temp = util.tempdir()
        name = "myadd_%s" % device
        if sys.platform == "darwin" or sys.platform.startswith('linux'):
            f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name)
        elif sys.platform == "win32":
            f = tvm.build(s, [A, B], device, "llvm", name=name)
        else:
            raise ValueError("Unsupported platform")

        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)

        f1 = tvm.module.load(path_dso)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        f1(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        if sys.platform != "win32":
            f2 = tvm.module.system_lib()
            f2[name](a, b)
            np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #4
def test_min_repeat_ms():
    tmp = tempdir()
    filename = tmp.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """one call lasts for 100 ms and writes one character to a file"""
        time.sleep(0.1)
        with open(filename, "a") as fout:
            fout.write("c")

    X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename))
    s = tvm.create_schedule(X.op)
    func = tvm.build(s, [X])

    x = tvm.nd.empty((), dtype="int32")
    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
                                 number=1, repeat=1)
    ftimer(x)

    with open(filename, "r") as fin:
        ct = len(fin.readline())

    assert ct == 2


    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
                                 number=1, repeat=1, min_repeat_ms=1000)
    ftimer(x)

    # make sure we get more than 10 calls
    with open(filename, "r") as fin:
        ct = len(fin.readline())

    assert ct > 10 + 2
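The arithmetic behind the last assertion, as an illustrative sketch (not the evaluator's actual implementation): each call sleeps ~100 ms, so forcing one repeat to take at least `min_repeat_ms=1000` makes the evaluator scale `number` up to roughly 10 calls, on top of the 2 characters already in the log.

    per_call_ms = 100                          # cost of one my_debug call
    min_repeat_ms = 1000
    extra_calls = min_repeat_ms // per_call_ms
    assert extra_calls >= 10                   # hence ct > 10 + 2 above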
Example #5
def test_forward_inception_v1():
    '''test inception V1 model'''
    with tf.Graph().as_default():
        graph_def = nnvm.testing.tf.get_workload("InceptionV1/classify_image_graph_def-with_shapes.pb")
        # Call the utility to import the graph definition into default graph.
        graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)

        # Build an image from random data.
        from PIL import Image
        from tvm.contrib import util

        img_array = np.random.uniform(size=(1, 600, 600, 3)).astype("uint8")
        img = Image.frombuffer('RGB', (600, 600), img_array.tostring(), 'raw', 'RGB', 0, 1)
        temp = util.tempdir()
        img_path = temp.relpath("tf-test.jpg")
        img.save(img_path)

        import os.path
        if not tf.gfile.Exists(img_path):
            tf.logging.fatal('File does not exist %s', img_path)
        data = tf.gfile.FastGFile(img_path, 'rb').read()

        temp.remove()

        # Extract tensorflow decoded image frame for tvm input
        with tf.Session() as sess:
            tvm_data = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'DecodeJpeg:0')

        with tf.Session() as sess:
            tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0')
            tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents')
            tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
Example #6
    def build_arm():
        target = "llvm -target=armv7-none-linux-gnueabihf"
        if not tvm.module.enabled(target):
            print("Skip because %s is not enabled.." % target)
            return
        temp = util.tempdir()
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x28)
        asm_path = temp.relpath("myadd.asm")
        f.save(asm_path)
        # Do a RPC verification, launch kernel on Arm Board if available.
        host = os.environ.get('TVM_RPC_ARM_HOST', None)
        remote = None
        if host:
            port = int(os.environ['TVM_RPC_ARM_PORT'])
            try:
                remote = rpc.connect(host, port)
            except tvm.TVMError:
                # Board unavailable; skip the remote verification below.
                pass

        if remote:
            remote.upload(path)
            farm = remote.load_module("myadd.o")
            ctx = remote.cpu(0)
            n = nn
            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            farm(a, b, c)
            tvm.testing.assert_allclose(
                c.asnumpy(), a.asnumpy() + b.asnumpy())
            print("Verification finish on remote..")
Example #7
 def verify(s, check_correctness):
     mod = tvm.build(s, [data, kernel, res],
                     target_host=env.target_host,
                     name="conv2d")
     temp = util.tempdir()
     mod.save(temp.relpath("conv2d.o"))
     remote.upload(temp.relpath("conv2d.o"))
     f = remote.load_module("conv2d.o")
     # verify
     ctx = remote.cpu(0)
     # Data in original format
     data_orig, kernel_orig, res_ref = get_ref_data()
     res_shape = topi.util.get_const_tuple(res.shape)
     res_np = np.zeros(res_shape).astype(res.dtype)
     data_arr = tvm.nd.array(data_orig, ctx)
     kernel_arr = tvm.nd.array(kernel_orig, ctx)
     res_arr = tvm.nd.array(res_np, ctx)
     time_f = f.time_evaluator("conv2d", ctx, number=5)
     cost = time_f(data_arr, kernel_arr, res_arr)
     res_unpack = res_arr.asnumpy()
     if check_correctness:
         assert wl.hpad == wl.wpad
         stride = (wl.hstride, wl.wstride)
         padding = wl.hpad
         res_ref = res_ref >> 8
         res_ref = np.clip(res_ref, 0, 127).astype("int8")
         tvm.testing.assert_allclose(res_unpack, res_ref)
     return cost
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'],
        help="The model type.")
    parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50, help="Number of iterations during the benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level

    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))


    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.rasp(), shape={"data": data_shape}, params=params)

    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat
        time.sleep(45)
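For reference, a hypothetical invocation of this script (the file name and address below are placeholders, not from the source):

    # python rasp_benchmark.py --model resnet --host 192.168.0.10 --port 9090 \
    #     --opt-level 2 --num-iter 50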
Example #9
File: peak.py Project: gwli/tvm
def _convert_to_remote(func, remote):
    """ convert module function to remote rpc function"""
    temp = util.tempdir()
    path_dso = temp.relpath("tmp_func.tar")
    func.export_library(path_dso)

    remote.upload(path_dso)
    func = remote.load_module("tmp_func.tar")
    return func
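A minimal usage sketch for `_convert_to_remote`, assuming a built module `f` and an RPC server started locally (the server setup mirrors Example #27 below):

    from tvm import rpc
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    f = _convert_to_remote(f, remote)  # calls now execute on the remote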
Example #10
def test_variable_node_parsed():
    sym = nnvm.sym.Variable('data')
    tempdir = util.tempdir()
    json_filename = 'test_nnvm_symbol.json'
    with open(tempdir.relpath(json_filename), 'w') as fo:
        fo.write(nnvm.graph.create(sym).json())
    sym_str = open(tempdir.relpath(json_filename), 'r').read()
    sym = nnvm.graph.load_json(sym_str).symbol()
    sym = nnvm.sym.relu(sym)
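The same round-trip can be done without touching disk, using only the nnvm APIs already shown above; a sketch:

    g = nnvm.graph.create(nnvm.sym.Variable('data'))
    sym2 = nnvm.graph.load_json(g.json()).symbol()
    sym2 = nnvm.sym.relu(sym2)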
Example #11
def generate_graph(graph_fn, params_fn, device="vta"):
    # Measure build start time
    build_start = time.time()

    # Derive the TVM target
    target = tvm.target.create("llvm -device={}".format(device))

    # Derive the LLVM compiler flags
    # When targeting the Pynq, cross-compile to the ARMv7 ISA
    if env.TARGET == "sim":
        target_host = "llvm"
    elif env.TARGET == "pynq":
        target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon"

    # Load the ResNet-18 graph and parameters
    sym = nnvm.graph.load_json(open(graph_fn).read())
    params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read())

    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        if target.device_name != "vta":
            graph, lib, params = nnvm.compiler.build(
                sym, target, shape_dict, dtype_dict,
                params=params, target_host=target_host)
        else:
            with vta.build_config():
                graph, lib, params = nnvm.compiler.build(
                    sym, target, shape_dict, dtype_dict,
                    params=params, target_host=target_host)

    # Save the compiled inference graph library
    assert tvm.module.enabled("rpc")
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))

    # Send the inference library over to the remote RPC server
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

    # Measure build time
    build_time = time.time() - build_start
    print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time))

    return graph, lib, params
Example #12
 def build_i386():
     if not tvm.module.enabled("llvm"):
         print("Skip because llvm is not enabled..")
         return
     temp = util.tempdir()
     target = "llvm -target=i386-pc-linux-gnu"
     f = tvm.build(s, [A, B, C], target)
     path = temp.relpath("myadd.o")
     f.save(path)
     verify_elf(path, 0x03)
Example #13
def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # To skip Metal and run on CPU only, build with `target` instead of "metal".
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso1)

    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)

    # Start RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  libs=[path_dso1, path_dso2])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    # CPU
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #14
def try_remote_save_load():
    if not tvm.module.enabled("rpc"):
        return
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    # Build the module.
    n = tvm.var("n")
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule(C.op)
    s[C].opengl()
    target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
    f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")

    remote = rpc.connect(proxy_host, proxy_port, key="js")

    temp = util.tempdir()
    ctx = remote.opengl(0)
    path_obj = temp.relpath("myadd.bc")
    path_dso = temp.relpath("myadd.js")
    path_gl = temp.relpath("myadd.gl")
    path_json = temp.relpath("myadd.tvm_meta.json")

    f.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    f.imported_modules[0].save(path_gl)

    remote.upload(path_dso, "myadd.dso")
    remote.upload(path_gl)
    remote.upload(path_json)

    remote.download("myadd.dso")
    remote.download("myadd.gl")
    remote.download("myadd.tvm_meta.json")

    print('Loading myadd.dso')
    fhost = remote.load_module("myadd.dso")

    print('Loading myadd.gl')
    fdev = remote.load_module("myadd.gl")

    print('import_module')
    fhost.import_module(fdev)

    print('running...')
    a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
    c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
    fhost(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
Example #15
def test_outer_product():
    n = tvm.var('n')
    m = tvm.var('m')
    a = tvm.placeholder((n, ), name='a')
    b = tvm.placeholder((m, ), name='b')

    try:
        c = outer_product(n, m, a, b)
        ir = c.op.body
    except IOError as err:
        assert sys.version_info[0] == 2 and str(err) == 'could not get source code'
        return

    #Check for i in (0, n)
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i'
    assert ir.min.value == 0
    assert ir.extent.name == 'n'
    ibody = ir.body
    assert isinstance(ibody, tvm.stmt.For)
    #Check for j in (0, m)
    assert ibody.loop_var.name == 'j'
    assert ibody.min.value == 0
    assert ibody.extent.name == 'm'
    #Check loop body
    jbody = ibody.body
    assert isinstance(jbody, tvm.stmt.AssertStmt)
    assert isinstance(jbody.message, tvm.expr.StringImm)
    assert jbody.message.value == "index out of range!"
    jbody = jbody.body
    assert isinstance(jbody, tvm.stmt.Provide)
    assert jbody.func.name == 'c'
    assert len(jbody.args) == 2
    assert jbody.args[0].name == 'i'
    assert jbody.args[1].name == 'j'
    assert isinstance(jbody.value, tvm.expr.Mul)
    mul = jbody.value
    assert isinstance(mul.a, tvm.expr.Call)
    assert mul.a.name == 'a'
    assert mul.b.name == 'b'

    func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})
    temp = util.tempdir()
    path = temp.relpath('%s.py' % func.name)
    func.save(path)
    func_ = tvm.hybrid.HybridModule()
    func_.load(path)
    run_and_check(func_, ins, {n: 99, m: 101}, outs=outs)

    for key, _ in HYBRID_GLOBALS.items():
        assert key not in globals().keys()
        assert key not in outer_product.__globals__.keys()
Example #16
 def check_load_module():
     temp = util.tempdir()
     path_lib = temp.relpath("deploy.so")
     mhost.export_library(path_lib)
     with open(temp.relpath("deploy.json"), "w") as out_file:
         out_file.write(graph)
     loaded_lib = tvm.module.load(path_lib)
     loaded_graph = open(temp.relpath("deploy.json")).read()
     mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
     mod.set_input(**params)
     mod.run()
     out = mod.get_output(0, tvm.nd.empty(shape))
     np.testing.assert_equal(
         out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
Example #17
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
                                            shape={'data': input_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, target_host=target_host,
                shape={'data': input_shape}, params=params, dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number==1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #18
def test_file_io():
    temp = util.tempdir()
    file_path = temp.relpath("temp.log")

    tsk, target = get_sample_task()
    inputs = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(0, 10)]
    results = [MeasureResult((i, ), 0, 0, 0) for i in range(0, 10)]

    with open(file_path, "w") as fo:
        cb = autotvm.callback.log_to_file(fo)
        cb(None, inputs, results)

    ref = zip(inputs, results)
    for x, y in zip(ref, autotvm.record.load_from_file(file_path)):
        assert x[1] == y[1]
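Records can also be round-tripped one at a time; a sketch assuming the `encode`/`decode` helpers in `autotvm.record` from the same TVM era as this test:

    line = autotvm.record.encode(inputs[0], results[0])
    inp, res = autotvm.record.decode(line)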
Example #19
 def check_stackvm(device):
     ctx = tvm.context(device, 0)
     if not ctx.exist:
         print("Skip because %s is not enabled" % device)
         return
     temp = util.tempdir()
     name = "myadd_%s" % device
     f = tvm.build(s, [A, B], device, "stackvm", name=name)
     path_dso = temp.relpath("dev_lib.stackvm")
     #f.export_library(path_dso)
     #f1 = tvm.module.load(path_dso)
     a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     f(a, b)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #20
 def check_c():
     mhost = tvm.build(s, [A, B, C], "c", name="fadd")
     temp = util.tempdir()
     path_dso = temp.relpath("temp.so")
     mhost.export_library(path_dso)
     m = tvm.module.load(path_dso)
     fadd = m['fadd']
     ctx = tvm.cpu(0)
     # launch the kernel.
     n = nn
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), a.asnumpy() + b.asnumpy())
Example #21
    def verify_rpc(remote, target, shape, dtype):
        A = tvm.placeholder(shape, dtype=dtype)
        B = tvm.compute(A.shape, lambda i: A[i]+tvm.const(1, A.dtype))
        s = tvm.create_schedule(B.op)
        f = tvm.build(s, [A, B], target, name="myadd")

        ctx = remote.cpu(0)
        a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx)
        b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx)
        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        f.save(path_dso)
        remote.upload(path_dso)
        f = remote.load_module("dev_lib.o")
        f(a, b)
        tvm.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
Example #22
    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is forking issue
        of TVM runtime when server launches after OpenCL
        runtime initializes. We leave it as an example
        on how to do rpc when we want to do linking on remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #23
 def check_remote(remote):
     if not tvm.module.enabled("llvm"):
         print("Skip because llvm is not enabled")
         return
     temp = util.tempdir()
     ctx = remote.cpu(0)
     f = tvm.build(s, [A, B], "llvm", name="myadd")
     path_dso = temp.relpath("dev_lib.so")
     f.export_library(path_dso)
     remote.upload(path_dso)
     f1 = remote.load_module("dev_lib.so")
     a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
     cost = time_f(a, b).mean
     print('%g secs/op' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #24
def gemv_impl():
    cc_code = """
      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < l; ++j) {
                cc[i] += aa[j] * bb[i * stride + j];
            }
        }
        return 0;
      }
    """
    from tvm.contrib import util, clang
    temp = util.tempdir()
    ll_path = temp.relpath("temp.ll")
    # Create LLVM ir from c source code
    ll_code = clang.create_llvm(cc_code, output=ll_path)
    return ll_code
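A sketch of how the returned LLVM IR is typically consumed, following the pattern in TVM's tensorize tutorial (the schedule `s`, stage `C`, and axis `x` are assumed to exist elsewhere):

    s[C].pragma(x, "import_llvm", gemv_impl())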
Example #25
        def verify(s, check_correctness):
            mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
                            env.target_host, name="conv2d")
            temp = util.tempdir()

            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.ext_dev(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32")
            bias_orig = np.abs(bias_orig)

            data_packed = data_orig.reshape(
                batch_size//env.BATCH, env.BATCH,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
            kernel_packed = kernel_orig.reshape(
                wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
            bias_packed = bias_orig.reshape(
                1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)
            res_shape = topi.util.get_const_tuple(res.shape)

            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, ctx)
            kernel_arr = tvm.nd.array(kernel_packed, ctx)
            bias_arr = tvm.nd.array(bias_packed, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            res_unpack = res_arr.asnumpy().transpose(
                (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width)
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost
Example #26
def test_dso_module_load():
    if not tvm.module.enabled("llvm"):
        return
    dtype = 'int64'
    temp = util.tempdir()

    def save_object(names):
        n = tvm.var('n')
        Ab = tvm.decl_buffer((n, ), dtype)
        i = tvm.var('i')
        # for i in 0 to n-1:
        stmt = tvm.make.For(
            i, 0, n - 1, 0, 0,
            tvm.make.Store(Ab.data,
                           tvm.make.Load(dtype, Ab.data, i) + 1,
                           i + 1))
        fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
        fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
        m = tvm.codegen.build_module(fapi, "llvm")
        for name in names:
            m.save(name)

    path_obj = temp.relpath("test.o")
    path_ll = temp.relpath("test.ll")
    path_bc = temp.relpath("test.bc")
    path_dso = temp.relpath("test.so")
    save_object([path_obj, path_ll, path_bc])
    cc.create_shared(path_dso, [path_obj])

    f1 = tvm.module.load(path_dso)
    f2 = tvm.module.load(path_ll)
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f1(a)
    np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f2(a)
    np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))

    path_runtime_py = temp.relpath("runtime.py")
    with open(path_runtime_py, "w") as fo:
        fo.write(runtime_py)

    subprocess.check_call(
        "python %s %s %s" % (path_runtime_py, path_dso, dtype),
        shell=True)
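The test assumes a `runtime_py` string defined elsewhere; a minimal stand-in consistent with how it is invoked above (the original may set extra environment flags before importing tvm):

    runtime_py = """
    import sys
    import tvm
    import numpy as np
    path_dso, dtype = sys.argv[1], sys.argv[2]
    m = tvm.module.load(path_dso)
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    m(a)
    np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
    """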
Example #27
 def check_remote():
     if not tvm.module.enabled("llvm"):
         print("Skip because llvm is not enabled")
         return
     mlib = tvm.build(s, [A, B], "llvm", name="myadd")
     server = rpc.Server("localhost")
     remote = rpc.connect(server.host, server.port)
     temp = util.tempdir()
     ctx = remote.cpu(0)
     path_dso = temp.relpath("dev_lib.so")
     mlib.export_library(path_dso)
     remote.upload(path_dso)
     mlib = remote.load_module("dev_lib.so")
     mod = graph_runtime.create(graph, mlib, remote.cpu(0))
     a = np.random.uniform(size=(n,)).astype(A.dtype)
     mod.run(x=tvm.nd.array(a, ctx))
     out = tvm.nd.empty((n,), ctx=ctx)
     out = mod.get_output(0, out)
     np.testing.assert_equal(out.asnumpy(), a + 1)
Example #28
 def verify(s, check_correctness=True):
     mod = vta.build(s, [data, weight, res],
                     "ext_dev", env.target_host, name="gemm")
     temp = util.tempdir()
     mod.save(temp.relpath("gemm.o"))
     remote.upload(temp.relpath("gemm.o"))
     f = remote.load_module("gemm.o")
     # verify
     ctx = remote.ext_dev(0)
     # Data in original format
     data_orig = np.random.randint(
         -128, 128, size=(batch_size, channel)).astype(data.dtype)
     weight_orig = np.random.randint(
         -128, 128, size=(channel, channel)).astype(weight.dtype)
     data_packed = data_orig.reshape(
         batch_size // env.BATCH, env.BATCH,
         channel // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
     weight_packed = weight_orig.reshape(
         channel // env.BLOCK_OUT, env.BLOCK_OUT,
         channel // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
     res_np = np.zeros(res_shape).astype(res.dtype)
     data_arr = tvm.nd.array(data_packed, ctx)
     weight_arr = tvm.nd.array(weight_packed, ctx)
     res_arr = tvm.nd.array(res_np, ctx)
     res_ref = np.zeros(res_shape).astype(env.acc_dtype)
     for b in range(batch_size // env.BATCH):
         for i in range(channel // env.BLOCK_OUT):
             for j in range(channel // env.BLOCK_IN):
                 res_ref[b,i,:] += np.dot(data_packed[b,j,:].astype(env.acc_dtype),
                                          weight_packed[i,j].T.astype(env.acc_dtype))
     res_ref = np.right_shift(res_ref, 8)
     res_ref = np.clip(res_ref, 0, (1<<(env.INP_WIDTH-1))-1).astype(res.dtype)
     time_f = f.time_evaluator("gemm", ctx, number=20)
     cost = time_f(data_arr, weight_arr, res_arr)
     res_unpack = res_arr.asnumpy().reshape(batch_size // env.BATCH,
                                            channel // env.BLOCK_OUT,
                                            env.BATCH,
                                            env.BLOCK_OUT)
     if check_correctness:
         tvm.testing.assert_allclose(res_unpack, res_ref)
     return cost
Example #29
    def verify_graph_runtime(remote, target, shape, dtype):
        x = relay.var('x')
        y = relay.const(1)
        z = relay.add(x, y)
        func = relay.Function([x], z)

        x_in = np.ones(shape).astype(dtype)
        params = {'x': x_in}
        graph, lib, params = relay.build(func, target=target, params=params)

        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        lib.save(path_dso)
        remote.upload(path_dso)
        lib = remote.load_module("dev_lib.o")
        ctx = remote.cpu(0)
        mod = graph_runtime.create(graph, lib, ctx)
        mod.load_params(relay.save_param_dict(params))
        mod.run()
        out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
        tvm.testing.assert_allclose(x_in + 1, out.asnumpy())
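The serialized parameters can also be round-tripped locally; a sketch using relay's param-dict helpers (`load_param_dict` is the inverse of `save_param_dict`):

    blob = relay.save_param_dict(params)
    params_back = relay.load_param_dict(blob)
    assert set(params_back.keys()) == set(params.keys())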
Example #30
 def check_remote():
     if not tvm.module.enabled(target):
         print("Skip because %s is not enabled" % target)
         return
     temp = util.tempdir()
     ctx = remote.cpu(0)
     f = tvm.build(s, [A, B], target, name="myadd")
     path_obj = temp.relpath("dev_lib.bc")
     path_dso = temp.relpath("dev_lib.js")
     f.save(path_obj)
     emscripten.create_js(path_dso, path_obj, side_module=True)
      # Upload with a .dso suffix so it can be loaded remotely
     remote.upload(path_dso, "dev_lib.dso")
     data = remote.download("dev_lib.dso")
     f1 = remote.load_module("dev_lib.dso")
     a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
     cost = time_f(a, b).mean
     print('%g secs/op' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #31
    # connect to remote device
    tracker = tvm.rpc.connect_tracker(args.host, args.port)
    remote = tracker.request(args.rpc_key)

    print("--------------------------------------------------")
    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
    print("--------------------------------------------------")
    for network in networks:
        net, params, input_shape, output_shape = get_network(network, batch_size=1)

        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)

        tmp = tempdir()
        if 'android' in str(target):
            from tvm.contrib import ndk
            filename = "%s.so" % network
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "%s.tar" % network
            lib.export_library(tmp.relpath(filename))

        # upload library and params
        ctx = remote.context(str(target), 0)
        remote.upload(tmp.relpath(filename))
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

        rlib = remote.load_module(filename)
        module = runtime.create(graph, rlib, ctx)
Example #32
def run_case(dtype, image):
    # Check image
    import os
    import json
    import sys

    STAT_REPEAT = os.environ.get('STAT_REPEAT', '')
    if STAT_REPEAT == '' or STAT_REPEAT is None:
       STAT_REPEAT = 10
    STAT_REPEAT = int(STAT_REPEAT)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    synset = eval(open(os.path.join(CATEG_FILE)).read())

    files=[]
    val={}

    if image is not None and image != '':
       files=[image]
    else:
       ipath=os.environ.get('CK_ENV_DATASET_IMAGENET_VAL','')
       if ipath=='':
          print ('Error: path to ImageNet dataset is not set!')
          exit(1)
       if not os.path.isdir(ipath):
          print ('Error: path to ImageNet dataset was not found!')
          exit(1)

       # get all files
       d=os.listdir(ipath)
       for x in d:
           x1=x.lower()
           if x1.startswith('ilsvrc2012_val_'):
              files.append(os.path.join(ipath,x))

       files=sorted(files)

       STAT_REPEAT=1

       # Get correct labels
       ival=os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT','')
       fval=open(ival).read().split('\n')

       val={}
       for x in fval:
           x=x.strip()
           if x!='':
              y=x.split(' ')
              val[y[0]]=int(y[1])

    # FGG: set timers
    import time
    timers={}

    # Get first shape (expect that will be the same for all)
    dt=time.time()
    image = Image.open(os.path.join(files[0])).resize((224, 224))
    if image.mode!='RGB': image=image.convert('RGB')
    timers['execution_time_load_image']=time.time()-dt

    dt=time.time()
    img = transform_image(image)
    timers['execution_time_transform_image']=time.time()-dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model
    from mxnet.gluon.utils import download

    model_path=os.environ['CK_ENV_MODEL_MXNET']
    model_id=os.environ['MXNET_MODEL_ID']
    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graphs (symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype!='float32':
       params = {k: tvm.nd.array(v.asnumpy().astype(dtype)) for k, v in params.items()}

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.mali(), shape={"data": data_shape}, params=params,
            dtype=dtype, target_host=None)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    ctx = tvm.cl(0)
    rlib = lib
    rparams = params

    # create graph runtime
    dt=time.time()
    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)
    timers['execution_time_create_run_time_graph']=(time.time()-dt)

    total_images=0
    correct_images_top1=0
    correct_images_top5=0

    # Shuffle files and pre-read JSON with accuracy to continue aggregating it
    # otherwise if FPGA board hangs, we can continue checking random images ...

    import random
    random.shuffle(files)

    if len(files)>1 and os.path.isfile('aggregate-ck-timer.json'):
       x=json.load(open('aggregate-ck-timer.json'))

       if 'total_images' in x:
          total_images=x['total_images']
       if 'correct_images_top1' in x:
          correct_images_top1=x['correct_images_top1']
       if 'correct_images_top5' in x:
          correct_images_top5=x['correct_images_top5']

    dt1=time.time()
    for f in files:
        total_images+=1

        print ('===============================================================================')
        print ('Image '+str(total_images)+' of '+str(len(files))+' : '+f)

        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode!='RGB': image=image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**rparams)

        # perform some warm up runs
        # print("warm up..")
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print ('')
        print ("run ("+str(STAT_REPEAT)+" statistical repetitions)")
        dt=time.time()
        timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        timers['execution_time_classify']=(time.time()-dt)/STAT_REPEAT

        # get outputs
        tvm_output = module.get_output(
            0, tvm.nd.empty((1000,), dtype, ctx))

        top1 = np.argmax(tvm_output.asnumpy())

        top5=[]
        atop5 = get_top5(tvm_output.asnumpy())

        print ('')
        print('TVM prediction Top1:', top1, synset[top1])

        print ('')
        print('TVM prediction Top5:')
        for q in atop5:
            x=q[1]
            y=synset[x]
            top5.append(x)
            print (x,y)

        print ('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val)>0:
           top=val[os.path.basename(f)]

           correct_top1=False
           if top==top1:
              correct_top1=True
              correct_images_top1+=1

           print ('')
           if correct_top1:
              print ('Current prediction Top1: CORRECT')
           else:
              print ('Current prediction Top1: INCORRECT +('+str(top)+')')

           accuracy_top1=float(correct_images_top1)/float(total_images)
           print ('Current accuracy Top1:   '+('%.5f'%accuracy_top1))

           correct_top5=False
           if top in top5:
              correct_top5=True
              correct_images_top5+=1

           print ('')
           if correct_top5:
              print ('Current prediction Top5: CORRECT')
           else:
              print ('Current prediction Top5: INCORRECT +('+str(top)+')')

           accuracy_top5=float(correct_images_top5)/float(total_images)
           print ('Current accuracy Top5:   '+('%.5f'%accuracy_top5))

           print ('')
           print ('Total elapsed time: '+('%.1f'%(time.time()-dt1))+' sec.')

           timers['total_images']=total_images
           timers['correct_images_top1']=correct_images_top1
           timers['accuracy_top1']=accuracy_top1
           timers['correct_images_top5']=correct_images_top5
           timers['accuracy_top5']=accuracy_top5

        timers['execution_time_classify_internal']=tcost.mean
        timers['execution_time']=tcost.mean

        with open ('tmp-ck-timer.json', 'w') as ftimers:
             json.dump(timers, ftimers, indent=2)

        with open ('aggregate-ck-timer.json', 'w') as ftimers:
             json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()

    return
Example #33
from tvm import relay
from tvm.relay import testing
import tvm
from tvm import te

from tvm.contrib import util
header_file_dir_path = util.tempdir()


def gen_engine_header():
    code = r'''
        #ifndef _ENGINE_H_
        #define _ENGINE_H_
        #include <cstdint>
        #include <string>
        #include <sstream>
        #include <vector>
        class Engine {
        };

        #endif
        '''
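The example is truncated here: the header string is never used. Presumably it is written into the tempdir created above; a plausible completion (the file name `engine.h` is illustrative, not from the source):

    header_path = header_file_dir_path.relpath("engine.h")
    with open(header_path, "w") as f:
        f.write(code)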
Example #34
 def __init__(self):
     # pylint: disable=super-init-not-called
     self.context = nd.context
     self.get_function = tvm._ffi.get_global_func
     self._temp = util.tempdir()
Example #35
def run_gemm(env, remote, target,
             batch_size, in_feat, out_feat,
             check_correctness=True, print_ir=True,
             samples=4):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN,
                        env.BLOCK_OUT, env.BLOCK_IN)
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(
            data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            batch_size//env.BATCH, env.BATCH,
            in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(
            out_feat//env.BLOCK_OUT, env.BLOCK_OUT,
            in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = util.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("dense", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10 ** 9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
Example #36
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, input_shape, _ = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params, target_host=target_host)
        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "{}.so".format(module_export_prefix)
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "{}.tar".format(module_export_prefix)
            lib.export_library(tmp.relpath(filename))

        lib.imported_modules[0].save(
            "{}-cuda.ptx".format(module_export_prefix))
        lib.export_library("{}-lib.tar".format(module_export_prefix))

        with open("{}-graph.json".format(module_export_prefix), "w") as fo:
            fo.write(graph)

        with open("{}-params.params".format(module_export_prefix), "wb") as fo:
            fo.write(relay.save_param_dict(params))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #37
def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    temp = util.tempdir()

    # Establish remote connection with target hardware
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=60)

    # Compile the Graph for CPU target
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso_cpu = temp.relpath("cpu_lib.so")
    f.export_library(path_dso_cpu, ndk.create_shared)

    # Execute the portable graph on cpu target
    print('Run CPU test ...')
    ctx = remote.cpu(0)
    remote.upload(path_dso_cpu)
    f2 = remote.load_module("cpu_lib.so")
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op\n' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    # Compile the Graph for OpenCL target
    if test_opencl:
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # To skip OpenCL and run on CPU only, build with `target` instead of "opencl".
        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
        path_dso_cl = temp.relpath("dev_lib_cl.so")
        f.export_library(path_dso_cl, ndk.create_shared)

        print('Run GPU(OpenCL Flavor) test ...')
        ctx = remote.cl(0)
        remote.upload(path_dso_cl)
        f1 = remote.load_module("dev_lib_cl.so")
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
        cost = time_f(a, b).mean
        print('%g secs/op\n' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    # Compile the Graph for Vulkan target
    if test_vulkan:
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # To skip Vulkan and run on CPU only, build with `target` instead of "vulkan".
        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
        f.export_library(path_dso_vulkan, ndk.create_shared)

        print('Run GPU(Vulkan Flavor) test ...')
        ctx = remote.vulkan(0)
        remote.upload(path_dso_vulkan)
        f1 = remote.load_module("dev_lib_vulkan.so")
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
        cost = time_f(a, b).mean
        print('%g secs/op\n' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
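The snippet above relies on several module-level settings the excerpt does not show (tracker address, device key, target string, and the flavor flags). A minimal sketch of what they might look like; every value here is an illustrative placeholder, not from the original:

import numpy as np
import tvm
from tvm import rpc
from tvm.contrib import util, ndk

tracker_host = "0.0.0.0"   # assumed: address of the TVM RPC tracker
tracker_port = 9190        # assumed: tracker port
key = "android"            # assumed: device key registered with the tracker
target = "llvm -target=arm64-linux-android"  # assumed host-side target
test_opencl = True         # assumed: exercise the OpenCL flavor
test_vulkan = False        # assumed: skip the Vulkan flavor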
Example #38
0
def test_one_time(one_time_length=1000,
                  test_sparse=True,
                  image_shape=(3, 32, 32)):
    # Hyper-parameter define
    batch_size = 1
    num_class = 10
    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_class)
    sparse_kernel_shape = (batch_size, 12)
    dtype = "float32"

    data = sym.Variable("data")
    sparse_kernel = sym.Variable("sparse_kernel",
                                 init=np.random.randint(
                                     0, 2, sparse_kernel_shape).astype(dtype))
    if test_sparse:
        y1 = sym.conv2d_sparse(data=data,
                               sparsity=sparse_kernel,
                               channels=12,
                               kernel_size=(3, 3),
                               padding=(0, 0),
                               use_bias=False,
                               out_layout='NCHW')
    else:
        y1 = sym.conv2d(data=data,
                        channels=10,
                        kernel_size=(3, 3),
                        padding=(0, 0),
                        use_bias=False,
                        out_layout='NCHW')
    # y = sym.flatten(y1)
    # y = sym.dense(y, units=10, use_bias=False)
    # y = sym.softmax(y)
    out = y1

    # Test Graph compilation
    # Once the API is well-defined, this part will be OK
    # g = graph.create(out)
    # print("-------------Starts----------------")
    # print(g.json())
    # print("-----------------------------------")
    # print(g.ir())
    # print("--------------Ends-----------------")

    # Create workload
    net, params = create_sparse_workload(out, batch_size, image_shape, dtype)
    # print("-------------Starts2---------------")
    # print(net.debug_str())
    # print(params)
    # print("--------------Ends2----------------")

    # Test Forward
    # NNVM-compiler build
    opt_level = 0
    target = tvm.target.mali()
    target_host = "llvm -target=aarch64-linux-gnu"
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 target=target,
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 target_host=target_host)

    tmp = util.tempdir()
    lib_fname = tmp.relpath("net.tar")
    lib.export_library(lib_fname)
    remote = rpc.connect('59.78.6.204', 9090)
    remote.upload(lib_fname)
    rlib = remote.load_module("net.tar")

    ctx = remote.cl(0)

    # create random input
    real_data = np.random.uniform(-1, 1, size=data_shape).astype(dtype)
    real_sparse_kernel = np.array(([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
                                     1]])).astype(dtype)
    # real_sparse_kernel = np.random.randint(0, 2, sparse_kernel_shape).astype(dtype)

    # print(real_data)
    # print(real_sparse_kernel)

    # create module
    module = graph_runtime.create(graph, rlib, ctx)
    # set input and parameters
    module.set_input("data", real_data)
    if test_sparse:
        module.set_input("sparse_kernel", real_sparse_kernel)
        module.set_input(**params)

    # run
    # localtime = time.asctime(time.localtime(time.time()))
    # print("Start time:" + localtime)
    starttime = time.time()
    for _ in range(one_time_length):
        module.run()
    endtime = time.time()
    # localtime = time.asctime(time.localtime(time.time()))
    # print("End time:" + localtime)
    print(endtime - starttime)

    # get output
    out = module.get_output(0)
    # convert to numpy
    out.asnumpy()

    # Print first 10 elements of output
    # print("-------------Starts3---------------")
    # # print(out.asnumpy().flatten()[0:10])
    # print(out)
    # print("--------------Ends3----------------")

    return endtime - starttime
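A plausible driver for the benchmark above, reporting an average per-iteration time; the call below is illustrative and not part of the original example:

# Hypothetical usage of test_one_time.
elapsed = test_one_time(one_time_length=100, test_sparse=True)
print("avg per run: %.3f ms" % (elapsed / 100 * 1000))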
Example #39
0
    def __init__(self, model_dir):
        from tensorflow.core.framework import graph_pb2
        self._tmp_dir = util.tempdir()
        self._model_dir = model_dir
        self._graph = graph_pb2.GraphDef()
Example #40
0
    def _run(env, remote):
        m = 2
        n = 8
        imm_shift = np.random.randint(0, 8)
        imm_scale = np.random.randint(1, 5)
        # compute
        a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                            name="a",
                            dtype=env.acc_dtype)
        a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                            "a_buf")  # DRAM->SRAM
        res_shift = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: a_buf(*i) + imm_shift,
                                "res_shift")  # compute
        res_scale = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: res_shift(*i) >> imm_scale,
                                "res_scale")  # compute
        res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: res_scale(*i).astype(env.inp_dtype),
                          "res")  # SRAM->DRAM
        # schedule
        s = tvm.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[res_shift].set_scope(env.acc_scope)  # SRAM
        s[res_scale].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
        s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        a_np = np.random.randint(-10,
                                 10,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.right_shift((a_np + imm_shift), imm_scale)
        res_np = res_np.astype(res.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Shift and scale execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #41
0
def gemv_quantized_impl(M, N, data_type='uint8'):
    """ Assembly implementation of a blocked gemv. Given
    a block a of shape (4, k) and a block b' of shape (4, k)
    produces the output block c = a*b of shape (4,4) """

    stepA = min(4, M)
    stepB = min(4, N)
    assert data_type in ['uint8', 'int8'], \
        'Only uint8/int8 supported for this implementation'

    cc_code = """
          extern "C" int gemv_{0}_{0}_int32_{1}_{2}(int *c_buffer,
                                                    unsigned char *a_buffer,
                                                    unsigned char *b_buffer,
                                                    int K, int m, int n)
              """.format(data_type, stepA, stepB)

    cc_code += """
    {
            unsigned char * a_ptr = a_buffer;
            unsigned char * b_ptr = b_buffer;
            int * c_ptr = c_buffer;

            int k = K / 16;

            __asm__  __volatile__ (
                "movi v16.4s, #0\\n"
                "movi v17.4s, #0\\n"
                "movi v18.4s, #0\\n"
                "movi v19.4s, #0\\n"
                "movi v20.4s, #0\\n"
                "movi v21.4s, #0\\n"
                "movi v22.4s, #0\\n"
                "movi v23.4s, #0\\n"
                "movi v24.4s, #0\\n"
                "movi v25.4s, #0\\n"
                "movi v26.4s, #0\\n"
                "movi v27.4s, #0\\n"
                "movi v28.4s, #0\\n"
                "movi v29.4s, #0\\n"
                "movi v30.4s, #0\\n"
                "movi v31.4s, #0\\n"
            "1:"
    """

    cc_code += ' "ldr q0, [%[a_ptr]]\\n" '

    if M > 1:
        cc_code += ' "ldr q1, [%[a_ptr], #16]\\n" '
    else:
        cc_code += ' "movi v1.4s, #0\\n" '

    if M > 2:
        cc_code += ' "ldr q2, [%[a_ptr], #32]\\n" '
    else:
        cc_code += ' "movi v2.4s, #0\\n" '

    if M > 3:
        cc_code += ' "ldr q3, [%[a_ptr], #48]\\n" '
    else:
        cc_code += ' "movi v3.4s, #0\\n" '

    cc_code += ' "ldr q4, [%[b_ptr]]\\n" '

    if N > 1:
        cc_code += ' "ldr q5, [%[b_ptr], #16]\\n" '

    if N > 2:
        cc_code += ' "ldr q6, [%[b_ptr], #32]\\n" '

    if N > 3:
        cc_code += ' "ldr q7, [%[b_ptr], #48]\\n" '

    cc_code += """
                // First half
                // Lower part of a0 * {b0,b1,b2,b3}
                "umull v8.8h, v0.8b, v4.8b\\n"
                "umull v9.8h, v0.8b, v5.8b\\n"
                "umull v10.8h, v0.8b, v6.8b\\n"
                "umull v11.8h, v0.8b, v7.8b\\n"

                // Lower part of a1 * {b0,b1,b2,b3}
                "umull v12.8h, v1.8b, v4.8b\\n"
                "umull v13.8h, v1.8b, v5.8b\\n"
                "umull v14.8h, v1.8b, v6.8b\\n"
                "umull v15.8h, v1.8b, v7.8b\\n"

                // Accumulate
                "uadalp v16.4s, v8.8h\\n"
                "uadalp v17.4s, v9.8h\\n"
                "uadalp v18.4s, v10.8h\\n"
                "uadalp v19.4s, v11.8h\\n"
                "uadalp v20.4s, v12.8h\\n"
                "uadalp v21.4s, v13.8h\\n"
                "uadalp v22.4s, v14.8h\\n"
                "uadalp v23.4s, v15.8h\\n"

                // Higher part of a0 * {b0,b1,b2,b3}
                "umull2 v8.8h, v0.16b, v4.16b\\n"
                "umull2 v9.8h, v0.16b, v5.16b\\n"
                "umull2 v10.8h, v0.16b, v6.16b\\n"
                "umull2 v11.8h, v0.16b, v7.16b\\n"

                // Higher part of a1 * {b0,b1,b2,b3}
                "umull2 v12.8h, v1.16b, v4.16b\\n"
                "umull2 v13.8h, v1.16b, v5.16b\\n"
                "umull2 v14.8h, v1.16b, v6.16b\\n"
                "umull2 v15.8h, v1.16b, v7.16b\\n"

                 // Accumulate again
                "uadalp v16.4s, v8.8h\\n"
                "uadalp v17.4s, v9.8h\\n"
                "uadalp v18.4s, v10.8h\\n"
                "uadalp v19.4s, v11.8h\\n"
                "uadalp v20.4s, v12.8h\\n"
                "uadalp v21.4s, v13.8h\\n"
                "uadalp v22.4s, v14.8h\\n"
                "uadalp v23.4s, v15.8h\\n"

                // Second half

                // Lower part of a2 * {b0,b1,b2,b3}
                "umull v8.8h, v2.8b, v4.8b\\n"
                "umull v9.8h, v2.8b, v5.8b\\n"
                "umull v10.8h, v2.8b, v6.8b\\n"
                "umull v11.8h, v2.8b, v7.8b\\n"

                // Lower part of a3 * {b0,b1,b2,b3}
                "umull v12.8h, v3.8b, v4.8b\\n"
                "umull v13.8h, v3.8b, v5.8b\\n"
                "umull v14.8h, v3.8b, v6.8b\\n"
                "umull v15.8h, v3.8b, v7.8b\\n"

                // Accumulate
                "uadalp v24.4s, v8.8h\\n"
                "uadalp v25.4s, v9.8h\\n"
                "uadalp v26.4s, v10.8h\\n"
                "uadalp v27.4s, v11.8h\\n"
                "uadalp v28.4s, v12.8h\\n"
                "uadalp v29.4s, v13.8h\\n"
                "uadalp v30.4s, v14.8h\\n"
                "uadalp v31.4s, v15.8h\\n"

                // Higher part of a2 * {b0,b1,b2,b3}
                "umull2 v8.8h, v2.16b, v4.16b\\n"
                "umull2 v9.8h, v2.16b, v5.16b\\n"
                "umull2 v10.8h, v2.16b, v6.16b\\n"
                "umull2 v11.8h, v2.16b, v7.16b\\n"

                // Higher part of a3 * {b0,b1,b2,b3}
                "umull2 v12.8h, v3.16b, v4.16b\\n"
                "umull2 v13.8h, v3.16b, v5.16b\\n"
                "umull2 v14.8h, v3.16b, v6.16b\\n"
                "umull2 v15.8h, v3.16b, v7.16b\\n"

                // Accumulate again
                "uadalp v24.4s, v8.8h\\n"
                "uadalp v25.4s, v9.8h\\n"
                "uadalp v26.4s, v10.8h\\n"
                "uadalp v27.4s, v11.8h\\n"
                "uadalp v28.4s, v12.8h\\n"
                "uadalp v29.4s, v13.8h\\n"
                "uadalp v30.4s, v14.8h\\n"
                "uadalp v31.4s, v15.8h\\n"
    """
    blockA = min(64, M * 16)
    blockB = min(64, N * 16)

    cc_code += """
                // Increment pointers and decrement k
                "add %[a_ptr], %[a_ptr], #{0}\\n"
                "add %[b_ptr], %[b_ptr], #{1}\\n"
                "subs %w[k], %w[k], #1\\n"
    """.format(blockA, blockB)

    stepC = min(4, N)

    cc_code += """
                "cbnz %w[k], 1b\\n"

                // Final additions

                // v16 contains the four partial sums of a[0, 0:K].*b[0,0:K], let's call them (a,b,c,d)
                // v17 contains the four partial sums of a[0, 0:K].*b[1,0:K], let's call them (e,f,g,h)
                // v18 contains the four partial sums of a[0, 0:K].*b[2,0:K], let's call them (i,j,k,l)
                // v19 contains the four partial sums of a[0, 0:K].*b[3,0:K], let's call them (m,n,o,p)
                "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b, c+d, e+f, g+h)
                "addp v17.4s, v18.4s, v19.4s\\n" // v17 = (i+j, k+l, m+n, o+p)
                "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)

                // v20 contains the four partial sums of a[1, 0:K].*b[0,0:K], let's call them (a,b,c,d)
                // v21 contains the four partial sums of a[1, 0:K].*b[1,0:K], let's call them (e,f,g,h)
                // v22 contains the four partial sums of a[1, 0:K].*b[2,0:K], let's call them (i,j,k,l)
                // v23 contains the four partial sums of a[1, 0:K].*b[3,0:K], let's call them (m,n,o,p)
                "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b, c+d, e+f, g+h)
                "addp v21.4s, v22.4s, v23.4s\\n" // v21 = (i+j, k+l, m+n, o+p)
                "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)

                // v24 contains the four partial sums of a[2, 0:K].*b[0,0:K], let's call them (a,b,c,d)
                // v25 contains the four partial sums of a[2, 0:K].*b[1,0:K], let's call them (e,f,g,h)
                // v26 contains the four partial sums of a[2, 0:K].*b[2,0:K], let's call them (i,j,k,l)
                // v27 contains the four partial sums of a[2, 0:K].*b[3,0:K], let's call them (m,n,o,p)
                "addp v24.4s, v24.4s, v25.4s\\n"  // v24 = (a+b, c+d, e+f, g+h)
                "addp v25.4s, v26.4s, v27.4s\\n"  // v25 = (i+j, k+l, m+n, o+p)
                "addp v24.4s, v24.4s, v25.4s\\n"  // v24 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)

                // v28 contains the four partial sums of a[3, 0:K].*b[0,0:K], let's call them (a,b,c,d)
                // v29 contains the four partial sums of a[3, 0:K].*b[1,0:K], let's call them (e,f,g,h)
                // v30 contains the four partial sums of a[3, 0:K].*b[2,0:K], let's call them (i,j,k,l)
                // v31 contains the four partial sums of a[3, 0:K].*b[3,0:K], let's call them (m,n,o,p)
                "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b, c+d, e+f, g+h)
                "addp v29.4s, v30.4s, v31.4s\\n" // v29 = (i+j, k+l, m+n, o+p)
                "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p)

                "str q16, [%[c_ptr]]\\n"
            """

    if M > 1:
        cc_code += ' "str q20, [%[c_ptr], #{0}]\\n" '.format(stepC * 4)

    if M > 2:
        cc_code += ' "str q24, [%[c_ptr], #{0}]\\n" '.format(stepC * 8)

    if M > 3:
        cc_code += ' "str q28, [%[c_ptr], #{0}]\\n" '.format(stepC * 12)

    cc_code += """
             : [c_ptr] "+r" (c_ptr), [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [k] "+r" (k)
             :
             : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
                    "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
                    "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
                    "v27", "v28", "v29", "v30", "v31"
             );
        return 0;
        }
    """

    if data_type == 'int8':
        cc_code = cc_code.replace('unsigned char', 'char')
        cc_code = cc_code.replace('umull', 'smull')
        cc_code = cc_code.replace('uadalp', 'sadalp')

    temp = util.tempdir()
    ll_path = temp.relpath("temp.ll")
    # Create LLVM IR from the C source code
    ll_code = clang.create_llvm(
        cc_code,
        options=["-mtriple=aarch64-linux-gnu", "-mattr=+neon"],
        output=ll_path)
    return ll_code
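The LLVM IR string returned above is typically attached to a schedule via the "import_llvm" pragma, the same mechanism TVM's tensorize tutorial uses. A minimal sketch, assuming a schedule s with a stage C and an outer axis xo produced elsewhere; those names are placeholders:

# Hypothetical attachment of the generated IR to a schedule.
ll_code = gemv_quantized_impl(4, 4, data_type='uint8')
s[C].pragma(xo, "import_llvm", ll_code)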
Example #42
0
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                            name="a",
                            dtype=env.acc_dtype)
        a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                            "a_buf")  # DRAM->SRAM
        max_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.max(a_buf(*i), 0),
                              "max_buf")  # relu: clip below at 0
        min_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.min(max_buf(*i),
                                                 (1 << (env.INP_WIDTH - 1)) - 1),
                              "min_buf")  # clip above at the input-type max
        res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: min_buf(*i).astype(env.inp_dtype),
                          "res")  # SRAM->DRAM
        # schedule
        s = tvm.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        a_np = np.random.randint(-256,
                                 256,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Relu execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #43
0
def create_micro_lib_base(out_obj_path,
                          in_src_path,
                          toolchain_prefix,
                          device_id,
                          lib_type,
                          options=None):
    """Compiles code into a binary for the target micro device.

    Parameters
    ----------
    out_obj_path : str
        path to generated object file

    in_src_path : str
        path to source file

    toolchain_prefix : str
        toolchain prefix to be used. For example, a prefix of
        "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as
        the compiler and "riscv64-unknown-elf-ld" is used as the linker,
        etc.

    device_id : str
        unique identifier for the target device

    lib_type : micro.LibType
        whether to compile a MicroTVM runtime or operator library

    options : List[str]
        additional options to pass to GCC
    """
    base_compile_cmd = [
        f"{toolchain_prefix}gcc",
        "-std=c11",
        "-Wall",
        "-Wextra",
        "--pedantic",
        "-c",
        "-O0",
        "-g",
        "-nostartfiles",
        "-nodefaultlibs",
        "-nostdlib",
        "-fdata-sections",
        "-ffunction-sections",
    ]
    if options is not None:
        base_compile_cmd += options

    src_paths = []
    include_paths = find_include_path() + [get_micro_host_driven_dir()]
    tmp_dir = _util.tempdir()
    # we might transform the src path in one of the branches below
    new_in_src_path = in_src_path
    if lib_type == LibType.RUNTIME:
        dev_dir = _get_device_source_dir(device_id)
        dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
        # there needs to be at least a utvm_timer.c file
        assert dev_src_paths
        assert "utvm_timer.c" in map(os.path.basename, dev_src_paths)
        src_paths += dev_src_paths
    elif lib_type == LibType.OPERATOR:
        # create a temporary copy of the source, so we can inject the dev lib
        # header without modifying the original.
        temp_src_path = tmp_dir.relpath("temp.c")
        with open(in_src_path, "r") as f:
            src_lines = f.read().splitlines()
        src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"")
        with open(temp_src_path, "w") as f:
            f.write("\n".join(src_lines))
        new_in_src_path = temp_src_path
        base_compile_cmd += ["-c"]
    else:
        raise RuntimeError("unknown lib type")

    src_paths += [new_in_src_path]

    for path in include_paths:
        base_compile_cmd += ["-I", path]

    prereq_obj_paths = []
    for src_path in src_paths:
        curr_obj_path = Path(src_path).with_suffix(".o").name
        assert curr_obj_path not in prereq_obj_paths
        prereq_obj_paths.append(curr_obj_path)
        curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
        run_cmd(curr_compile_cmd)

    ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
    ld_cmd += prereq_obj_paths
    ld_cmd += ["-o", out_obj_path]
    run_cmd(ld_cmd)
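A hedged usage sketch for the helper above; the toolchain prefix, paths, device id, and extra options are placeholders chosen to match the docstring, not values from the original:

# Hypothetical invocation of create_micro_lib_base.
create_micro_lib_base(
    out_obj_path="utvm_runtime.obj",
    in_src_path="utvm_runtime.c",
    toolchain_prefix="riscv64-unknown-elf-",
    device_id="riscv_dev0",
    lib_type=LibType.RUNTIME,
    options=["-march=rv64imac"])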
Example #44
0
def _listen_loop(sock, port, rpc_key, tracker_addr, load_library, custom_addr):
    """Listening loop of the server master."""
    def _accept_conn(listen_sock, tracker_conn, ping_period=2):
        """Accept connection from the other places.

        Parameters
        ----------
        listen_sock: Socket
            The socket used by listening process.

        tracker_conn : connection to tracker
            Tracker connection

        ping_period : float, optional
            Ping the tracker every ping_period seconds if no connection is accepted.
        """
        old_keyset = set()
        # Report resource to tracker
        if tracker_conn:
            matchkey = base.random_key(rpc_key + ":")
            base.sendjson(
                tracker_conn,
                [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr])
            assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS
        else:
            matchkey = rpc_key

        unmatch_period_count = 0
        unmatch_timeout = 4
        # Wait until we get a valid connection
        while True:
            if tracker_conn:
                trigger = select.select([listen_sock], [], [], ping_period)
                if listen_sock not in trigger[0]:
                    base.sendjson(tracker_conn,
                                  [TrackerCode.GET_PENDING_MATCHKEYS])
                    pending_keys = base.recvjson(tracker_conn)
                    old_keyset.add(matchkey)
                    # if match key not in pending key set
                    # it means the key is acquired by a client but not used.
                    if matchkey not in pending_keys:
                        unmatch_period_count += 1
                    else:
                        unmatch_period_count = 0
                    # regenerate match key if key is acquired but not used for a while
                    if unmatch_period_count * ping_period > unmatch_timeout + ping_period:
                        logger.info(
                            "no incoming connections, regenerate key ...")
                        matchkey = base.random_key(rpc_key + ":", old_keyset)
                        base.sendjson(tracker_conn, [
                            TrackerCode.PUT, rpc_key,
                            (port, matchkey), custom_addr
                        ])
                        assert base.recvjson(
                            tracker_conn) == TrackerCode.SUCCESS
                        unmatch_period_count = 0
                    continue
            conn, addr = listen_sock.accept()
            magic = struct.unpack("<i", base.recvall(conn, 4))[0]
            if magic != base.RPC_MAGIC:
                conn.close()
                continue
            keylen = struct.unpack("<i", base.recvall(conn, 4))[0]
            key = py_str(base.recvall(conn, keylen))
            arr = key.split()
            expect_header = "client:" + matchkey
            server_key = "server:" + rpc_key
            if arr[0] != expect_header:
                conn.sendall(struct.pack("<i", base.RPC_CODE_MISMATCH))
                conn.close()
                logger.warning("mismatch key from %s", addr)
                continue
            conn.sendall(struct.pack("<i", base.RPC_CODE_SUCCESS))
            conn.sendall(struct.pack("<i", len(server_key)))
            conn.sendall(server_key.encode("utf-8"))
            return conn, addr, _parse_server_opt(arr[1:])

    # Server logic
    tracker_conn = None
    while True:
        try:
            # step 1: setup tracker and report to tracker
            if tracker_addr and tracker_conn is None:
                tracker_conn = base.connect_with_retry(tracker_addr)
                tracker_conn.sendall(struct.pack("<i", base.RPC_TRACKER_MAGIC))
                magic = struct.unpack("<i", base.recvall(tracker_conn, 4))[0]
                if magic != base.RPC_TRACKER_MAGIC:
                    raise RuntimeError("%s is not RPC Tracker" %
                                       str(tracker_addr))
                # report status of current queue
                cinfo = {"key": "server:" + rpc_key}
                base.sendjson(tracker_conn, [TrackerCode.UPDATE_INFO, cinfo])
                assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS

            # step 2: wait for in-coming connections
            conn, addr, opts = _accept_conn(sock, tracker_conn)
        except (socket.error, IOError):
            # retry when tracker is dropped
            if tracker_conn:
                tracker_conn.close()
                tracker_conn = None
            continue
        except RuntimeError as exc:
            raise exc

        # step 3: serving
        work_path = util.tempdir()
        logger.info("connection from %s", addr)
        server_proc = multiprocessing.Process(target=_serve_loop,
                                              args=(conn, addr, load_library,
                                                    work_path))
        server_proc.daemon = True
        server_proc.start()
        # close from our side.
        conn.close()
        # wait until server process finish or timeout
        server_proc.join(opts.get("timeout", None))
        if server_proc.is_alive():
            logger.info("Timeout in RPC session, kill..")
            # pylint: disable=import-outside-toplevel
            import psutil
            parent = psutil.Process(server_proc.pid)
            # terminate worker children
            for child in parent.children(recursive=True):
                child.terminate()
            # terminate the worker
            server_proc.terminate()
        work_path.remove()
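The server loop above makes the wire protocol visible: a client sends the RPC magic, then a length-prefixed key of the form "client:<matchkey>", and expects a response code followed by the server's length-prefixed key. A client-side sketch of that handshake, inferred from the checks in _accept_conn (base and py_str are the same helpers the server uses; host and port are placeholders):

import socket
import struct

def client_handshake(host, port, matchkey):
    # Hypothetical mirror of the server-side checks in _accept_conn.
    sock = socket.create_connection((host, port))
    sock.sendall(struct.pack("<i", base.RPC_MAGIC))
    key = "client:" + matchkey
    sock.sendall(struct.pack("<i", len(key)))
    sock.sendall(key.encode("utf-8"))
    code = struct.unpack("<i", base.recvall(sock, 4))[0]
    assert code == base.RPC_CODE_SUCCESS, "server rejected the key"
    keylen = struct.unpack("<i", base.recvall(sock, 4))[0]
    server_key = py_str(base.recvall(sock, keylen))
    return sock, server_key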
Example #45
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host',
                        type=str,
                        required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port',
                        type=int,
                        required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level',
                        type=int,
                        default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter',
                        type=int,
                        default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level
    target = "llvm -target=armv7l-none-linux-anueabihf -mcpu=cortex-a53 -mattr=+neon"

    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(batch_size=1,
                                                       image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.target.rasp():
            graph, lib, params = nnvm.compiler.build(
                net, target, shape={"data": data_shape}, params=params)

    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid CPU overheating
        time.sleep(45)
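Because the benchmark is driven by argparse, one plausible way to exercise it from another script is to populate sys.argv before calling main(); the host and port below are placeholders:

import sys

# Hypothetical invocation of main(); substitute your Pi's address.
sys.argv = ["benchmark.py", "--model", "resnet",
            "--host", "192.168.1.42", "--port", "9090",
            "--opt-level", "2", "--num-iter", "10"]
main()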
Example #46
0
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net,
                                            target=target,
                                            target_host=target_host,
                                            shape={'data': input_shape},
                                            dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,
                                                     nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net,
                target=target,
                target_host=target_host,
                shape={'data': input_shape},
                params=params,
                dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key,
                                                'localhost',
                                                9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        module = runtime.create(graph, rlib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**rparams)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #47
0
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N
    A = te.placeholder((N, N), name='A')
    B = te.placeholder((N, N), name='Btmp')
    k = te.reduce_axis((0, N), name='k')

    packedB = te.compute((N, N // bn, bn),
                         lambda x, y, z: B[x, y * bn + z],
                         name='B')

    C = te.compute(
        (N, N),
        lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k),
        name='C')

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
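evaluate is called above but not defined in the excerpt. A plausible reconstruction using time_evaluator plus a NumPy reference check; the body is an assumption, not the original helper:

# Hypothetical reconstruction of the missing evaluate helper.
def evaluate(f, ctx, N, times):
    a_np = np.random.uniform(size=(N, N)).astype("float32")
    b_np = np.random.uniform(size=(N, N)).astype("float32")
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((N, N), dtype="float32"), ctx)
    time_f = f.time_evaluator(f.entry_name, ctx, number=times)
    cost = time_f(a, b, c).mean
    print("%g secs/op, %g GFLOPS" % (cost, 2.0 * N**3 / cost / 1e9))
    # C[ii, jj] reduces A[ii, k] * B[k, jj], so NumPy's dot is the reference.
    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np), rtol=1e-3)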
Example #48
0
def test_mobilenet():
    temp = util.tempdir()
    image, synset = prepare_input()
    model, params = get_model("mobilenetv2_1.0", image.shape)

    def run(mod, target):
        with relay.build_config(opt_level=3):
            lib = relay.build(mod,
                              target=target,
                              target_host=target_host,
                              params=params)
        path_dso = temp.relpath("deploy.dylib")
        lib.export_library(path_dso, xcode.create_dylib, arch=arch, sdk=sdk)
        xcode.codesign(path_dso)

        # Start RPC test server that contains the compiled library.
        xcode.popen_test_rpc(proxy_host,
                             proxy_port,
                             key,
                             destination=destination,
                             libs=[path_dso])

        # connect to the proxy
        remote = rpc.connect(proxy_host, proxy_port, key=key)

        if target == "metal":
            ctx = remote.metal(0)
        else:
            ctx = remote.cpu(0)
        lib = remote.load_module("deploy.dylib")
        m = graph_runtime.GraphModule(lib["default"](ctx))

        m.set_input("data", tvm.nd.array(image, ctx))
        m.run()
        tvm_output = m.get_output(0)
        top1 = np.argmax(tvm_output.asnumpy()[0])
        print("TVM prediction top-1:", top1, synset[top1])

        # evaluate
        ftimer = m.module.time_evaluator("run", ctx, number=3, repeat=10)
        prof_res = np.array(ftimer().results) * 1000
        print("%-19s (%s)" %
              ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))

    def annotate(func, compiler):
        """
        An annotator for Core ML.
        """
        # Bind free variables to the constant values.
        bind_dict = {}
        for arg in func.params:
            name = arg.name_hint
            if name in params:
                bind_dict[arg] = relay.const(params[name])

        func = relay.bind(func, bind_dict)

        # Annotate the entire graph for Core ML
        mod = tvm.IRModule()
        mod["main"] = func

        seq = tvm.transform.Sequential([
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
            transform.AnnotateTarget(compiler),
            transform.MergeCompilerRegions(),
            transform.PartitionGraph(),
        ])

        with relay.build_config(opt_level=3):
            mod = seq(mod)

        return mod

    # CPU
    run(model, target_host)
    # Metal
    run(model, "metal")
    # CoreML
    run(annotate(model, "coremlcompiler"), target_host)
Example #49
0
        def check_alu(tvm_op, np_op=None, use_imm=False):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                name="a",
                                dtype=env.acc_dtype)
            a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: a(*i), "a_buf")  # DRAM->SRAM
            if use_imm:
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), imm),
                                      "res_buf")  # compute
            else:
                b = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                    name="b",
                                    dtype=env.acc_dtype)
                b_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                    lambda *i: b(*i), "b_buf")  # DRAM->SRAM
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                                      "res_buf")  # compute
            res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: res_buf(*i).astype(env.inp_dtype),
                              "res")  # SRAM->DRAM
            # schedule
            s = tvm.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
                else:
                    mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            ctx = remote.ext_dev(0)
            a_np = np.random.randint(-16,
                                     16,
                                     size=(m, n, env.BATCH,
                                           env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16,
                                         16,
                                         size=(m, n, env.BATCH,
                                               env.BLOCK_OUT)).astype(b.dtype)
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, ctx)
            res_nd = tvm.nd.array(
                np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype),
                ctx)

            if env.TARGET == "tsim":
                simulator.tsim_init("libvta_hw")

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, ctx)
                f(a_nd, b_nd, res_nd)
            np.testing.assert_equal(res_np, res_nd.asnumpy())
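VTA's test suite drives this helper with pairs of TVM and NumPy ops; calls along the following lines would be plausible here as well (illustrative, assuming the usual imports):

# Plausible invocations of check_alu.
check_alu(lambda x, y: x << y, np.left_shift, use_imm=True)
check_alu(tvm.max, np.maximum, use_imm=True)
check_alu(tvm.max, np.maximum)
check_alu(lambda x, y: x + y, use_imm=True)
check_alu(lambda x, y: x + y)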
Example #50
0
def run_conv2d_transpose(env,
                         remote,
                         wl,
                         target,
                         check_correctness=True,
                         print_ir=False,
                         samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)

    # Derive shapes depending upon packing

    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT,
                        wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel,
                        env.BLOCK_OUT, env.BLOCK_IN)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = topi.nn.conv2d_transpose_nchw(data, kernel,
                                            (wl.hstride, wl.wstride),
                                            (wl.hpad, wl.wpad), env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = topi.generic.schedule_conv2d_transpose_nchw([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min,
                                 w_max,
                                 size=(wl.in_filter, wl.out_filter, wl.hkernel,
                                       wl.wkernel)).astype(kernel.dtype)
        r_np = topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose(
                                      (0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.in_filter // env.BLOCK_IN,
                                      env.BLOCK_IN,
                                      wl.out_filter // env.BLOCK_OUT,
                                      env.BLOCK_OUT, wl.hkernel,
                                      wl.wkernel).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d_transpose")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d_transpose")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d_transpose", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
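The wl argument above is a workload record carrying the fields the function reads. A sketch of how such a workload might be declared and passed in; the namedtuple and the DCGAN-like sizes are assumptions for illustration:

from collections import namedtuple

# Hypothetical workload record; field names follow the attribute
# accesses inside run_conv2d_transpose.
Workload = namedtuple("Conv2DTransposeWorkload",
                      ["batch", "height", "width", "in_filter", "out_filter",
                       "hkernel", "wkernel", "hpad", "wpad",
                       "hstride", "wstride"])

wl = Workload(batch=1, height=4, width=4, in_filter=1024, out_filter=512,
              hkernel=4, wkernel=4, hpad=1, wpad=1, hstride=2, wstride=2)
correct, cost, stats = run_conv2d_transpose(env, remote, wl, target)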
Example #51
0
def tune_and_evaluate(tuning_opt):

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(relay.op.get("nn.conv2d"),),
                                              target=target,
                                              target_host=env.target_host)

    # filter out non-packed conv2d task
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter, hkernel, wkernel,
            hpad, wpad, hstride, wstride))

    # We do not run the tuning on our web server since it takes too long.
    # Comment out the following line to run the tuning yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(relay_prog,
                                                target=target,
                                                params=params,
                                                target_host=env.target_host)
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #52
0
def deploy_rpc():
    """Runs the demo that deploys a model remotely through RPC.
    """
    from tvm import rpc
    from tvm.contrib import util, emscripten

    # As usual, load the resnet18 model.
    net, params, data_shape, out_shape = load_mxnet_resnet()

    # Compile the model.
    # Note that this time we are changing the target.
    # This is because we want to translate the host library into JavaScript
    # through Emscripten.
    graph, lib, params = compile_net(
        net,
        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
        target="opengl",
        data_shape=data_shape,
        params=params)

    # Now we want to deploy our model through RPC.
    # First we need to prepare the module files locally.
    print("Saving the compiled module...")

    temp = util.tempdir()
    path_obj = temp.relpath("deploy.bc")  # host LLVM part
    path_dso = temp.relpath("deploy.js")  # host JavaScript part
    path_gl = temp.relpath("deploy.gl")  # device GLSL part
    path_json = temp.relpath("deploy.tvm_meta.json")

    lib.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    lib.imported_modules[0].save(path_gl)

    print("- Saved files:", temp.listdir())

    # Connect to the RPC server.
    print("Connecting to RPC server...")
    proxy_host = 'localhost'
    proxy_port = 9090
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    print("- Connected to RPC server!")

    # Upload module to RPC server.
    print("Uploading module to RPC server...")
    remote.upload(path_dso, "deploy.dso")
    remote.upload(path_gl)
    remote.upload(path_json)
    print("- Upload completed!")

    # Load remote library.
    print("Loading remote library...")
    fdev = remote.load_module("deploy.gl")
    fhost = remote.load_module("deploy.dso")
    fhost.import_module(fdev)
    rlib = fhost
    print("- Remote library loaded!")

    ctx = remote.opengl(0)

    # Upload the parameters.
    print("Uploading parameters...")
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    print("- Parameters uploaded!")

    # Create the remote runtime module.
    print("Running remote module...")
    from tvm.contrib import graph_runtime
    module = graph_runtime.create(graph, rlib, ctx)

    # Set parameter.
    module.set_input(**rparams)

    # Set input data.
    input_data = np.random.uniform(size=data_shape)
    module.set_input('data', tvm.nd.array(input_data.astype('float32')))

    # Run.
    module.run()
    print("- Remote module execution completed!")

    out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
    # Print first 10 elements of output.
    print(out.asnumpy()[0][0:10])
Example #53
0
        # Compile network
        print("Compiling network with best tuning parameters...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed, build with the debug runtime
        if opt.debug_profile:
            m = debug_runtime.create(graph, lib, ctx)
        else:
            m = graph_runtime.create(graph, lib, ctx)

        # Set the network parameters and synthetic input
        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
        m.set_input(**params)
        m.set_input("data", image)
    def compile_model(self):
        if device == 'vta':
            self.remote = rpc.connect(self.pynq_addr, 9091)
            vta.reconfig_runtime(self.remote)
            vta.program_fpga(self.remote, bitstream=None)
        else:
            self.remote = rpc.LocalSession()

        self.ctx = (self.remote.ext_dev(0) if device == 'vta'
                    else self.remote.cpu(0))

        # Load pre-configured AutoTVM schedules
        with autotvm.tophub.context(target):

            # Populate the shape and data type dictionary for ResNet input
            dtype_dict = {'data': 'float32'}
            shape_dict = {'data': (env.BATCH, 3, 224, 224)}

            if args.nonsplit:
                gluon_model = vision.resnet18_v1(pretrained=True, ctx=ctx).features
            else:
                gluon_model = splitnet.resnet18_v1_split(self.id + 1)

            # Measure build start time
            build_start = time.time()

            # Start front end compilation
            mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

            # Update shape and type dictionary
            shape_dict.update({k: v.shape for k, v in params.items()})
            dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

            # Perform quantization in Relay
            with relay.quantize.qconfig(global_scale=8.0,
                                        skip_conv_layers=[0]):
                relay_prog = relay.quantize.quantize(mod['main'],
                                                     params=params)

            # Perform graph packing and constant folding for VTA target
            if target.device_name == 'vta':
                assert env.BLOCK_IN == env.BLOCK_OUT
                relay_prog = graph_pack(relay_prog,
                                        env.BATCH,
                                        env.BLOCK_OUT,
                                        env.WGT_WIDTH,
                                        start_name=start_pack,
                                        stop_name=stop_pack)

            # Compile Relay program with AlterOpLayout disabled
            with relay.build_config(opt_level=3,
                                    disabled_pass={'AlterOpLayout'}):
                if target.device_name != 'vta':
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)
                else:
                    with vta.build_config():
                        graph, lib, params = relay.build(
                            relay_prog,
                            target=target,
                            params=params,
                            target_host=env.target_host)

            self.params = params

            # Measure Relay build time
            build_time = time.time() - build_start
            print(f'inference graph for thread {self.id} built in '
                  f'{build_time:.4f}s!')

            # Send the inference library over to the remote RPC server
            temp = util.tempdir()
            lib.save(temp.relpath('graphlib.o'))
            self.remote.upload(temp.relpath('graphlib.o'))
            lib = self.remote.load_module('graphlib.o')

            # Graph runtime
            self.m = graph_runtime.create(graph, lib, self.ctx)
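            # --- Hedged follow-up sketch (not in the original method) ---
            # Feed a synthetic input and run the freshly created module;
            # the input name 'data' and its shape come from shape_dict above.
            image = tvm.nd.array(
                np.random.uniform(size=shape_dict['data']).astype('float32'))
            self.m.set_input(**self.params)
            self.m.set_input('data', image)
            self.m.run()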
Example #55
def main():
    # extract workloads from relay program
    input_shape = (1, 3, 224, 224)
    print("Extrack tasks...")
    mod, params = get_workload(image_shape=input_shape[1:],
                               batch_size=input_shape[0])

    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params,
                                              ops=(
                                                  relay.op.nn.conv2d,
                                                  relay.op.nn.dense,
                                              ))

    # run tuning tasks
    print("Tuning...")

    tune_tasks(tasks, **tuning_option)

    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=0):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params, target_host=target_host)

            tmp = tempdir()
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

            remote = autotvm.measure.request_remote(device_key,
                                                    '0.0.0.0',
                                                    9192,
                                                    timeout=10000)
            remote.upload(tmp.relpath(filename))
            rlib = remote.load_module(filename)

            ctx = remote.context(str(target), 0)
            module = runtime.create(graph, rlib, ctx)
            data_tvm = tvm.nd.array(
                (np.random.uniform(size=input_shape)).astype(dtype))

            print("Run...")
            print("Set_input(\"data\")")
            module.set_input('data', data_tvm)
            print("Set_input(**param)")
            module.set_input(**params)

            # Evaluate inference time. Run the timer once and reuse its
            # results: every ftimer() call re-runs the whole benchmark.
            print("Evaluate inference time cost...")
            ftimer = module.module.time_evaluator("run",
                                                  ctx,
                                                  number=1,
                                                  repeat=600)
            results = ftimer().results
            prof_res = np.array(results) * 1000  # convert to millisecond
            print("Best time: %.6f s" % min(results))
            print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                  (np.mean(prof_res), np.std(prof_res)))
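# --- Hedged sketch of the tune_tasks helper assumed by main() ---
# Example #55 calls tune_tasks() without showing it. This is a minimal
# version in the spirit of the TVM AutoTVM tutorials, not the original
# code; `measure_option` and `log_filename` are assumed to be supplied
# through tuning_option.
def tune_tasks(tasks, measure_option, tuner='xgb', n_trial=1000,
               early_stopping=None, log_filename='tuning.log', **kwargs):
    for i, task in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner_obj = autotvm.tuner.XGBTuner(task, loss_type='rank')
        tuner_obj.tune(n_trial=min(n_trial, len(task.config_space)),
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(log_filename)])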
Example #56
def generate_graph(graph_fn, params_fn, device="vta"):

    # Measure build start time
    build_start = time.time()

    # Derive the TVM target
    target = tvm.target.create("llvm -device={}".format(device))

    # Derive the LLVM compiler flags
    # When targeting the Pynq, cross-compile to the ARMv7 ISA
    if env.TARGET == "sim":
        target_host = "llvm"
    elif env.TARGET == "pynq":
        target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon"

    # Load the ResNet-18 graph and parameters
    sym = nnvm.graph.load_json(open(graph_fn).read())
    params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read())

    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Create NNVM graph
    graph = nnvm.graph.create(sym)
    graph_attr.set_shape_inputs(sym, shape_dict)
    graph_attr.set_dtype_inputs(sym, dtype_dict)
    graph = graph.apply("InferShape").apply("InferType")

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        if target.device_name != "vta":
            graph, lib, params = nnvm.compiler.build(
                sym, target, shape_dict, dtype_dict,
                params=params, target_host=target_host)
        else:
            with vta.build_config():
                graph, lib, params = nnvm.compiler.build(
                    sym, target, shape_dict, dtype_dict,
                    params=params, target_host=target_host)

    # Save the compiled inference graph library
    assert tvm.module.enabled("rpc")
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))

    # Send the inference library over to the remote RPC server
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

    # Measure build time
    build_time = time.time() - build_start
    print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time))

    return graph, lib, params
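# --- Hedged usage sketch (not part of the original function) ---
# Create the runtime from the returned triple and run one inference.
# The file names 'resnet18.json' / 'resnet18.params' are assumptions,
# and `remote`, `env`, `graph_runtime` are the globals the function
# above already relies on.
graph, lib, params = generate_graph("resnet18.json", "resnet18.params")
ctx = remote.ext_dev(0) if env.TARGET != "sim" else remote.cpu(0)
m = graph_runtime.create(graph, lib, ctx)
m.set_input(**params)
m.set_input("data", tvm.nd.array(
    np.random.uniform(size=(1, 3, 224, 224)).astype("float32")))
m.run()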
Example #57
def check_local(coreml_model):
    temp = util.tempdir()
    compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir)
    ctx = tvm.cpu(0)
    verify(coreml_model, compiled_model, ctx)
Example #58
def __init__(self, model_dir):
    self._tmp_dir = util.tempdir()
    self._model_dir = model_dir
    self._graph = graph_pb2.GraphDef()
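# --- Hedged companion sketch (not in the original fragment) ---
# A plausible loader for the GraphDef created above; the frozen-graph
# file name 'model.pb' is an assumption.
def load_model(self, file_name="model.pb"):
    import os
    path = os.path.join(self._model_dir, file_name)
    with open(path, "rb") as f:
        self._graph.ParseFromString(f.read())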
Example #59
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from shutil import which
import json
import pytest
import sys
import numpy as np

import tvm
import tvm.runtime._ffi_api
from tvm import relay
from tvm.contrib import util

tmp_path = util.tempdir()


def generate_csource_module():
    """Mock the codegen with an external library (e.g., CBLAS/cuDNN)"""

    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <tvm/runtime/packed_func.h>
    #include <dlpack/dlpack.h>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)           \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
Example #60
    def export_library(self, file_name, fcompile=None, **kwargs):
        """Export the module and its imported device code one library.

        This function only works on host llvm modules.
        It will pack all the imported modules

        Parameters
        ----------
        file_name : str
            The name of the shared library.

        fcompile : function(target, file_list, kwargs), optional
            Compilation function used to create the dynamic library.
            If fcompile has the attribute object_format, the host library
            is compiled to that format; otherwise the default format "o" is used.

        kwargs : dict, optional
            Additional arguments passed to fcompile
        """
        # NOTE: this function depends on contrib library features
        # which are only available in a full TVM build (not runtime-only).
        if _RUNTIME_ONLY:
            raise RuntimeError(
                "Cannot call export_library in runtime only mode")
        # Extra dependencies during runtime.
        from pathlib import Path
        from tvm.contrib import cc as _cc, tar as _tar, util as _util

        if isinstance(file_name, Path):
            file_name = str(file_name)

        if self.type_key == "stackvm":
            if not file_name.endswith(".stackvm"):
                raise ValueError(
                    "Module[%s]: can only be saved as stackvm format. "
                    "Did you build with LLVM enabled?" % self.type_key)
            self.save(file_name)
            return

        modules = self._collect_dso_modules()
        temp = _util.tempdir()
        files = []
        is_system_lib = False
        has_c_module = False
        llvm_target_triple = None
        for index, module in enumerate(modules):
            if fcompile is not None and hasattr(fcompile, "object_format"):
                object_format = fcompile.object_format
            else:
                if module.type_key == "llvm":
                    object_format = "o"
                else:
                    assert module.type_key == "c"
                    object_format = "cc"
                    has_c_module = True
            path_obj = temp.relpath("lib" + str(index) + "." + object_format)
            module.save(path_obj)
            files.append(path_obj)
            is_system_lib = (module.type_key == "llvm" and
                             module.get_function("__tvm_is_system_module")())
            llvm_target_triple = (module.type_key == "llvm" and
                                  module.get_function("_get_target_triple")())
        if not fcompile:
            if file_name.endswith(".tar"):
                fcompile = _tar.tar
            else:
                fcompile = _cc.create_shared

        if llvm_target_triple is None and hasattr(fcompile,
                                                  "get_target_triple"):
            llvm_target_triple = fcompile.get_target_triple()

        if self.imported_modules:
            if enabled("llvm") and llvm_target_triple:
                path_obj = temp.relpath("devc.o")
                m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib,
                                                     llvm_target_triple)
                m.save(path_obj)
                files.append(path_obj)
            else:
                path_cc = temp.relpath("devc.cc")
                with open(path_cc, "w") as f:
                    f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib))
                files.append(path_cc)

        if has_c_module:
            options = []
            if "options" in kwargs:
                opts = kwargs["options"]
                options = opts if isinstance(opts, (list, tuple)) else [opts]
            opts = options + ["-I" + path for path in find_include_path()]
            kwargs.update({'options': opts})

        fcompile(file_name, files, **kwargs)
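# --- Hedged usage sketch (not part of the original file) ---
# The two default export paths the method above selects between:
# a shared library via cc.create_shared, or a plain tar of the object
# files. `s`, `A`, `B`, `C` are assumed schedule/tensor names.
mod = tvm.build(s, [A, B, C], "llvm", name="myadd")
mod.export_library("myadd.so")   # compiled with cc.create_shared
mod.export_library("myadd.tar")  # packed with tar, no compiler needed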