Example 1
def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug):
    """Test AutoTune for microTVM Zephyr"""
    if board != "qemu_x86":
        pytest.xfail(f"Autotune fails on {board}.")

    runtime = Runtime("crt", {"system-lib": True})
    model = test_utils.ZEPHYR_BOARDS[board]
    build_config = {"debug": tvm_debug}

    # Create a Relay model
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(data_shape[0], data_shape[1], data_shape[2],
                                 data_shape[3]).astype("float32")
    weight_sample = np.random.rand(weight_shape[0], weight_shape[1],
                                   weight_shape[2],
                                   weight_shape[3]).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    target = tvm.target.target.micro(model)
    pass_context = tvm.transform.PassContext(
        opt_level=3, config={"tir.disable_vectorize": True})
    with pass_context:
        tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target)
    assert len(tasks) > 0

    config_main_stack_size = None
    if test_utils.qemu_boards(board):
        config_main_stack_size = 1536

    project_options = {
        "zephyr_board": board,
        "west_cmd": west_cmd,
        "verbose": 1,
        "project_type": "host_driven",
    }
    if config_main_stack_size is not None:
        project_options["config_main_stack_size"] = config_main_stack_size

    module_loader = tvm.micro.AutoTvmModuleLoader(
        template_project_dir=test_utils.TEMPLATE_PROJECT_DIR,
        project_options=project_options,
    )

    timeout = 200
    builder = tvm.autotvm.LocalBuilder(
        timeout=timeout,
        n_parallel=1,
        build_kwargs={"build_option": {
            "tir.disable_vectorize": True
        }},
        do_fork=True,
        build_func=tvm.micro.autotvm_build_func,
        runtime=runtime,
    )
    runner = tvm.autotvm.LocalRunner(number=1,
                                     repeat=1,
                                     timeout=timeout,
                                     module_loader=module_loader)

    measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

    log_path = pathlib.Path("zephyr_autotune.log")
    if log_path.exists():
        log_path.unlink()

    n_trial = 10
    for task in tasks:
        tuner = tvm.autotvm.tuner.GATuner(task)
        tuner.tune(
            n_trial=n_trial,
            measure_option=measure_option,
            callbacks=[
                tvm.autotvm.callback.log_to_file(str(log_path)),
                tvm.autotvm.callback.progress_bar(n_trial, si_prefix="M"),
            ],
            si_prefix="M",
        )
        assert tuner.best_flops > 0

    check_tune_log(log_path)

    # Build without tuning
    with pass_context:
        lowered = tvm.relay.build(mod,
                                  target=target,
                                  runtime=runtime,
                                  params=params)

    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered,
                       build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered.get_graph_json(), session.get_system_lib(), session.device)
        graph_mod.set_input(**lowered.get_params())
        graph_mod.run(data=data_sample)
        expected_output = graph_mod.get_output(0).numpy()
        del graph_mod

    # Build using autotune logs
    with tvm.autotvm.apply_history_best(str(log_path)):
        with pass_context:
            lowered_tuned = tvm.relay.build(mod,
                                            target=target,
                                            runtime=runtime,
                                            params=params)

    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered_tuned,
                       build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered_tuned.get_graph_json(), session.get_system_lib(),
            session.device)
        graph_mod.set_input(**lowered_tuned.get_params())
        graph_mod.run(data=data_sample)
        output = graph_mod.get_output(0).numpy()
        del graph_mod

    tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5)
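The check_tune_log helper referenced above lives elsewhere in the test suite. A minimal sketch of such a check, assuming the standard autotvm JSON log protocol (each line is a JSON record whose "result" field is [costs, error_no, all_cost, timestamp]), might look like:

import json


def check_tune_log(log_path):
    """Hypothetical sketch: read the tuning log and sanity-check each record."""
    with open(log_path, "r") as f:
        lines = f.readlines()

    for line in lines:
        if len(line) > 0:
            record = json.loads(line)
            # error_no == 0 means the measurement succeeded.
            assert record["result"][1] == 0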
Example 2
def compile_and_run(mod,
                    input_list,
                    output_list,
                    use_calculated_workspaces,
                    params=None,
                    workspace_byte_alignment=8):
    """
    This method verifies the generated source
    """
    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}"
    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "

    # The calculated workspaces will not account for stack allocator tags used for debugging
    if not use_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    with tvm.transform.PassContext(opt_level=3,
                                   config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, target, target_host=target, params=params)

    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    tar_file = os.path.join(base_path, "test.tar")
    export_model_library_format(lib, tar_file)
    t = tarfile.open(tar_file)
    t.extractall(base_path)
    if use_calculated_workspaces:
        workspace_bytes = extract_main_workspace_sizebytes(base_path)
    else:
        workspace_bytes = 16384 * 1024

    for i in range(len(input_list)):
        create_header_file((f"input_data{i}"), input_list[i], build_path)

    for i in range(len(output_list)):
        create_header_file(
            (f"output_data{i}"),
            np.zeros(output_list[i].shape, output_list[i].dtype),
            build_path,
        )
        create_header_file((f"expected_output_data{i}"), output_list[i],
                           build_path)

    create_main("test.c", input_list, output_list, build_path, workspace_bytes)

    # Verify that the generated source compiles cleanly
    file_dir = os.path.dirname(os.path.abspath(__file__))
    makefile = os.path.join(file_dir, "aot_test.mk")
    make_cmd = (f"make CFLAGS='{cflags}' -f {makefile} build_dir=" +
                build_path + f" TVM_ROOT={file_dir}/../../../..")

    compile_log_path = os.path.join(build_path, "test_compile.log")
    ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path,
                                         False)
    assert ret == 0

    # Verify that the compiled binary runs successfully
    run_log_path = os.path.join(build_path, "test_run.log")
    ret = subprocess_with_stdout_and_log("./aot_test_runner", build_path,
                                         run_log_path, False)
    assert ret == 0
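A hypothetical invocation of the helper above; the tiny Relay module and shapes are assumptions for illustration, and the commented call additionally needs the AOT test harness from this file:

import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(1, 8), dtype="float32")
y = relay.var("y", shape=(1, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x, y], x + y))

x_np = np.random.rand(1, 8).astype("float32")
y_np = np.random.rand(1, 8).astype("float32")

# compile_and_run(mod, [x_np, y_np], [x_np + y_np], use_calculated_workspaces=True)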
Example 3
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(s, [data, kernel, res],
                        tvm.target.Target("ext_dev", host=env.target_host),
                        name="my_conv")
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly with integers in the
# range [-128, 128), in NCHW layout
data_np = np.random.randint(-128,
                            128,
                            size=(batch_size, in_channels, height,
                                  width)).astype(data.dtype)
# Shape names below (out_channels, kernel_h, kernel_w) are assumed to be
# defined earlier in the tutorial, mirroring the data_np initialization.
kernel_np = np.random.randint(-128,
                              128,
                              size=(out_channels, in_channels, kernel_h,
                                    kernel_w)).astype(kernel.dtype)
Example 4
def run_conv2d(env,
               remote,
               wl,
               target,
               check_correctness=True,
               print_ir=False,
               samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack
        conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        if data_pack:
            res = conv2d_fcompute(data, kernel, (wl.hstride, wl.wstride),
                                  padding, (1, 1), layout, env.acc_dtype)
        else:
            res = conv2d_fcompute(data, kernel, (wl.hstride, wl.wstride),
                                  padding, (1, 1), env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = conv2d_fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = (2 * wl.batch * fout_height * fout_width * wl.hkernel *
               wl.wkernel * wl.out_filter * wl.in_filter)

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min and max for the act, wgt, and bias types (max is non-inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = (
            0 - (1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)),
            1 << (env.INP_WIDTH + env.WGT_WIDTH - 2),
        )
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
        ).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    bias_arr = tvm.nd.array(bias_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
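The wl argument bundles the workload parameters that run_conv2d reads. A hypothetical container and invocation, with all sizes assumed for illustration (they must divide evenly by env.BATCH, env.BLOCK_IN and env.BLOCK_OUT when packing for VTA):

from collections import namedtuple

# Hypothetical workload type; the real test file defines its own, but these
# are exactly the fields run_conv2d accesses.
Workload = namedtuple(
    "Workload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride"],
)

wl = Workload(batch=1, height=56, width=56, in_filter=64, out_filter=64,
              hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
# correct, cost, stats = run_conv2d(env, remote, wl, target)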
Example 5
def test_export_model_library_format_llvm():
    with utils.TempDirectory.set_keep_for_debug(True):
        target = tvm.target.target.micro("host")
        assert str(target)[:2] == "c "
        target = tvm.target.Target("llvm " + str(target)[2:])
        with tvm.transform.PassContext(opt_level=3):
            relay_mod = tvm.parser.fromtext(
                """
            #[version = "0.0.5"]
            def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) {
            %0 = cast(%a, dtype="float32") + %b * %c;
            %0
            }"""
            )
            factory = tvm.relay.build(
                relay_mod,
                target,
                target_host=target,
                mod_name="add",
                params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
            )

        temp_dir = utils.tempdir()
        mlf_tar_path = temp_dir.relpath("lib.tar")
        import tvm.micro as micro

        micro.export_model_library_format(factory, mlf_tar_path)
        tf = tarfile.open(mlf_tar_path)

        extract_dir = temp_dir.relpath("extract")
        os.mkdir(extract_dir)
        tf.extractall(extract_dir)

        with open(os.path.join(extract_dir, "metadata.json")) as json_f:
            metadata = json.load(json_f)
            assert metadata["version"] == 5
            assert metadata["model_name"] == "add"
            export_datetime = datetime.datetime.strptime(
                metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
            )
            assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
            assert metadata["target"] == {"1": str(target)}
            assert metadata["memory"]["sids"] == [
                {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
                {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
                {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
                {"storage_id": 3, "size_bytes": 8},
            ]
            assert metadata["memory"]["functions"]["main"] == [
                {
                    "constants_size_bytes": 8,
                    "device": 1,
                    "io_size_bytes": 18,
                    "workspace_size_bytes": 0,
                }
            ]
            assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [
                {"device": 1, "workspace_size_bytes": 0}
            ]
            assert (
                "fused_cast_multiply_add"
                in metadata["memory"]["functions"]["operator_functions"][0]["function_name"]
            )

        assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "add_lib0.o"))

        validate_graph_json(extract_dir, factory)

        with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f:
            assert relay_f.read() == str(relay_mod)

        with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f:
            params = tvm.relay.load_param_dict(params_f.read())
            assert "p0" in params
Example 6
def tune_and_evaluate(tuning_opt):

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, tracker_port, timeout=10000
        )
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(
        mod,
        params=params,
        ops=(relay.op.get("nn.conv2d"),),
        target=target,
        target_host=env.target_host,
    )

    # filter out non-packed conv2d task
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print(
            "({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
                batch,
                height,
                width,
                in_filter,
                out_filter,
                hkernel,
                wkernel,
                hpad,
                wpad,
                hstride,
                wstride,
            )
        )

    # We do not run the tuning on our web server since it takes too long.
    # Comment out the following line to run it yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        print("Upload...")
        temp = utils.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.GraphModule(lib["default"](ctx))

        # upload parameters to device
        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
        m.set_input("data", image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print(
            "Mean inference time (std dev): %.2f ms (%.2f ms)"
            % (np.mean(prof_res), np.std(prof_res))
        )
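The tuning_opt dictionary and the tune_tasks helper are defined elsewhere in the tutorial. A sketch of plausible tuning options, following the autotvm measure-option convention (all values here are assumptions):

from tvm import autotvm

tuning_option = {
    "log_filename": log_file,  # assumed to be defined at module level
    "tuner": "random",
    "n_trial": 1000,
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.RPCRunner(
            env.TARGET,
            host=tracker_host,
            port=tracker_port,
            number=5,
            timeout=60,
        ),
    ),
}
# tune_and_evaluate(tuning_option)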
Example 7
def verify_result(
    mod,
    map_inputs,
    out_shape,
    tol=1e-5,
    target="llvm",
    ctx=tvm.cpu(),
    params=None,
    dpu_target="DPUCADX8G",
    tvm_ops=0,
):
    """Check the result between the reference and the BYOC Vitis-AI flow."""

    lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops)
    lib.export_library("tvm_lib.so")
    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu()))

    for name, data in map_inputs.items():
        rt_mod.set_input(name, data)
    rt_mod.set_input(**params)

    # Run ten timed inferences.
    for _ in range(10):
        start = time.time()
        rt_mod.run()
        end = time.time()
        print("Inference time: {:.2f} sec".format(end - start))

    out = rt_mod.get_output(0)
    # Report the top-1 prediction.
    top1 = np.argmax(out.asnumpy())
    print("TVM prediction top-1: {}".format(top1))

    temp = utils.tempdir()
    export_rt_mod_file = temp.relpath("vitis_ai.rtmod")

    # Export the library for the aarch64 target.
    # Other boards (e.g. "ultra96") could be targeted the same way.
    tvm_target = tvm.target.arm_cpu("zcu104")
    lib_kwargs = {
        "fcompile": contrib.cc.create_shared,
        "cc": "/usr/aarch64-linux-gnu/bin/ld",
    }

    with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.ext.vitis_ai.options.load_runtime_module": export_rt_mod_file}):
        lib_arm = relay.build(mod, target=tvm_target, params=params)

    lib_arm.export_library("tvm_dpu_arm.so", **lib_kwargs)
Example 8
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min and max for the act and wgt types (max is non-inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype),
                      w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(batch_size // env.BATCH, env.BATCH,
                                  in_feat // env.BLOCK_IN,
                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(out_feat // env.BLOCK_OUT, env.BLOCK_OUT,
                                      in_feat // env.BLOCK_IN,
                                      env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("dense", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
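A hypothetical invocation of run_gemm; the sizes are assumptions and must divide evenly by env.BATCH, env.BLOCK_IN and env.BLOCK_OUT when packing for VTA:

# env, remote and target come from the VTA test harness,
# e.g. env = vta.get_env() and target = env.target.
correct, cost, stats = run_gemm(env, remote, target,
                                batch_size=16, in_feat=512, out_feat=512)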
Example 9
def test_rpc_module():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n, ), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    temp = utils.tempdir()

    # Establish remote connection with target hardware
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=60)

    # Compile the Graph for CPU target
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso_cpu = temp.relpath("cpu_lib.so")
    f.export_library(path_dso_cpu, ndk.create_shared)

    # Execute the portable graph on cpu target
    print("Run CPU test ...")
    dev = remote.cpu(0)
    remote.upload(path_dso_cpu)
    f2 = remote.load_module("cpu_lib.so")
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
    cost = time_f(a, b).mean
    print("%g secs/op\n" % cost)
    np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the Graph for OpenCL target
    if test_opencl:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # To test only the CPU, skip this block and reuse the host-target
        # build above.
        f = tvm.build(s, [A, B],
                      tvm.target.Target("opencl", host=target),
                      name="myadd")
        path_dso_cl = temp.relpath("dev_lib_cl.so")
        f.export_library(path_dso_cl, ndk.create_shared)

        print("Run GPU(OpenCL Flavor) test ...")
        dev = remote.cl(0)
        remote.upload(path_dso_cl)
        f1 = remote.load_module("dev_lib_cl.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the Graph for Vulkan target
    if test_vulkan:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # To test only the CPU, skip this block and reuse the host-target
        # build above.
        f = tvm.build(s, [A, B],
                      tvm.target.Target("vulkan", host=target),
                      name="myadd")
        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
        f.export_library(path_dso_vulkan, ndk.create_shared)

        print("Run GPU(Vulkan Flavor) test ...")
        dev = remote.vulkan(0)
        remote.upload(path_dso_vulkan)
        f1 = remote.load_module("dev_lib_vulkan.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
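test_rpc_module relies on module-level configuration that is not shown. A hypothetical setup, modeled on the Android RPC sample (values are assumptions):

import os
from tvm.contrib import ndk  # requires the Android NDK toolchain

tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
tracker_port = int(os.environ.get("TVM_TRACKER_PORT", "9190"))
key = "android"
arch = "arm64"
target = "llvm -mtriple=%s-linux-android" % arch
test_opencl = False
test_vulkan = False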
Example 10
def compile_and_run_multiple_models(mod_map, input_list_map, output_list_map,
                                    target_options, param_map):
    """
    This method verifies the generated source
    """
    target = f"c -runtime=c --link-params --executor=aot {target_options}"
    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)
    for mod_name, mod in mod_map.items():

        with tvm.transform.PassContext(opt_level=3,
                                       config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(mod,
                                  target,
                                  target_host=target,
                                  params=param_map[mod_name],
                                  mod_name=mod_name)

        tar_file = os.path.join(base_path, "test.tar")
        export_model_library_format(lib, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        input_list = input_list_map[mod_name]
        output_list = output_list_map[mod_name]

        for i in range(len(input_list_map[mod_name])):
            create_header_file((f'{mangle_name(mod_name,"input_data")}{i}'),
                               input_list[i], build_path)

        for i in range(len(output_list_map[mod_name])):
            create_header_file(
                (f'{mangle_name(mod_name,"output_data")}{i}'),
                np.zeros(output_list[i].shape, output_list[i].dtype),
                build_path,
            )
            create_header_file(
                (f'{mangle_name(mod_name,"expected_output_data")}{i}'),
                output_list[i], build_path)

    create_main("test.c",
                input_list_map,
                output_list_map,
                build_path,
                workspace_bytes=16384 * 1024)

    # Verify that the generated sources compile cleanly
    file_dir = os.path.dirname(os.path.abspath(__file__))
    makefile = os.path.join(file_dir, "aot_test.mk")
    make_cmd = f"make -f {makefile} build_dir=" + build_path + f" TVM_ROOT={file_dir}/../../../.."

    compile_log_path = os.path.join(build_path, "test_compile.log")
    ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path,
                                         False)
    assert ret == 0

    # Verify that the compiled binary runs successfully
    run_log_path = os.path.join(build_path, "test_run.log")
    ret = subprocess_with_stdout_and_log("./aot_test_runner", build_path,
                                         run_log_path, False)
    assert ret == 0
Example 11
    def export_library(self,
                       file_name,
                       fcompile=None,
                       addons=None,
                       workspace_dir=None,
                       **kwargs):
        """Export the module and its imported device code one library.

        This function only works on host llvm modules.
        It will pack all the imported modules

        Parameters
        ----------
        file_name : str
            The name of the shared library.

        fcompile : function(target, file_list, kwargs), optional
            Compilation function used to create the dynamic library.
            If fcompile has an object_format attribute, the host library is
            compiled to that format. Otherwise, the default format "o" is used.

        workspace_dir : str, optional
            The path to a directory used to hold intermediary artifacts
            while exporting the library.
            If this is not provided, a temporary dir will be created.

        kwargs : dict, optional
            Additional arguments passed to fcompile

        Returns
        -------
        result of fcompile() : unknown, optional
            If the compilation function returns an artifact, it is passed
            through and returned by export_library.
        """
        # NOTE: this function depends on contrib library features
        # which are only available when the full TVM build is present.
        if _RUNTIME_ONLY:
            raise RuntimeError(
                "Cannot call export_library in runtime only mode")
        # Extra dependencies during runtime.
        from pathlib import Path
        from tvm.contrib import cc as _cc, tar as _tar, utils as _utils

        if isinstance(file_name, Path):
            file_name = str(file_name)

        if self.type_key == "stackvm":
            if not file_name.endswith(".stackvm"):
                raise ValueError(
                    "Module[%s]: can only be saved as stackvm format. "
                    "Did you build with LLVM enabled?" % self.type_key)
            self.save(file_name)
            return

        modules = self._collect_dso_modules()
        if workspace_dir is None:
            temp = _utils.tempdir()
            workspace_dir = temp.temp_dir
        files = addons if addons else []
        is_system_lib = False
        has_c_module = False
        llvm_target_triple = None
        for index, module in enumerate(modules):
            if fcompile is not None and hasattr(fcompile, "object_format"):
                if module.type_key == "c":
                    object_format = "c"
                    has_c_module = True
                else:
                    object_format = fcompile.object_format
            else:
                if module.type_key == "llvm":
                    object_format = "o"
                else:
                    assert module.type_key == "c"
                    object_format = "c"
                    if "cc" in kwargs:
                        if kwargs["cc"] == "nvcc":
                            object_format = "cu"
                    has_c_module = True
            path_obj = os.path.join(workspace_dir,
                                    f"lib{index}.{object_format}")
            module.save(path_obj)
            files.append(path_obj)
            is_system_lib = (module.type_key == "llvm" and
                             module.get_function("__tvm_is_system_module")())
            llvm_target_triple = (module.type_key == "llvm" and
                                  module.get_function("_get_target_triple")())
        if not fcompile:
            if file_name.endswith(".tar"):
                fcompile = _tar.tar
            else:
                fcompile = _cc.create_shared

        if llvm_target_triple is None and hasattr(fcompile,
                                                  "get_target_triple"):
            llvm_target_triple = fcompile.get_target_triple()

        if getattr(fcompile, "need_system_lib", False) and not is_system_lib:
            raise ValueError("%s need --system-lib option" % str(fcompile))

        if self.imported_modules:
            if enabled("llvm") and llvm_target_triple:
                path_obj = os.path.join(workspace_dir, f"devc.{object_format}")
                m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib,
                                                     llvm_target_triple)
                m.save(path_obj)
                files.append(path_obj)
            else:
                path_cc = os.path.join(workspace_dir, "devc.c")
                with open(path_cc, "w") as f:
                    f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib))
                files.append(path_cc)

        # The imports could contain a C module while the object format is tar.
        # In that case fcompile is the tar packer and would not recognize the
        # include paths below, which assume fcompile is a C compiler.
        if has_c_module and not file_name.endswith(".tar"):
            options = []
            if "options" in kwargs:
                opts = kwargs["options"]
                options = opts if isinstance(opts, (list, tuple)) else [opts]
            opts = options + ["-I" + path for path in find_include_path()]
            kwargs.update({"options": opts})

        return fcompile(file_name, files, **kwargs)
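A usage sketch for export_library; the module construction is an assumption, while the dispatch on the file suffix mirrors the fallback logic above:

import tvm
from tvm import te

n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
lib = tvm.build(s, [A, B], target="llvm", name="addone")

lib.export_library("addone.so")   # packed with cc.create_shared by default
lib.export_library("addone.tar")  # a .tar name falls back to tar packaging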
Example 12
def test_export_operator_model_library_format():
    import tvm.micro as micro

    target = tvm.target.target.micro("host")
    with tvm.transform.PassContext(opt_level=3,
                                   config={"tir.disable_vectorize": True}):
        A = tvm.te.placeholder((2, ), dtype="int8")
        B = tvm.te.placeholder((1, ), dtype="int8")
        C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
        sched = tvm.te.create_schedule(C.op)
        mod = tvm.build(
            sched,
            [A, B, C],
            tvm.target.Target(target, target),
            runtime=Runtime("crt", {"system-lib": True}),
            name="add",
        )

    temp_dir = utils.tempdir()
    mlf_tar_path = temp_dir.relpath("lib.tar")
    micro.export_model_library_format(mod, mlf_tar_path)

    tf = tarfile.open(mlf_tar_path)

    extract_dir = temp_dir.relpath("extract")
    os.mkdir(extract_dir)
    tf.extractall(extract_dir)

    with open(os.path.join(extract_dir, "metadata.json")) as json_f:
        metadata = json.load(json_f)
        assert metadata["version"] == 6
        assert metadata["model_name"] == "add"
        export_datetime = datetime.datetime.strptime(
            metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ")
        assert (datetime.datetime.now() -
                export_datetime) < datetime.timedelta(seconds=60 * 5)
        assert metadata["target"] == [str(target)]

        assert metadata["memory"]["add"][0]["dtype"] == "int8"
        assert metadata["memory"]["add"][0]["shape"] == [2]
        assert metadata["memory"]["add"][0]["size_bytes"] == 2

        assert metadata["memory"]["add"][1]["dtype"] == "int8"
        assert metadata["memory"]["add"][1]["shape"] == [1]
        assert metadata["memory"]["add"][1]["size_bytes"] == 1

        assert metadata["memory"]["add"][2]["dtype"] == "int8"
        assert metadata["memory"]["add"][2]["shape"] == [2]
        assert metadata["memory"]["add"][2]["size_bytes"] == 2

    assert os.path.exists(
        os.path.join(extract_dir, "codegen", "host", "src", "lib0.c"))
    assert os.path.exists(
        os.path.join(extract_dir, "codegen", "host", "src", "lib1.c"))

    assert (len(mod.ir_module_by_target) == 1
            ), f"expect 1 ir_module_by_target: {mod.ir_module_by_target!r}"
    for target, ir_mod in mod.ir_module_by_target.items():
        assert int(tvm.runtime.ndarray.device(str(target)).device_type) == 1
        with open(os.path.join(extract_dir, "src", "tir-1.txt")) as tir_f:
            assert tir_f.read() == str(ir_mod)
Example 13
def test_aot_executor(hexagon_launcher, hexagon_session):
    dtype = "float32"
    input_shape = (1, 128, 128, 3)
    w_shape = (5, 5, 3, 8)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight = relay.var("weight", relay.TensorType(w_shape, dtype))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp / dso_binary

    weight_data = np.random.rand(w_shape[0], w_shape[1], w_shape[2],
                                 w_shape[3]).astype(dtype=dtype)
    input_data = np.random.rand(input_shape[0], input_shape[1], input_shape[2],
                                input_shape[3]).astype(dtype=dtype)

    params = {"weight": weight_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(target_hexagon, host="c"),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {
                "unpacked-api": False,
                "interface-api": "c"
            }),
        )
        # Uncomment this once the workaround is not needed.
        # lowered.export_library(
        #     dso_binary_path, fcompile=hexagon.create_aot_shared, hexagon_arch="v68"
        # )
        lowered.export_library(dso_binary_path,
                               fcompile=_workaround_create_aot_shared(),
                               hexagon_arch="v68")

    if hexagon_session is None:
        pytest.skip(
            msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.")

    hexagon_launcher.upload(dso_binary_path, dso_binary)

    aot_mod = hexagon_launcher.get_aot_executor(dso_binary, hexagon_session)
    aot_mod.set_input(**inputs)
    aot_mod.run()
    hexagon_output = aot_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example 14
def test_graph_executor_multiple_conv2d(hexagon_launcher, hexagon_session):
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    y1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    y2 = relay.nn.conv2d(
        y1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], y2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp.relpath(dso_binary)

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )
        lowered.get_lib().save(dso_binary_path)

    if hexagon_session is None:
        pytest.skip(
            msg="Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    hexagon_launcher.upload(dso_binary_path, dso_binary)

    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2],
                                  w1_shape[3]).astype(dtype=dtype)
    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2],
                                  w2_shape[3]).astype(dtype=dtype)
    input_data = np.random.rand(input_shape[0], input_shape[1], input_shape[2],
                                input_shape[3]).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    graph_mod = hexagon_launcher.get_graph_executor(lowered.get_graph_json(),
                                                    dso_binary,
                                                    hexagon_session)
    graph_mod.set_input(**params)
    graph_mod.run(**inputs)
    hexagon_output = graph_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(
        llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output,
                                expected_output,
                                rtol=1e-4,
                                atol=1e-5)
Example 15
from tvm import relay
from tvm.relay import testing
import tvm
from tvm import te
import tvm.testing

from tvm.contrib import utils

header_file_dir_path = utils.tempdir()


def gen_engine_header():
    code = r"""
        #ifndef _ENGINE_H_
        #define _ENGINE_H_
        #include <cstdint>
        #include <string>
        #include <sstream>
        #include <vector>
        class Engine {
        };

        #endif
        """
Example 16
def test_mobilenet():
    temp = utils.tempdir()
    image, synset = prepare_input()
    model, params = get_model("mobilenetv2_1.0", image.shape)

    def run(mod, target):
        with relay.build_config(opt_level=3):
            lib = relay.build(mod,
                              target=target,
                              target_host=target_host,
                              params=params)
        path_dso = temp.relpath("deploy.dylib")
        lib.export_library(path_dso, xcode.create_dylib, arch=arch, sdk=sdk)
        xcode.codesign(path_dso)

        # Start RPC test server that contains the compiled library.
        xcode.popen_test_rpc(proxy_host,
                             proxy_port,
                             key,
                             destination=destination,
                             libs=[path_dso])

        # connect to the proxy
        remote = rpc.connect(proxy_host, proxy_port, key=key)

        if target == "metal":
            dev = remote.metal(0)
        else:
            dev = remote.cpu(0)
        lib = remote.load_module("deploy.dylib")
        m = graph_executor.GraphModule(lib["default"](dev))

        m.set_input("data", tvm.nd.array(image, dev))
        m.run()
        tvm_output = m.get_output(0)
        top1 = np.argmax(tvm_output.asnumpy()[0])
        print("TVM prediction top-1:", top1, synset[top1])

        # evaluate
        ftimer = m.module.time_evaluator("run", dev, number=3, repeat=10)
        prof_res = np.array(ftimer().results) * 1000
        print("%-19s (%s)" %
              ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))

    def annotate(func, compiler):
        """
        An annotator for Core ML.
        """
        # Bind free variables to the constant values.
        bind_dict = {}
        for arg in func.params:
            name = arg.name_hint
            if name in params:
                bind_dict[arg] = relay.const(params[name])

        func = relay.bind(func, bind_dict)

        # Annotate the entire graph for Core ML
        mod = tvm.IRModule()
        mod["main"] = func

        seq = tvm.transform.Sequential([
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
            transform.AnnotateTarget(compiler),
            transform.MergeCompilerRegions(),
            transform.PartitionGraph(),
        ])

        with relay.build_config(opt_level=3):
            mod = seq(mod)

        return mod

    # CPU
    run(model, target_host)
    # Metal
    run(model, "metal")
    # CoreML
    run(annotate(model, "coremlcompiler"), target_host)
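test_mobilenet depends on module-level RPC and toolchain settings that are not shown. A hypothetical configuration, modeled on the iOS RPC sample (names and values are assumptions):

import os

proxy_host = os.environ.get("TVM_IOS_RPC_PROXY_HOST", "127.0.0.1")
proxy_port = 9090
key = "iphone"
destination = os.environ.get("TVM_IOS_RPC_DESTINATION", "")
arch = "arm64"
sdk = "iphoneos"
target_host = "llvm -mtriple=%s-apple-darwin" % arch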
Example 17
def test_packed_global_variables():
    """Check packed global variables in codegen output."""
    dtype = "float32"
    ishape = (1, 32, 14, 14)
    wshape = (32, 32, 3, 3)
    interface_api = "packed"
    use_unpacked_api = False

    data0 = relay.var("data", shape=ishape, dtype=dtype)
    weight0 = relay.var("weight", shape=wshape, dtype=dtype)
    out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=1)
    main_f = relay.Function([data0, weight0], out)
    mod = tvm.IRModule()
    mod["main"] = main_f
    mod = transform.InferType()(mod)

    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
    w1_data = np.random.uniform(0, 1, wshape).astype(dtype)

    inputs = OrderedDict([("data", i_data), ("weight", w1_data)])

    output_list = generate_ref_data(mod, inputs)
    compiled_models_list = compile_models(
        models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        interface_api=interface_api,
        use_unpacked_api=use_unpacked_api,
        workspace_byte_alignment=8,
        enable_op_fusion=True,
        pass_config=AOT_DEFAULT_RUNNER.pass_config,
        use_runtime_executor=True,
        target=tvm.target.Target("c"),
    )
    compiled_model = compiled_models_list[0]

    tmp_path = utils.tempdir()
    base_path = tmp_path.temp_dir

    model = compiled_model.model
    tar_file = os.path.join(base_path, f"{model.name}.tar")
    export_model_library_format(compiled_model.executor_factory, tar_file)
    t = tarfile.open(tar_file)
    t.extractall(base_path)

    file_list = []
    for path in (pathlib.Path(base_path) / "codegen" / "host" / "src").iterdir():
        if path.is_file():
            file_list.append(path)
    assert len(file_list) > 0

    for path in file_list:
        with open(path, "r") as lib_f:
            lib1 = lib_f.readlines()

        tvmgen_names = []
        tvmgen_funcs = []
        for line in lib1:
            for item in line.split(" "):
                # Collect every name starting with tvmgen_default
                if item.startswith("tvmgen_default"):
                    tvmgen_names.append(item)
                    # Names followed by "(" are function calls/definitions
                    tvmgen_funcs += re.findall(r".*(?=\()", item)

        # Check that no collected function name also appears as a packed
        # variable, i.e. with a "_packed" suffix.
        for func in tvmgen_funcs:
            assert f"{func}_packed" not in tvmgen_names
Example no. 18
def test_graph_executor(
    android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket
):
    dtype = "float32"
    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp.relpath(dso_binary)

    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
    params = {"weight": weight_in}
    inputs = {"data": data_in}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )
        lowered.get_lib().save(dso_binary_path)

    if not android_serial_number:
        pytest.skip("Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    rpc_info = {
        "rpc_tracker_host": tvm_tracker_host,
        "rpc_tracker_port": tvm_tracker_port,
        "rpc_server_port": RPC_SERVER_PORT + 3,  # See note at the beginning of the file
        "adb_server_socket": adb_server_socket,
    }
    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
    launcher.upload(dso_binary_path, dso_binary)
    launcher.start_server()

    with launcher.start_session() as sess:
        graph_mod = launcher.get_graph_executor(lowered.get_graph_json(), dso_binary, sess)
        graph_mod.set_input(**params)
        graph_mod.run(**inputs)
        hexagon_output = graph_mod.get_output(0).numpy()

    launcher.stop_server()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
Example no. 19
def _c_to_llvm(c_code: str) -> str:
    unique_filename = str(uuid.uuid4())
    temp = utils.tempdir()
    ll_path = temp.relpath(f"{unique_filename}.ll")
    ll_code = clang.create_llvm([c_code], output=ll_path)
    return ll_code
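A hedged usage sketch for _c_to_llvm; the C snippet is made up, and clang must be installed since tvm.contrib.clang.create_llvm shells out to it:

c_code = "int add_one(int x) { return x + 1; }"  # hypothetical input
ll_code = _c_to_llvm(c_code)
# create_llvm returns the emitted LLVM IR as text, so the symbol should appear.
assert "add_one" in ll_code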
Example no. 20
def test_aot_executor_multiple_conv2d(
    tvm_tracker_host, tvm_tracker_port, android_serial_number, adb_server_socket
):
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    y1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    y2 = relay.nn.conv2d(
        y1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], y2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp / dso_binary

    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype(
        dtype=dtype
    )
    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2], w2_shape[3]).astype(
        dtype=dtype
    )
    input_data = np.random.rand(
        input_shape[0], input_shape[1], input_shape[2], input_shape[3]
    ).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(target_hexagon, host="c"),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {"unpacked-api": False, "interface-api": "c"}),
        )
        # Uncomment this once the workaround is not needed.
        # lowered.export_library(
        #     dso_binary_path, fcompile=hexagon.create_aot_shared, hexagon_arch="v68"
        # )
        lowered.export_library(
            dso_binary_path, fcompile=_workaround_create_aot_shared(), hexagon_arch="v68"
        )

    if not android_serial_number:
        pytest.skip("Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    rpc_info = {
        "rpc_tracker_host": tvm_tracker_host,
        "rpc_tracker_port": tvm_tracker_port,
        "rpc_server_port": RPC_SERVER_PORT + 6,  # See note at the beginning of the file
        "adb_server_socket": adb_server_socket,
    }
    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
    launcher.upload(dso_binary_path, dso_binary)
    launcher.start_server()

    with launcher.start_session() as sess:
        aot_mod = launcher.get_aot_executor(dso_binary, sess)
        aot_mod.set_input(**inputs)
        aot_mod.run()
        hexagon_output = aot_mod.get_output(0).numpy()

    launcher.stop_server()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
Example no. 21
def run_and_check(
    models: List[AOTCompiledTestModel],
    runner: AOTTestRunner,
    interface_api: str,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    data_linkage: AOTDataLinkage = None,
):
    """
    This method uses the original test data and compiled runtime.Modules
    to run in the test runner to verify the results.
    """

    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    include_path = os.path.join(base_path, "include")
    os.mkdir(include_path)
    crt_root = tvm.micro.get_standalone_crt_dir()
    shutil.copy2(
        os.path.join(crt_root, "template", "crt_config-template.h"),
        os.path.join(include_path, "crt_config.h"),
    )

    workspace_bytes = 0
    for compiled_model in models:
        model = compiled_model.model
        tar_file = os.path.join(base_path, f"{model.name}.tar")
        export_model_library_format(compiled_model.executor_factory, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        workspace_bytes += model.extra_memory_in_bytes
        if interface_api == "packed":
            workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)

        for key in model.inputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                model.inputs[key],
                include_path,
                data_linkage,
            )

        for i in range(len(model.outputs)):
            create_header_file(
                f'{mangle_name(model.name, "output_data")}{i}',
                np.zeros(model.outputs[i].shape, model.outputs[i].dtype),
                include_path,
                data_linkage,
            )
            create_header_file(
                f'{mangle_name(model.name, "expected_output_data")}{i}',
                model.outputs[i],
                include_path,
                data_linkage,
            )

    create_main(
        "test.c",
        models,
        build_path,
        runner.includes,
        runner.prologue,
        runner.epilogue,
        data_linkage,
        interface_api,
        workspace_bytes,
    )

    # Verify that the test application compiles fine
    file_dir = os.path.dirname(os.path.abspath(__file__))
    codegen_path = os.path.join(base_path, "codegen")
    makefile = os.path.join(file_dir, f"{runner.makefile}.mk")
    custom_params = " ".join(
        [f" {param}='{value}'" for param, value in runner.parameters.items()])
    make_command = (
        f"make -f {makefile} build_dir={build_path}" + f" CFLAGS='{cflags}'" +
        f" TVM_ROOT={file_dir}/../../../.." + f" AOT_TEST_ROOT={file_dir}" +
        f" CODEGEN_ROOT={codegen_path}" +
        f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}" +
        custom_params)

    compile_log_path = os.path.join(build_path, "test_compile.log")
    compile_command = f"{make_command} aot_test_runner"
    ret = subprocess_log_output(compile_command, ".", compile_log_path)
    assert ret == 0

    # Verify that the test application runs fine
    run_log_path = os.path.join(build_path, "test_run.log")
    run_command = f"{make_command} run"
    ret = subprocess_log_output(run_command, build_path, run_log_path)
    assert ret == 0

    with open(run_log_path) as run_log:
        assert AOT_SUCCESS_TOKEN in run_log.read()
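A hedged invocation sketch, reusing the helper names seen elsewhere in these examples (compile_models, AOTTestModel, AOT_DEFAULT_RUNNER); the module, inputs, and reference outputs are assumed to be prepared as in Example no. 17:

compiled_models = compile_models(
    models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
    interface_api="packed",
    use_unpacked_api=False,
)
run_and_check(
    models=compiled_models,
    runner=AOT_DEFAULT_RUNNER,
    interface_api="packed",
)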
Example no. 22
def __init__(self, package_path: str):
    self._tmp_dir = utils.tempdir()
    self.package_path = package_path
    self.import_package(self.package_path)
Example no. 23
def run_and_check(
    models: List[AOTCompiledTestModel],
    runner: AOTTestRunner,
    interface_api: str,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    data_linkage: AOTDataLinkage = None,
    test_dir: str = None,
    verbose: bool = False,
):
    """
    This method uses the original test data and compiled runtime.Modules
    to run in the test runner to verify the results.
    """

    base_path = test_dir
    if test_dir is None:
        tmp_path = utils.tempdir()
        tmp_dir = tmp_path.temp_dir
        base_path = os.path.join(tmp_dir, "test")

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.abspath(base_path)
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    include_path = os.path.join(base_path, "include")
    os.mkdir(include_path)
    crt_root = tvm.micro.get_standalone_crt_dir()
    shutil.copy2(
        os.path.join(crt_root, "template", "crt_config-template.h"),
        os.path.join(include_path, "crt_config.h"),
    )

    workspace_bytes = 0
    for compiled_model in models:
        model = compiled_model.model
        tar_file = os.path.join(base_path, f"{model.name}.tar")
        export_model_library_format(compiled_model.executor_factory, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        # The C interface API does not need the compiler-generated workspace
        # size to build the test application, because the workspace size is
        # codegen'd as a macro in tvmgen_<model_name>.h.
        if interface_api != "c":
            workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)

        workspace_bytes += model.extra_memory_in_bytes
        for key in model.inputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                model.inputs[key],
                include_path,
                data_linkage,
            )

        for key in model.outputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "output_data")}_{sanitized_tensor_name}',
                np.zeros(model.outputs[key].shape, model.outputs[key].dtype),
                include_path,
                data_linkage,
            )
            create_header_file(
                f'{mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}',
                model.outputs[key],
                include_path,
                data_linkage,
            )

    use_usmp = runner.pass_config.get("tir.usmp.enable", False)
    # We only need the stack allocator if USMP is not used
    use_stack_allocator = not use_usmp

    create_main(
        "test.c",
        models,
        build_path,
        runner.includes,
        runner.prologue,
        runner.epilogue,
        data_linkage,
        interface_api,
        workspace_bytes,
        use_stack_allocator,
    )

    # Verify that the test application compiles fine
    file_dir = os.path.dirname(os.path.abspath(__file__))
    codegen_path = os.path.join(base_path, "codegen")
    makefile = os.path.join(file_dir, f"{runner.makefile}.mk")
    fvp_dir = "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/"
    # TODO(@grant-arm): Remove once ci_cpu docker image has been updated to FVP_Corstone_SSE
    if not os.path.isdir(fvp_dir):
        fvp_dir = "/opt/arm/FVP_Corstone_SSE-300_Ethos-U55/models/Linux64_GCC-6.4/"
    custom_params = " ".join(
        [f" {param}='{value}'" for param, value in runner.parameters.items()])
    make_command = (
        f"make -f {makefile} build_dir={build_path}" + f" CFLAGS='{cflags}'" +
        f" TVM_ROOT={file_dir}/../../../.." + f" AOT_TEST_ROOT={file_dir}" +
        f" CODEGEN_ROOT={codegen_path}" +
        f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}" +
        f" FVP_DIR={fvp_dir}" + custom_params)

    compile_log_path = os.path.join(build_path, "test_compile.log")
    compile_command = f"{make_command} aot_test_runner"
    if verbose:
        print("Compile command:\n", compile_command)
    ret = subprocess_log_output(compile_command, ".", compile_log_path)
    assert ret == 0

    # Verify that the test application runs fine
    run_log_path = os.path.join(build_path, "test_run.log")
    run_command = f"{make_command} run"
    if verbose:
        print("Run command:\n", run_command)
    ret = subprocess_log_output(run_command, build_path, run_log_path)
    assert ret == 0
    with open(run_log_path) as run_log:
        assert AOT_SUCCESS_TOKEN in run_log.read()
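Since the stack allocator is emitted only when USMP is off, a runner with USMP enabled can be derived from the default one. A minimal sketch, assuming AOTTestRunner is the NamedTuple-style record its use above suggests:

usmp_runner = AOT_DEFAULT_RUNNER._replace(
    pass_config={**AOT_DEFAULT_RUNNER.pass_config, "tir.usmp.enable": True},
)
# Then, e.g.:
# run_and_check(models=compiled_models, runner=usmp_runner, interface_api="c")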
Example no. 24
def test_c_link_params():
    temp_dir = utils.tempdir()
    for dtype in LINKABLE_DTYPES:
        mod, param_init = _make_mod_and_params(dtype)
        rand_input = _make_random_tensor(dtype, INPUT_SHAPE)
        main_func = mod["main"]
        target = "c --link-params"
        with tvm.transform.PassContext(opt_level=3,
                                       config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(mod, target, params=param_init)
            assert set(lib.params.keys()) == {"p0", "p1"}  # NOTE: op folded

            src = lib.lib.get_source()
            lib.lib.save(temp_dir.relpath("test.c"), "c")
            c_dtype = _get_c_datatype(dtype)
            src_lines = src.split("\n")
            param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE))
            param_def = f"static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{"
            for i, line in enumerate(src_lines):
                if line == param_def:
                    i += 1
                    break
            else:
                assert False, f'did not find parameter definition "{param_def}":\n{src}'

            cursor = 0
            width = dtype_info(dtype).bits // 4 + 2
            if dtype.startswith("int"):
                width += 1  # Account for sign

            while "};" not in src_lines[i]:
                for match in HEX_NUM_RE.finditer(src_lines[i]):
                    assert match.group() == _format_c_value(
                        dtype, width, param[cursor]
                    ), (f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got '
                        f'"{match.group(0)}"; full p0 follows:\n{src}')
                    cursor += 1
                i += 1

            assert cursor == np.prod(param.shape)

            # Need a unique name per library to avoid dlopen caching the lib load.
            lib_path = temp_dir.relpath(f"test-{dtype}-linked.so")
            lib["remove_params"]().export_library(lib_path)
            lib_mod = tvm.runtime.load_module(lib_path)

            graph = json.loads(lib.graph_json)
            for p in lib.params:
                _verify_linked_param(dtype, lib, lib_mod, graph, p)

            # Wrap in function to explicitly deallocate the runtime.
            def _run_linked(lib_mod):
                graph_rt = tvm.contrib.graph_executor.GraphModule(
                    lib_mod["default"](tvm.cpu(0)))
                graph_rt.set_input("rand_input",
                                   rand_input)  # NOTE: params not required.
                graph_rt.run()

                return graph_rt.get_output(0)

            linked_output = _run_linked(lib_mod)

        linked_params = lib.params
        with tvm.transform.PassContext(opt_level=3,
                                       config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(mod, "c", params=param_init)
            _, _, params = lib
            # Need a unique name per library to avoid dlopen caching the lib load.
            lib_path = temp_dir.relpath(f"test-{dtype}-unlinked.so")
            lib.export_library(lib_path)
            lib_mod = tvm.runtime.load_module(lib_path)

            def _run_unlinked(lib_mod):
                graph_rt = tvm.contrib.graph_executor.GraphModule(
                    lib_mod["default"](tvm.cpu(0)))
                graph_rt.set_input("rand_input", rand_input, **params)
                graph_rt.run()
                return graph_rt.get_output(0)

            unlinked_output = _run_unlinked(lib_mod)

        if "int" in dtype:
            np.testing.assert_equal(unlinked_output.asnumpy(),
                                    linked_output.asnumpy())
        else:
            np.testing.assert_allclose(unlinked_output.asnumpy(),
                                       linked_output.asnumpy())
Example no. 25
def check_local(coreml_model):
    temp = utils.tempdir()
    compiled_model = xcode.compile_coreml(coreml_model,
                                          out_dir=temp.temp_dir)
    ctx = tvm.cpu(0)
    verify(coreml_model, compiled_model, ctx)
Example no. 26
def test_export_byoc_c_module():
    """Test BYOC flow when it produces DSO-exportable modules.

    NOTE the general BYOC flow is not fully supported by Model Library Format right now.
    """
    x = tvm.relay.var("x", shape=(10, 10))
    w0 = tvm.relay.var("w0", shape=(10, 10))
    w1 = tvm.relay.var("w1", shape=(10, 10))
    w2 = tvm.relay.var("w2", shape=(10, 10))
    w3 = tvm.relay.var("w3", shape=(10, 10))
    w4 = tvm.relay.var("w4", shape=(10, 10))
    w5 = tvm.relay.var("w5", shape=(10, 10))
    w6 = tvm.relay.var("w6", shape=(10, 10))
    w7 = tvm.relay.var("w7", shape=(10, 10))

    # C compiler
    z0 = tvm.relay.add(x, w0)
    p0 = tvm.relay.subtract(z0, w1)
    q0 = tvm.relay.multiply(p0, w2)

    z1 = tvm.relay.add(x, w3)
    p1 = tvm.relay.subtract(z1, w4)
    q1 = tvm.relay.multiply(p1, w5)

    # Other parts on TVM
    z2 = tvm.relay.add(x, w6)
    q2 = tvm.relay.subtract(z2, w7)

    r = tvm.relay.concatenate((q0, q1, q2), axis=0)
    f = tvm.relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r)
    mod = tvm.IRModule()
    ann = byoc.CcompilerAnnotator()
    mod["main"] = ann.visit(f)
    mod = tvm.relay.transform.PartitionGraph("mod_name")(mod)
    mod = tvm.relay.transform.InferType()(mod)

    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        factory = tvm.relay.build(mod, tvm.target.target.micro("host"), runtime=Runtime("crt"))

    temp_dir = utils.tempdir()
    mlf_tar_path = temp_dir.relpath("lib.tar")

    from tvm import micro

    micro.export_model_library_format(factory, mlf_tar_path)

    with tarfile.open(mlf_tar_path, "r:*") as tf:
        tar_members = [ti.name for ti in tf.getmembers()]
        print("tar members", tar_members)
        assert "./metadata.json" in tar_members
        with tf.extractfile("./metadata.json") as f:
            metadata = json.load(f)
        main_md = metadata["memory"]["functions"]["main"]
        assert main_md == [
            {
                "constants_size_bytes": 0,
                "device": 1,
                "io_size_bytes": 4800,
                "workspace_size_bytes": 800,
            }
        ]
Example no. 27
def _listen_loop(sock, port, rpc_key, tracker_addr, load_library, custom_addr):
    """Listening loop of the server."""
    def _accept_conn(listen_sock, tracker_conn, ping_period=2):
        """Accept connection from the other places.

        Parameters
        ----------
        listen_sock: Socket
            The socket used by listening process.

        tracker_conn : connection to tracker
            The tracker connection.

        ping_period : float, optional
            Ping the tracker every ping_period seconds if no connection
            is accepted.
        """
        old_keyset = set()
        # Report resource to tracker
        if tracker_conn:
            matchkey = base.random_key(rpc_key + ":")
            base.sendjson(
                tracker_conn,
                [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr])
            assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS
        else:
            matchkey = rpc_key

        unmatch_period_count = 0
        unmatch_timeout = 4
        # Wait until we get a valid connection
        while True:
            if tracker_conn:
                trigger = select.select([listen_sock], [], [], ping_period)
                if listen_sock not in trigger[0]:
                    base.sendjson(tracker_conn,
                                  [TrackerCode.GET_PENDING_MATCHKEYS])
                    pending_keys = base.recvjson(tracker_conn)
                    old_keyset.add(matchkey)
                    # If the match key is not in the pending key set, it means
                    # the key was acquired by a client but is not yet used.
                    if matchkey not in pending_keys:
                        unmatch_period_count += 1
                    else:
                        unmatch_period_count = 0
                    # regenerate match key if key is acquired but not used for a while
                    if unmatch_period_count * ping_period > unmatch_timeout + ping_period:
                        logger.info(
                            "no incoming connections, regenerate key ...")
                        matchkey = base.random_key(rpc_key + ":", old_keyset)
                        base.sendjson(tracker_conn, [
                            TrackerCode.PUT, rpc_key,
                            (port, matchkey), custom_addr
                        ])
                        assert base.recvjson(
                            tracker_conn) == TrackerCode.SUCCESS
                        unmatch_period_count = 0
                    continue
            conn, addr = listen_sock.accept()
            magic = struct.unpack("<i", base.recvall(conn, 4))[0]
            if magic != base.RPC_MAGIC:
                conn.close()
                continue
            keylen = struct.unpack("<i", base.recvall(conn, 4))[0]
            key = py_str(base.recvall(conn, keylen))
            arr = key.split()
            expect_header = "client:" + matchkey
            server_key = "server:" + rpc_key
            if arr[0] != expect_header:
                conn.sendall(struct.pack("<i", base.RPC_CODE_MISMATCH))
                conn.close()
                logger.warning("mismatch key from %s", addr)
                continue
            conn.sendall(struct.pack("<i", base.RPC_CODE_SUCCESS))
            conn.sendall(struct.pack("<i", len(server_key)))
            conn.sendall(server_key.encode("utf-8"))
            return conn, addr, _parse_server_opt(arr[1:])

    # Server logic
    tracker_conn = None
    while True:
        try:
            # step 1: setup tracker and report to tracker
            if tracker_addr and tracker_conn is None:
                tracker_conn = base.connect_with_retry(tracker_addr)
                tracker_conn.sendall(struct.pack("<i", base.RPC_TRACKER_MAGIC))
                magic = struct.unpack("<i", base.recvall(tracker_conn, 4))[0]
                if magic != base.RPC_TRACKER_MAGIC:
                    raise RuntimeError("%s is not RPC Tracker" %
                                       str(tracker_addr))
                # report status of current queue
                cinfo = {"key": "server:" + rpc_key}
                base.sendjson(tracker_conn, [TrackerCode.UPDATE_INFO, cinfo])
                assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS

            # step 2: wait for in-coming connections
            conn, addr, opts = _accept_conn(sock, tracker_conn)
        except (socket.error, IOError):
            # retry when tracker is dropped
            if tracker_conn:
                tracker_conn.close()
                tracker_conn = None
            continue
        except RuntimeError:
            raise

        # step 3: serving
        work_path = utils.tempdir()
        logger.info("connection from %s", addr)
        server_proc = multiprocessing.Process(target=_serve_loop,
                                              args=(conn, addr, load_library,
                                                    work_path))

        server_proc.start()
        # close from our side.
        conn.close()
        # wait until server process finish or timeout
        server_proc.join(opts.get("timeout", None))

        if server_proc.is_alive():
            logger.info("Timeout in RPC session, kill..")
            # pylint: disable=import-outside-toplevel
            import psutil

            parent = psutil.Process(server_proc.pid)
            # terminate worker children
            for child in parent.children(recursive=True):
                child.terminate()
            # terminate the worker
            server_proc.terminate()
        work_path.remove()
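For reference, a hedged client-side counterpart to the handshake served above. It exists only to document the wire format (int32 little-endian magic, int32 key length, raw key bytes, then the mirror image from the server) and reuses base and py_str from this module; the helper name is hypothetical:

import socket
import struct

def _client_handshake(addr, matchkey):
    """Hypothetical mirror of _accept_conn's protocol, for illustration."""
    sock = socket.create_connection(addr)
    key = "client:" + matchkey
    sock.sendall(struct.pack("<i", base.RPC_MAGIC))
    sock.sendall(struct.pack("<i", len(key)))
    sock.sendall(key.encode("utf-8"))
    code = struct.unpack("<i", base.recvall(sock, 4))[0]
    if code != base.RPC_CODE_SUCCESS:
        sock.close()
        raise RuntimeError("handshake rejected: key mismatch")
    keylen = struct.unpack("<i", base.recvall(sock, 4))[0]
    server_key = py_str(base.recvall(sock, keylen))  # "server:<rpc_key>"
    return sock, server_key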
Example no. 28
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N
    A = te.placeholder((N, N), name="A")
    B = te.placeholder((N, N), name="Btmp")
    k = te.reduce_axis((0, N), name="k")

    packedB = te.compute((N, N // bn, bn), lambda x, y, z: B[x, y * bn + z], name="B")

    C = te.compute(
        (N, N), lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k), name="C"
    )

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = utils.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
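As a sanity check on the layout, a hedged NumPy sketch (small made-up sizes, not part of the test) showing that packing B as packedB[x, y, z] = B[x, y * bn + z] leaves the matmul unchanged:

import numpy as np

N, bn = 8, 4
A = np.random.rand(N, N).astype("float32")
B = np.random.rand(N, N).astype("float32")
packedB = B.reshape(N, N // bn, bn)  # packedB[x, y, z] == B[x, y * bn + z]
C = np.empty((N, N), dtype="float32")
for i in range(N):
    for j in range(N):
        # Mirrors the te.compute lambda: sum_k A[i, k] * packedB[k, j // bn, j % bn]
        C[i, j] = (A[i, :] * packedB[:, j // bn, j % bn]).sum()
np.testing.assert_allclose(C, A @ B, rtol=1e-5)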
Example no. 29
    def export_library(self,
                       file_name,
                       fcompile=None,
                       addons=None,
                       workspace_dir=None,
                       **kwargs):
        """
        Export the module and all imported modules into a single device library.

        This function only works on host LLVM modules; other runtime::Module
        subclasses will work with this API, but they must fully implement the
        save and load mechanisms of modules, including saving from streams
        and files. This will pack your non-shared library module into a
        single shared library which can later be loaded by TVM.

        Parameters
        ----------
        file_name : str
            The name of the shared library.

        fcompile : function(target, file_list, kwargs), optional
            The compilation function used to create the final library object
            during export.

            For example, when fcompile=_cc.create_shared, or when it is not
            supplied but the module is "llvm", this is used to link all
            produced artifacts into a final dynamic library.

            This behavior is controlled by the type of object exported.
            If fcompile has the attribute object_format, the host library is
            compiled to that format. Otherwise, the default format "o" is used.

        workspace_dir : str, optional
            The path of the directory used to create the intermediate
            artifacts when exporting the module.
            If this is not provided a temporary dir will be created.

        kwargs : dict, optional
            Additional arguments passed to fcompile

        Returns
        -------
        result of fcompile()  : unknown, optional
            If the compilation function returns an artifact, it is returned
            via export_library.
        """
        # NOTE: this function depends on contrib library features
        # which are only available when the full TVM build is present.
        if _RUNTIME_ONLY:
            raise RuntimeError(
                "Cannot call export_library in runtime only mode")
        # Extra dependencies during runtime.
        from pathlib import Path
        from tvm.contrib import cc as _cc, tar as _tar, utils as _utils

        if isinstance(file_name, Path):
            file_name = str(file_name)

        if self.type_key == "stackvm":
            if not file_name.endswith(".stackvm"):
                raise ValueError(
                    "Module[%s]: can only be saved as stackvm format. "
                    "Did you build with LLVM enabled?" % self.type_key)
            self.save(file_name)
            return

        modules = self._collect_dso_modules()
        if workspace_dir is None:
            temp = _utils.tempdir()
            workspace_dir = temp.temp_dir
        files = addons if addons else []
        is_system_lib = False
        has_c_module = False
        llvm_target_string = None
        for index, module in enumerate(modules):
            if fcompile is not None and hasattr(fcompile, "object_format"):
                if module.type_key == "c":
                    assert module.format in [
                        "c",
                        "cc",
                        "cpp",
                        "cu",
                    ], "The module.format needs to be either c, cc, cpp or cu."
                    object_format = module.format
                    has_c_module = True
                else:
                    object_format = fcompile.object_format
            else:
                if module.type_key == "c":
                    if len(module.format) > 0:
                        assert module.format in [
                            "c",
                            "cc",
                            "cpp",
                            "cu",
                        ], "The module.format needs to be either c, cc, cpp, or cu."
                        object_format = module.format
                    else:
                        object_format = "c"
                    if "cc" in kwargs:
                        if kwargs["cc"] == "nvcc":
                            object_format = "cu"
                    has_c_module = True
                else:
                    assert module.type_key == "llvm" or module.type_key == "static_library"
                    object_format = "o"
            path_obj = os.path.join(workspace_dir,
                                    f"lib{index}.{object_format}")
            module.save(path_obj)
            files.append(path_obj)
            if module.type_key == "llvm":
                is_system_lib = module.get_function("__tvm_is_system_module")()
                llvm_target_string = module.get_function("_get_target_string")()
        if not fcompile:
            if file_name.endswith(".tar"):
                fcompile = _tar.tar
            else:
                fcompile = _cc.create_shared

        if llvm_target_string is None and hasattr(fcompile,
                                                  "get_target_triple"):
            triple = fcompile.get_target_triple()
            assert triple, "Target triple should not be empty"
            llvm_target_string = "llvm -mtriple " + triple

        if getattr(fcompile, "need_system_lib", False) and not is_system_lib:
            raise ValueError("%s need --system-lib option" % str(fcompile))

        if self.imported_modules:
            if enabled("llvm") and llvm_target_string:
                path_obj = os.path.join(workspace_dir, f"devc.{object_format}")
                m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib,
                                                     llvm_target_string)
                m.save(path_obj)
                files.append(path_obj)
            else:
                path_cc = os.path.join(workspace_dir, "devc.c")
                with open(path_cc, "w") as f:
                    f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib))
                files.append(path_cc)

        # The imports could contain a c module, but the object format could be
        # tar. In that case the include paths below would not be recognized as
        # options, since they assume fcompile is a C compiler.
        if has_c_module and not file_name.endswith(".tar"):
            options = []
            if "options" in kwargs:
                opts = kwargs["options"]
                options = opts if isinstance(opts, (list, tuple)) else [opts]
            opts = options + ["-I" + path for path in find_include_path()]
            kwargs.update({"options": opts})

        return fcompile(file_name, files, **kwargs)
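A minimal end-to-end sketch of the default path through export_library; the kernel and the output path are illustrative only:

import tvm
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
mod = tvm.build(s, [A, B], target="llvm", name="add_one")
# With no fcompile and a ".so" name, _cc.create_shared links the saved
# object files into a shared library.
mod.export_library("/tmp/add_one.so")
loaded = tvm.runtime.load_module("/tmp/add_one.so")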
Example no. 30
def test_c_link_params(linkable_dtype):
    temp_dir = utils.tempdir()
    mod, param_init = _make_mod_and_params(linkable_dtype)
    rand_input = _make_random_tensor(linkable_dtype, INPUT_SHAPE)
    main_func = mod["main"]
    target = "c"
    executor = Executor("graph", {"link-params": True})
    with tvm.transform.PassContext(opt_level=3,
                                   config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod,
                              target,
                              executor=executor,
                              params=param_init)
        assert len(lib.params.keys()) == 0  # NOTE: params became tir.constants

        src = lib.lib.get_source()
        lib.lib.save(temp_dir.relpath("test.c"), "c")
        c_dtype = _get_c_datatype(linkable_dtype)
        src_lines = src.split("\n")
        param = param_init[f"{linkable_dtype}_a"].reshape(
            np.prod(KERNEL_SHAPE))
        param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) constant_\d+\[{np.prod(param.shape)}\] = {{$"
        for i, line in enumerate(src_lines):
            if re.match(param_def, line):
                i += 1
                break
        else:
            assert False, f'did not find parameter definition "{param_def}":\n{src}'

        cursor = 0
        width = dtype_info(linkable_dtype).bits // 4 + 2
        if linkable_dtype.startswith("int"):
            width += 1  # Account for sign

        while "};" not in src_lines[i]:
            for match in HEX_NUM_RE.finditer(src_lines[i]):
                cursor += 1
            i += 1

        assert cursor == np.prod(param.shape)

        # Need a unique name per library to avoid dlopen caching the lib load.
        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-linked.so")
        lib["remove_params"]().export_library(lib_path)
        lib_mod = tvm.runtime.load_module(lib_path)

        graph = json.loads(lib.graph_json)
        for p in lib.params:
            _verify_linked_param(linkable_dtype, lib, lib_mod, graph, p)

        # Wrap in function to explicitly deallocate the runtime.
        def _run_linked(lib_mod):
            graph_rt = tvm.contrib.graph_executor.GraphModule(
                lib_mod["default"](tvm.cpu(0)))
            graph_rt.set_input("rand_input",
                               rand_input)  # NOTE: params not required.
            graph_rt.run()

            return graph_rt.get_output(0)

        linked_output = _run_linked(lib_mod)

    linked_params = lib.params
    with tvm.transform.PassContext(opt_level=3,
                                   config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, "c", params=param_init)
        _, _, params = lib
        # Need a unique name per library to avoid dlopen caching the lib load.
        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-unlinked.so")
        lib.export_library(lib_path)
        lib_mod = tvm.runtime.load_module(lib_path)

        def _run_unlinked(lib_mod):
            graph_rt = tvm.contrib.graph_executor.GraphModule(
                lib_mod["default"](tvm.cpu(0)))
            graph_rt.set_input("rand_input", rand_input, **params)
            graph_rt.run()
            return graph_rt.get_output(0)

        unlinked_output = _run_unlinked(lib_mod)

    if "int" in linkable_dtype:
        np.testing.assert_equal(unlinked_output.numpy(), linked_output.numpy())
    else:
        np.testing.assert_allclose(unlinked_output.numpy(),
                                   linked_output.numpy())