Example #1
def test_graph_executor(target, dev):
    mod, params = mlp.get_workload(1)

    exe = relay.build(mod, target, params=params)
    gr = debug_executor.create(exe.get_graph_json(), exe.lib, dev)

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(data=data)
    assert "fused_nn_softmax" in report
    assert "Total time" in report
Example #2
def test_run_single_node(graph, n, A, myadd):
    mlib_proxy = tvm.support.FrontendTestModule()
    mlib_proxy["myadd"] = myadd
    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    assert len(mod.debug_datum.get_graph_nodes()) == 2
    assert mod.debug_datum.get_graph_nodes()[0]["op"] == "param"
    assert mod.debug_datum.get_graph_nodes()[1]["op"] == "myadd"

    # Running a node with no associated function should return instantly and have 0 runtime
    assert mod.run_individual_node(0, number=1).mean == 0

    # The actual function, by contrast, should take measurable time, and more time when run more times
    repeat_1_result = mod.run_individual_node(1, repeat=1)
    assert repeat_1_result.mean > 0

    # Running with repeat=3 should take longer than a single repeat
    repeat_3_results = mod.run_individual_node(1, repeat=3)
    assert sum(repeat_3_results.results) > sum(repeat_1_result.results)

    # Asking for more repeats should yield exactly the number of results requested
    assert len(mod.run_individual_node(1, repeat=10).results) == 10

    # Using min_repeat_ms should make the run take at least the requested amount of time
    start = time.time()
    mod.run_individual_node(1, min_repeat_ms=500)
    end = time.time()
    elapsed_time_in_seconds = end - start
    assert elapsed_time_in_seconds >= 0.5

    # Using `cooldown_interval_ms` should increase the total execution time
    start = time.time()
    mod.run_individual_node(1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000)
    end = time.time()
    elapsed_time_in_seconds_with_def_rep = end - start
    assert elapsed_time_in_seconds_with_def_rep >= 3

    # Setting `repeats_to_cooldown` to a value other than 1 should not
    # trigger a cooldown after every repeat
    start = time.time()
    mod.run_individual_node(
        1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000, repeats_to_cooldown=2
    )
    end = time.time()
    elapsed_time_in_seconds_with_rep_2 = end - start
    assert elapsed_time_in_seconds_with_rep_2 >= 2 and (
        elapsed_time_in_seconds_with_rep_2 < elapsed_time_in_seconds_with_def_rep
    )

    # An out-of-bounds node index raises a TVMError
    with pytest.raises(TVMError):
        mod.run_individual_node(2)
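
For reference, run_individual_node returns the same BenchmarkResult structure that time_evaluator produces, so its aggregate fields can be read directly. A short sketch (node index 1 is the myadd node from the test above):

result = mod.run_individual_node(1, number=10, repeat=3)
print(result.mean)     # mean seconds per run
print(result.results)  # one aggregated measurement per repeat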
Example #3
    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        mod = create(graph, rt_mod, dev)
        mod.set_input(input_name, input_data)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")
Example #4
def test_rpc_graph():
    server = rpc.Server(key="profiling")
    remote = rpc.connect("127.0.0.1", server.port, key="profiling")

    mod, params = mlp.get_workload(1)
    exe = relay.build(mod, "llvm", params=params)
    temp = utils.tempdir()
    path = temp.relpath("lib.tar")
    exe.export_library(path)
    remote.upload(path)
    rexec = remote.load_module("lib.tar")

    gr = debug_executor.create(exe.get_graph_json(), rexec, remote.cpu())

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(data=data)
    assert len(report.calls) > 0
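
A sketch of drilling into the report's individual calls; the column names used here ("Name", "Duration (us)") are assumptions about the Report layout, and the RPC server is shut down explicitly when done:

for call in report.calls:
    print(call["Name"], call["Duration (us)"])  # assumed column names
server.terminate()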
Example #5
    def _build_tvm(self, debug_runtime=False):
        # compile kernels with history best records
        with autotvm.apply_history_best(self.log_file):
            with tvm.transform.PassContext(opt_level=3):
                self.tvm_graph, self.tvm_lib, self.tvm_params = relay.build(
                    self.mod, target=self.target, params=self.params)

        if not debug_runtime:
            self.tvm_module = graph_executor.create(self.tvm_graph,
                                                    self.tvm_lib,
                                                    device=self.dev)
        else:
            self.tvm_module = debug_executor.create(self.tvm_graph,
                                                    self.tvm_lib,
                                                    device=self.dev)
        self.tvm_module.set_input(**self.tvm_params)
        return self.tvm_module
Example #6
def check_remote(server):
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    remote = rpc.connect(server.host, server.port)
    temp = utils.tempdir()
    dev = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    try:
        mod = debug_executor.create(graph, mlib, remote.cpu(0))
    except ValueError:
        print("Skip because debug runtime not enabled")
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, dev))
    out = tvm.nd.empty((n,), device=dev)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.numpy(), a + 1)
Example #7
    def f_time_per_layer(
        rt_mod: tvm.runtime.Module,
        dev: tvm.device,
        input_data: Dict[str, NDArray],
    ) -> None:
        """Run and benchmark the per-layer performance of given runtime module,
        print out the result.

        Parameters
        ----------
        rt_mod : tvm.runtime.Module
            The runtime module.
        dev : tvm.device
            The device to run the workload on.
        input_data : Dict[str, NDArray]
            The input data as a dictionary.
        """
        # pylint:disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint:enable=import-outside-toplevel

        try:
            mod = create(graph, rt_mod, dev)
            for input_name, input_value in input_data.items():
                mod.set_input(input_name, input_value)
            graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
            graph_time = mod.run_individual(number=10,
                                            repeat=1,
                                            min_repeat_ms=5000)

            print("Running time of each layer:")
            print("---------------------------")
            print("|graph_nodes| = ", len(graph_nodes))
            print("|graph_time| = ", len(graph_time))

            for k, v in zip(graph_nodes, graph_time):
                print(k, float(v) * 1e6, "us")
        except Exception as exc:  # pylint: disable=broad-except
            print(
                f"Run module f_time_per_layer via RPC failed, exception: {exc}",
            )
Example #8
    def check_verify():
        mlib = tvm.build(s, [A, B], "llvm", name="myadd")

        def myadd(*args):
            to_return = mlib["myadd"](*args)
            time.sleep(0.25)
            return to_return

        mlib_proxy = tvm.support.FrontendTestModule()
        mlib_proxy["myadd"] = myadd
        try:
            mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
        except ValueError:
            return

        a = np.random.uniform(size=(n,)).astype(A.dtype)
        mod.set_input(x=a)

        # verify the dump root was created
        directory = mod._dump_path
        assert os.path.exists(directory)

        # verify graph is there
        GRAPH_DUMP_FILE_NAME = "_tvmdbg_graph_dump.json"
        assert len(os.listdir(directory)) == 1

        # verify the file name is proper
        graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME)
        assert os.path.exists(graph_dump_path)

        # verify the graph contains some expected keys
        with open(graph_dump_path) as graph_f:
            dumped_graph = json.load(graph_f)

        assert isinstance(dumped_graph, dict)
        for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"):
            assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}"

        mod.run()
        # Verify the tensors are dumped
        assert len(os.listdir(directory)) > 1

        debug_lines = mod.debug_datum.get_debug_result().split("\n")

        def split_debug_line(i):
            to_return = re.split(r"  [ ]*", debug_lines[i])
            assert to_return[-1] == ""
            to_return = to_return[:-1]  # strip empty trailing part
            return to_return

        assert split_debug_line(0) == [
            "Node Name",
            "Ops",
            "Time(us)",
            "Time(%)",
            "Shape",
            "Inputs",
            "Outputs",
        ]
        myadd_lines = split_debug_line(2)
        assert myadd_lines[0] == "add"
        assert myadd_lines[1] == "myadd"
        runtime_sec = float(myadd_lines[2]) / 1e6  # printed in us

        # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude.
        # Here we just care that the prefix is correct.
        assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000

        total_lines = split_debug_line(3)
        assert total_lines[0] == "Total_time"
        assert total_lines[2] == myadd_lines[2]

        CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json"
        assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))

        with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
            trace = json.load(f)
        assert trace["displayTimeUnit"] == "ns"
        events = trace["traceEvents"]
        assert len(events) == 4
        assert all(event["ph"] in ("B", "E") for event in events)
        assert all(event["pid"] == 1 for event in events)
        assert all(event["tid"] == 1 for event in events)
        assert all(event["name"] == "x" for event in events[:2])
        assert all(event["name"] == "add" for event in events[2:])
        assert events[0]["ts"] == 0
        assert events[0]["ph"] == "B"

        # verify the output is correct
        out = mod.get_output(0, tvm.nd.empty((n,)))
        np.testing.assert_equal(out.numpy(), a + 1)

        mod.exit()
        # verify the dump root is deleted after cleanup
        assert not os.path.exists(directory)
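
The dumped Chrome trace can also be inspected on its own; a sketch, which would have to run before mod.exit() removes the dump directory:

import json
import os

with open(os.path.join(directory, "_tvmdbg_execution_trace.json")) as f:
    trace = json.load(f)
# Load the same file into chrome://tracing to visualize the timeline.
print(len(trace["traceEvents"]), "trace events")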
Example #9
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    end_to_end: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package: TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session (local or remote).
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it
        will be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug executor.
    end_to_end : bool
        Whether to measure the time of memory copies as well as model
        execution. Turning this on can provide a more realistic estimate
        of how long running the model in production would take.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it.")

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(
                    f"Model {tvmc_package.package_path} is not a MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(
                project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run micro device ('--device micro')."
                )

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.",
                             rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except Exception as exc:
                raise TVMCException(
                    "Could not open a session with the micro target.") from exc
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must be already flashed into the micro target before one tries
        # to run it. Hence skip model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if tvmc_package.type == "vm":
            assert inputs is not None, "vm runner requires inputs to be provided as a dict"

            input_tensor = {}
            for e, i in inputs.items():
                input_tensor[e] = tvm.nd.array(i, dev)

            if profile:
                logger.debug("Creating vm with profile enabled.")
                exe = profiler_vm.VirtualMachineProfiler(lib, dev)
                res = exe.profile(**input_tensor, func_name="main")
                # This print is intentional
                print(res)
            else:
                exe = vm.VirtualMachine(lib, dev)

            exe_outputs = exe.invoke("main", **input_tensor)
            times = exe.benchmark(
                dev,
                **input_tensor,
                func_name="main",
                repeat=repeat,
                number=number,
                end_to_end=end_to_end,
            )

            # Special handling if the output only has a single value
            if not isinstance(exe_outputs, list):
                exe_outputs = [exe_outputs]

            outputs = {}
            for i, val in enumerate(exe_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = val.numpy()
        else:
            # TODO(gromero): Adjust for micro targets.
            if profile:
                logger.debug("Creating runtime with profiling enabled.")
                module = debug_executor.create(tvmc_package.graph,
                                               lib,
                                               dev,
                                               dump_root="./prof")
            else:
                if device == "micro":
                    logger.debug(
                        "Creating runtime (micro) with profiling disabled.")
                    module = tvm.micro.create_local_graph_executor(
                        tvmc_package.graph, lib, dev)
                else:
                    logger.debug("Creating runtime with profiling disabled.")
                    module = executor.create(tvmc_package.graph, lib, dev)

            logger.debug("Loading params into the runtime module.")
            module.load_params(tvmc_package.params)

            logger.debug("Collecting graph input shape and type:")
            shape_dict, dtype_dict = module.get_input_info()
            logger.debug("Graph input shape: %s", shape_dict)
            logger.debug("Graph input type: %s", dtype_dict)

            inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs,
                                           fill_mode)

            logger.debug("Setting inputs to the module.")
            module.set_input(**inputs_dict)

            # Run must be called explicitly if profiling
            if profile:
                logger.info("Running the module with profiling enabled.")
                report = module.profile()
                # This print is intentional
                print(report)

            if device == "micro":
                # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
                # fixed module.benchmark() can be used instead and this if/else can
                # be removed.
                module.run()
                times = []
            else:
                # Call the benchmarking function of the executor.
                # Optionally measure e2e data transfers from the
                # CPU to device memory overheads (e.g. PCIE
                # overheads if the device is a discrete GPU).
                if end_to_end:
                    dev = session.cpu()
                times = module.benchmark(dev,
                                         number=number,
                                         repeat=repeat,
                                         end_to_end=end_to_end)

            logger.debug("Collecting the output tensors.")
            num_outputs = module.get_num_outputs()
            outputs = {}
            for i in range(num_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
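
A hypothetical invocation of run_module through the tvmc Python API (the package path is illustrative):

from tvm.driver.tvmc.model import TVMCPackage

package = TVMCPackage(package_path="resnet50-v2-7-tvm.tar")  # illustrative path
result = run_module(package, device="cpu", profile=True)
print(result.outputs.keys())  # output_0, output_1, ...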
Example #10
    #     print(len(task.config_space))

    # tune_kernels(tasks, **tuning_option) # tuning
    # tune_graph(mod["main"], data_shape, 'unet_cpu_2_thread.log', graph_opt_sch_file, exec_num=1000) # tuning

    # We only need to obtain this opt_sch_file
    with autotvm.apply_graph_best(graph_opt_sch_file):  # graph_opt_sch_file
        print("compile...")
        with tvm.transform.PassContext(opt_level=3):  # set < 3
            # lib = relay.build_module.build(mod,target,params = params)
            lib = relay.build(mod, target, params=params)
        # m = graph_executor.GraphModule(lib["default"](dev))
        with open(graph_opt_sch_file, 'r') as f:
            graph = f.read()
        m = debug_executor.create(graph,
                                  lib['default'],
                                  dev,
                                  dump_root="/tmp/tvmdbg")
        # set input and get_output
        m.set_input(input_name,
                    tvm.nd.array(x.astype(dtype)))  # input_name = 'x'
        # 'x' must be set as the input name here because the earlier channel
        # conversion automatically changed the original model's input name.
        # The real image size and data type must also match the model's inputs.

        # evaluate
        # print("Evaluate inference time cost...")
        # ftimer = m.module.time_evaluator("run", dev, number=10, repeat=3)  # an easy one
        # prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        # print(
        #     "Mean inference time (std dev): %.2f ms (%.2f ms)"
Example #11
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package: TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session (local or remote).
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it
        will be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it.")

    # Currently only two package formats are supported: "classic" and
    # "mlf". The latter can only be used for micro targets, i.e. with µTVM.
    if tvmc_package.type == "mlf":
        raise TVMCException(
            "You're trying to run a model saved using the Model Library Format (MLF)."
            "MLF can only be used to run micro targets (µTVM).")

    if hostname:
        if isinstance(port, str):
            port = int(port)
        # Remote RPC
        if rpc_key:
            logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
            session = request_remote(rpc_key, hostname, port, timeout=1000)
        else:
            logger.debug("Running on remote RPC with no key.")
            session = rpc.connect(hostname, port)
    else:
        # Local
        logger.debug("Running a local session.")
        session = rpc.LocalSession()

    session.upload(tvmc_package.lib_path)
    lib = session.load_module(tvmc_package.lib_name)

    # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
    logger.debug("Device is %s.", device)
    if device == "cuda":
        dev = session.cuda()
    elif device == "cl":
        dev = session.cl()
    elif device == "metal":
        dev = session.metal()
    else:
        assert device == "cpu"
        dev = session.cpu()

    if profile:
        logger.debug("Creating runtime with profiling enabled.")
        module = debug_executor.create(tvmc_package.graph,
                                       lib,
                                       dev,
                                       dump_root="./prof")
    else:
        logger.debug("Creating runtime with profiling disabled.")
        module = runtime.create(tvmc_package.graph, lib, dev)

    logger.debug("Loading params into the runtime module.")
    module.load_params(tvmc_package.params)

    shape_dict, dtype_dict = get_input_info(tvmc_package.graph,
                                            tvmc_package.params)
    inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

    logger.debug("Setting inputs to the module.")
    module.set_input(**inputs_dict)

    # Run must be called explicitly if profiling
    if profile:
        logger.info("Running the module with profiling enabled.")
        module.run()

    # create the module time evaluator (returns a function)
    timer = module.module.time_evaluator("run",
                                         dev,
                                         number=number,
                                         repeat=repeat)
    # call the evaluator function to invoke the module and save execution times
    prof_result = timer()
    # collect a list of execution times from the profiling results
    times = prof_result.results

    logger.debug("Collecting the output tensors.")
    num_outputs = module.get_num_outputs()
    outputs = {}
    for i in range(num_outputs):
        output_name = "output_{}".format(i)
        outputs[output_name] = module.get_output(i).numpy()

    return TVMCResult(outputs, times)
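
A sketch of summarizing the returned times, mirroring the commented-out evaluation code in Example #10 (times are in seconds; package is a hypothetical TVMCPackage):

import numpy as np

result = run_module(package, device="cpu")
prof_ms = np.array(result.times) * 1000  # convert to milliseconds
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (prof_ms.mean(), prof_ms.std()))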
Example #12
                                  disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog,
                    target=tvm.target.Target(target, host=env.target_host),
                    params=params,
                )

        # Export library
        temp = utils.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed, build with the debug runtime
        if opt.debug_profile:
            m = debug_executor.create(graph, lib, ctx)
        else:
            m = graph_executor.create(graph, lib, ctx)

        # Set the network parameters and synthetic input
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
        m.set_input(**params)
        m.set_input("data", image)

        # Perform inference
        timer = m.module.time_evaluator("run",
                                        ctx,
                                        number=4,
                                        repeat=opt.measurements)
        tcost = timer()
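
tcost here is a standard ProfileResult, so the mean latency can be reported directly. A sketch following the measurement setup above:

print("Average inference cost: %.2f ms over %d measurements"
      % (tcost.mean * 1000, opt.measurements))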
Example #13
def run_module(
    module_file,
    device,
    hostname=None,
    port=9090,
    rpc_key=None,
    inputs=None,
    fill_mode="random",
    repeat=1,
    profile=False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    module_file : str
        The path to the module file (a .tar file).
    device : str
        The device (e.g. "cpu" or "gpu") to be targeted by the RPC
        session (local or remote).
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it
        will be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """

    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("extracting module file %s", module_file)
        with tarfile.open(module_file) as t:
            t.extractall(tmp_dir)
        with open(os.path.join(tmp_dir, "mod.json")) as f:
            graph = f.read()
        with open(os.path.join(tmp_dir, "mod.params"), "rb") as f:
            params = bytearray(f.read())

        if hostname:
            # Remote RPC
            if rpc_key:
                logger.debug("running on remote RPC tracker with key %s",
                             rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("running on remote RPC with no key")
                session = rpc.connect(hostname, port)
        else:
            # Local
            logger.debug("running a local session")
            session = rpc.LocalSession()

        session.upload(os.path.join(tmp_dir, "mod.so"))
        lib = session.load_module("mod.so")

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("device is %s", device)
        if device == "gpu":
            dev = session.gpu()
        elif device == "cl":
            dev = session.cl()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if profile:
            logger.debug("creating runtime with profiling enabled")
            module = debug_executor.create(graph, lib, dev, dump_root="./prof")
        else:
            logger.debug("creating runtime with profiling disabled")
            module = runtime.create(graph, lib, dev)

        logger.debug("load params into the runtime module")
        module.load_params(params)

        shape_dict, dtype_dict = get_input_info(graph, params)
        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs,
                                       fill_mode)

        logger.debug("setting inputs to the module")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.debug("running the module with profiling enabled")
            module.run()

        # create the module time evaluator (returns a function)
        timer = module.module.time_evaluator("run", dev, 1, repeat=repeat)
        # call the evaluator function to invoke the module and save execution times
        prof_result = timer()
        # collect a list of execution times from the profiling results
        times = prof_result.results

        logger.debug("collecting the output tensors")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).asnumpy()

        return outputs, times
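
Finally, a hypothetical call to this older run_module variant (the archive path is illustrative):

outputs, times = run_module("compiled_module.tar", "cpu", repeat=5, profile=True)
print(outputs["output_0"].shape, times)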