Example no. 1
def test_canon_target_and_host_0():
    target = None
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target is None
    assert host is None
Example no. 2
def schedule_hwnc_tensorcore_cuda(cfg, s, Conv):
    """Schedule tensorcore template"""
    packed_data, packed_kernel = s[Conv].op.input_tensors
    ic, kh, kw, ii = s[Conv].op.reduce_axis
    pad_data = s[packed_data].op.input_tensors[0]

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    # Designate the memory hierarchy
    AS = s.cache_read(packed_data, 'shared', [Conv])
    WS = s.cache_read(packed_kernel, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    if Conv.op in s.outputs:
        output = Conv
        ConvS = s.cache_read(ConvF, 'shared', [Conv])
        OL = ConvS
    else:
        output = s.outputs[0].output(0)
        s[Conv].set_scope('shared')
        OL = Conv

    out_dtype = Conv.dtype

    if isinstance(packed_kernel.op, te.tensor.ComputeOp) and packed_kernel.name == "packed_kernel":
        if autotvm.GLOBAL_SCOPE.in_tuning:
            s[packed_kernel].pragma(
                s[packed_kernel].op.axis[0], "debug_skip_region")
        else:
            with Target('cuda'):
                schedule_injective_from_existing(s, packed_kernel)

    if isinstance(pad_data.op, te.tensor.ComputeOp) and "pad" in pad_data.op.tag:
        s[pad_data].compute_inline()
        data = pad_data.op.input_tensors[0]

        if autotvm.GLOBAL_SCOPE.in_tuning:
            # skip this part during tuning to make records accurate
            # this part will be pre-computed during NNVM's pre-compute optimization pass
            s[pad_data].pragma(s[pad_data].op.axis[0], "debug_skip_region")
    else:
        data = pad_data
        s[data].compute_inline()

    data_dtype = data.dtype
    kernel_dtype = packed_kernel.dtype

    # Schedule for autotvm
    cfg.define_knob("block_row_warps", [1, 2, 4])
    cfg.define_knob("block_col_warps", [1, 2, 4])
    cfg.define_knob("warp_row_tiles", [1, 2, 4, 8, 16])
    cfg.define_knob("warp_col_tiles", [1, 2, 4, 8, 16])
    cfg.define_knob("chunk", [1, 2, 4, 8])
    cfg.define_knob("fuse_pack", [0, 1])
    cfg.define_knob("split_block_k_nums", [1, 2, 4, 8, 16, 32])
    cfg.define_knob("vector_ws", [1, 8])
    cfg.define_knob("vector_as", [1, 8, 16])

    block_row_warps = cfg["block_row_warps"].val
    block_col_warps = cfg["block_col_warps"].val
    warp_row_tiles = cfg["warp_row_tiles"].val
    warp_col_tiles = cfg["warp_col_tiles"].val
    chunk = cfg["chunk"].val
    vector_as = cfg["vector_as"].val
    vector_ws = cfg["vector_ws"].val
    split_block_k_nums = cfg["split_block_k_nums"].val
    fuse_pack = cfg["fuse_pack"].val

    if not fuse_pack:
        s[packed_data].compute_inline()
    else:
        with Target('cuda'):
            schedule_injective_from_existing(s, packed_data)

    if data_dtype in ['int4', 'uint4']:
        wmma_m = wmma_n = 8
        wmma_k = 32
    else:
        wmma_m = 8
        wmma_n = 32
        wmma_k = 16

    warp_size = 32

    # Schedule for output
    if len(s[output].op.axis) == 4:
        hc, wc, nc, oc = output.op.axis
        nc, nnc = s[output].split(nc, factor=wmma_m)
        oc, ooc = s[output].split(oc, factor=wmma_n)
    else:
        hc, wc, nc, oc, nnc, ooc = output.op.axis

    kernel_scope, hc = s[output].split(hc, nparts=1)

    block_k = s[output].fuse(hc, wc)
    block_k, split_block_k = s[output].split(
        block_k, factor=split_block_k_nums)
    nc, nci = s[output].split(nc, factor=warp_row_tiles)
    block_i, nc = s[output].split(nc, factor=block_row_warps)
    oc, oci = s[output].split(oc, factor=warp_col_tiles)
    block_j, oc = s[output].split(oc, factor=block_col_warps)
    s[output].reorder(block_k, split_block_k, block_i,
                      block_j, nc, oc, nci, oci, nnc, ooc)
    t = s[output].fuse(nnc, ooc)
    _, tx = s[output].split(t, factor=warp_size)
    s[output].bind(block_k, block_z)
    s[output].bind(block_i, block_x)
    s[output].bind(block_j, block_y)
    s[output].bind(tx, thread_x)
    s[output].bind(nc, thread_y)
    s[output].bind(oc, thread_z)

    # Schedule wmma store
    s[OL].compute_at(s[output], block_j)
    hc, wc, nc, oc, nnc, ooc = OL.op.axis
    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
    _, oc = s[OL].split(oc, factor=block_col_warps)
    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
    _, nc = s[OL].split(nc, factor=block_row_warps)
    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
    s[OL].bind(nc, thread_y)
    s[OL].bind(oc, thread_z)

    # Schedule local computation
    s[ConvF].compute_at(s[OL], oc)
    _, _, n, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    cfg.define_reorder("reorder_inner", [ko, kh], policy="all")
    cfg["reorder_inner"].apply(s, ConvF, [ko, kh])
    cfg["reorder_inner"].apply(s, ConvF, [ki, kw])

    cfg.define_knob("compute_at_AS", [0, 1, 2, 3])
    cfg.define_knob("compute_at_WS", [0, 1, 2, 3])
    compute_at_AS = cfg["compute_at_AS"].val
    compute_at_WS = cfg["compute_at_WS"].val

    # Move intermediate computation into each output compute tile
    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    # Schedule for A's shared memory
    if compute_at_AS == 0:
        s[AS].compute_at(s[ConvF], ki)
    elif compute_at_AS == 1:
        s[AS].compute_at(s[ConvF], kw)
    elif compute_at_AS == 2:
        s[AS].compute_at(s[ConvF], ko)
    else:
        s[AS].compute_at(s[ConvF], kh)
    _, _, n, _, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, _ = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, nparts=warp_size)
    ti, _t = s[AS].split(ti, factor=vector_as)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)
    s[AS].vectorize(_t)

    # Schedule for W's shared memory
    if compute_at_WS == 0:
        s[WS].compute_at(s[ConvF], ki)
    elif compute_at_WS == 1:
        s[WS].compute_at(s[ConvF], kw)
    elif compute_at_WS == 2:
        s[WS].compute_at(s[ConvF], ko)
    else:
        s[WS].compute_at(s[ConvF], kh)
    s[WS].compute_at(s[ConvF], kw)
    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, _ = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    ti, _t = s[WS].split(ti, factor=vector_ws)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    # double buffer
    cfg.define_knob('AS_double_buffer', [0, 1])
    cfg.define_knob('WS_double_buffer', [0, 1])
    if cfg['AS_double_buffer'].val:
        s[AS].double_buffer()
    if cfg['WS_double_buffer'].val:
        s[WS].double_buffer()

    # unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                     cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', False)

    shape = (wmma_m, wmma_n, wmma_k)

    AS_shape = (wmma_m, wmma_k)
    AL_shape = (wmma_m, wmma_k)
    WS_shape = (wmma_n, wmma_k)
    WL_shape = (wmma_n, wmma_k)
    CL_shape = (wmma_m, wmma_n)
    CS_shape = (wmma_m, wmma_n)

    AL_gemm = te.placeholder(AL_shape, name='A', dtype=data_dtype)
    WL_gemm = te.placeholder(WL_shape, name='B', dtype=kernel_dtype)
    k_gemm = te.reduce_axis((0, wmma_k), name="k")
    CL_compute = te.compute(CL_shape, lambda ii, jj:
                            te.sum((AL_gemm[ii, k_gemm].astype(
                                'int32') * WL_gemm[jj, k_gemm].astype('int32')), axis=k_gemm),
                            name='C')

    AL_strides = [wmma_k, 1]
    AS_strides = [wmma_k, 1]
    WL_strides = [wmma_k, 1]
    WS_strides = [wmma_k, 1]
    CL_strides = [wmma_n, 1]
    CS_strides = [wmma_n, 1]

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape,
                                              "row_major", AS_shape, AL_shape, data_dtype))

    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape,
                                              "col_major", WS_shape, WL_shape, kernel_dtype))

    s[OL].tensorize(nnc, intrin_wmma_store_matrix(CS_strides, CL_strides,
                                                  shape, out_dtype, CL_shape, CS_shape))

    s[ConvF].tensorize(nnf, intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides,
                                             WL_strides, CL_strides, shape))

    return s
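The schedule above is built on TVM's cache_read/cache_write staging plus thread-axis binding. Below is a minimal, hypothetical sketch of that same pattern on a plain matmul (shapes and tile sizes are illustrative and not part of the template above); it only lowers the schedule, so no GPU is needed to run it.

import tvm
from tvm import te

# Small matmul staged through shared memory and registers, mirroring the
# cache_read/cache_write + bind pattern used in the schedule above.
n = 1024
A = te.placeholder((n, n), name="A")
B = te.placeholder((n, n), name="B")
k = te.reduce_axis((0, n), name="k")
C = te.compute((n, n), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

s = te.create_schedule(C.op)
AS = s.cache_read(A, "shared", [C])   # stage A tiles in shared memory
BS = s.cache_read(B, "shared", [C])   # stage B tiles in shared memory
CL = s.cache_write(C, "local")        # accumulate in registers

bx, tx = s[C].split(C.op.axis[0], factor=64)
s[C].bind(bx, te.thread_axis("blockIdx.x"))
s[C].bind(tx, te.thread_axis("threadIdx.x"))
s[CL].compute_at(s[C], tx)
ko = CL.op.reduce_axis[0]
s[AS].compute_at(s[CL], ko)
s[BS].compute_at(s[CL], ko)

print(tvm.lower(s, [A, B, C], simple_mode=True))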
Example no. 3
def build(
    ir_mod,
    target=None,
    target_host=None,
    executor=Executor("graph"),
    runtime=Runtime("cpp"),
    workspace_memory_pools=None,
    params=None,
    mod_name="default",
):
    # fmt: off
    # pylint: disable=line-too-long
    """Helper function that builds a Relay function to run on TVM graph executor.

    Parameters
    ----------
    ir_mod : :py:class:`~tvm.IRModule`
        The IR module to build. Using relay.Function is deprecated.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context to
        target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles a device-specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        and set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    executor : Optional[Executor]
        The executor configuration with which to build the model.
        Defaults to "graph" if no executor specified.

    runtime : Optional[Runtime]
        Runtime configuration to use when building the model.
        Defaults to "cpp" if no runtime specified.

    workspace_memory_pools : Optional[WorkspaceMemoryPools]
        The object that contains an Array of PoolInfo objects
        that hold properties of workspace pools that could be
        used by the inference.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    mod_name: Optional[str]
        The module name we will build

    Returns
    -------
    factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule
            The runtime factory for the TVM graph executor.
    """
    # pylint: enable=line-too-long
    # fmt: on

    if not isinstance(ir_mod, (IRModule, _function.Function)):
        raise ValueError("Type of input parameter mod must be tvm.IRModule")

    if isinstance(ir_mod, _function.Function):
        if params:
            ir_mod = bind_params_by_name(ir_mod, params)
        ir_mod = IRModule.from_expr(ir_mod)
        warnings.warn(
            "Please use input parameter mod (tvm.IRModule) "
            "instead of deprecated parameter mod (tvm.relay.function.Function)",
            DeprecationWarning,
        )

    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) instead."
        )

    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    target = build_target_by_device_type_map(target)
    if isinstance(target_host, (str, Target)):
        target_host = Target(target_host)
    elif target_host:
        raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None")

    # All of this logic is to raise deprecation warnings for various parameters
    # TODO(Mousius) Remove these after some time
    deprecated_params_target = target_host or list(target.values())[0]
    deprecated_executor, deprecated_runtime = _reconstruct_from_deprecated_options(
        deprecated_params_target
    )
    if deprecated_executor:
        executor = deprecated_executor
    if deprecated_runtime:
        runtime = deprecated_runtime

    # If current dispatch context is fallback context (the default root context),
    # then load pre-tuned parameters from TopHub
    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
        tophub_context = autotvm.tophub.context(list(target.values()))
    else:
        tophub_context = autotvm.utils.EmptyContext()

    with tophub_context:
        bld_mod = BuildModule()
        graph_json, runtime_mod, params = bld_mod.build(
            mod=ir_mod,
            target=target,
            params=params,
            executor=executor,
            runtime=runtime,
            workspace_memory_pools=workspace_memory_pools,
            mod_name=mod_name,
        )
        func_metadata = bld_mod.get_function_metadata()
        devices = bld_mod.get_devices()
        lowered_ir_mods = bld_mod.get_irmodule()
        executor_codegen_metadata = bld_mod.get_executor_codegen_metadata()

        if str(executor) == "aot":
            executor_factory = _executor_factory.AOTExecutorFactoryModule(
                ir_mod,
                lowered_ir_mods,
                target,
                executor,
                runtime,
                runtime_mod,
                mod_name,
                params,
                func_metadata,
                executor_codegen_metadata,
                devices,
            )
        elif str(executor) == "graph":
            executor_factory = _executor_factory.GraphExecutorFactoryModule(
                ir_mod, target, executor, graph_json, runtime_mod, mod_name, params, func_metadata
            )
        else:
            assert False, "Executor " + executor + " not supported"

        return executor_factory
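For context, a minimal usage sketch of this helper (the toy module and shapes below are made up for illustration): build a trivial Relay function for LLVM and run it through the graph executor.

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# Hypothetical toy module; any IRModule accepted by build() works the same way.
x = relay.var("x", shape=(1, 3, 8, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

factory = relay.build(mod, target="llvm")             # returns an ExecutorFactoryModule
dev = tvm.cpu(0)
gmod = graph_executor.GraphModule(factory["default"](dev))
gmod.set_input("x", np.random.rand(1, 3, 8, 8).astype("float32"))
gmod.run()
out = gmod.get_output(0).numpy()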
Example no. 4
def extract_from_multiple_program(mods, params, target, target_host=None, ops=None):
    """Extract tuning tasks from multiple relay programs.

    This function collects tuning tasks by building a list of programs
    with a "tracing" target and tracing all the calls to topi.

    Parameters
    ----------
    mods: List[tvm.IRModule] or List[relay.function.Function]
        The list of modules or functions to tune
    params: List of dict of str to numpy array
        The associated parameters of the programs
    target: tvm.target.Target
        The compilation target
    target_host: tvm.target.Target
        The host compilation target
    ops: List[tvm.ir.Op] or None
        List of relay ops to be tuned.  If not specified, all tunable ops will be extracted.

    Returns
    -------
    task: Array of autotvm.task.Task
        collected tasks
    """
    # pylint: disable=import-outside-toplevel
    from tvm import relay
    from tvm import topi

    env = TaskExtractEnv.get()

    # merge target and target host
    target, target_host = Target.check_and_update_host_consist(target, target_host)

    # run compiler to collect all TOPI calls during compilation
    env.reset(ops)
    with env:
        # disable logger temporarily
        old_state = logger.disabled
        logger.disabled = True

        for mod, param in zip(mods, params):
            if isinstance(mod, relay.function.Function):
                mod = tvm.IRModule.from_expr(mod)
            assert isinstance(
                mod, tvm.IRModule
            ), "only support relay Module or Function to be tuned"
            relay.backend.te_compiler.get().clear()
            # wrap build call in thread to avoid multiprocessing problems
            build_thread = threading.Thread(target=_lower, args=(mod, target, param))
            build_thread.start()
            build_thread.join()
            relay.backend.te_compiler.get().clear()
            # Clear the warning message cache in FallbackContext
            if isinstance(DispatchContext.current, FallbackContext):
                DispatchContext.current.memory = {}
                DispatchContext.warning_messages = set()

        logger.disabled = old_state

    # create tasks for target
    tasks = []
    for task_name, args in env.get_tasks():
        try:
            tsk = create(task_name, args, target=target)
            tasks.append(tsk)
        except topi.InvalidShapeError:
            logger.warning("Invalid shape during AutoTVM task creation")

    return tasks
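A hedged sketch of the single-program counterpart, autotvm.task.extract_from_program, which this function essentially applies over a list of modules. `mod` and `params` are assumed to be a Relay IRModule and its parameters provided by the caller.

from tvm import autotvm

# `mod` and `params` are assumed to come from the surrounding code.
tasks = autotvm.task.extract_from_program(mod["main"], params=params, target="cuda")
for i, tsk in enumerate(tasks):
    print(f"task {i}: {tsk.name} {tsk.args}")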
Example no. 5
def test_check_and_update_host_consist_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.check_and_update_host_consist(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
Example no. 6
def test_resnet_subgraph(algorithm, workspace_size):
    target = Target("c")
    global_workspace_pool = usmp_utils.PoolInfo(
        pool_name="global_workspace",
        target_access={target: usmp_utils.PoolInfo.READ_WRITE_ACCESS},
    )
    tir_mod = ResnetStructure
    tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target)
    tir_mod = _assign_poolinfos_to_allocates_in_irmodule(
        tir_mod, [global_workspace_pool])
    main_func = tir_mod["tvmgen_default_run_model"]
    buffer_info_analysis = tvm.tir.usmp.analysis.extract_buffer_info(
        main_func, tir_mod)
    assert buffer_info_analysis.memory_pressure == 7200256

    fcreate_array_bi = tvm.get_global_func("tir.usmp.CreateArrayBufferInfo")
    buffer_info_arr = fcreate_array_bi(buffer_info_analysis.buffer_info_stmts)
    fusmp_algo = tvm.get_global_func(f"tir.usmp.algo.{algorithm}")
    buffer_pool_allocations = fusmp_algo(buffer_info_arr,
                                         buffer_info_analysis.memory_pressure)

    buffer_info_map_names = dict()
    for buf_info in buffer_info_arr:
        buffer_info_map_names[buf_info.name_hint] = buf_info

    # check conflicts
    _verify_conflicts(
        "sid_7",
        [
            "PaddedInput_1",
            "sid_2",
            "Conv2dOutput_1",
            "PaddedInput_2",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "Conv2dOutput_3",
        [
            "PaddedInput_3",
            "sid_6",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "sid_6",
        [
            "Conv2dOutput_2",
            "PaddedInput_2",
            "sid_2",
            "PaddedInput_3",
            "Conv2dOutput_3",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "Conv2dOutput",
        [
            "sid_8",
            "sid_2",
            "PaddedInput",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "PaddedInput_3",
        [
            "sid_6",
            "sid_2",
            "Conv2dOutput_3",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "Conv2dOutput_2",
        [
            "PaddedInput_2",
            "sid_2",
            "sid_6",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "PaddedInput_1",
        [
            "sid_8",
            "sid_2",
            "sid_7",
            "Conv2dOutput_1",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "Conv2dOutput_1",
        [
            "sid_7",
            "PaddedInput_1",
            "sid_2",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "PaddedInput",
        [
            "sid_2",
            "sid_8",
            "Conv2dOutput",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "sid_8",
        [
            "PaddedInput",
            "sid_2",
            "Conv2dOutput",
            "PaddedInput_1",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "sid_2",
        [
            "PaddedInput",
            "sid_8",
            "Conv2dOutput",
            "PaddedInput_1",
            "sid_7",
            "Conv2dOutput_1",
            "PaddedInput_2",
            "Conv2dOutput_2",
            "sid_6",
            "PaddedInput_3",
        ],
        buffer_info_map_names,
    )
    _verify_conflicts(
        "PaddedInput_2",
        [
            "sid_7",
            "sid_2",
            "Conv2dOutput_2",
            "sid_6",
        ],
        buffer_info_map_names,
    )

    _check_max_workspace_size(buffer_pool_allocations, global_workspace_pool,
                              workspace_size)
Example no. 7
def _target() -> Target:
    return Target("cuda --max_threads_per_block=1024", host="llvm")
Example no. 8
def test_canon_multi_target_and_host_3():
    raw_targets = Target.canon_multi_target_and_host(["llvm", "cuda"])
    assert len(raw_targets) == 2
    assert raw_targets[0].kind.name == "llvm"
    assert raw_targets[1].kind.name == "cuda"
Example no. 9
def test_canon_multi_target_and_host_5():
    raw_targets = Target.canon_multi_target_and_host("cuda", "llvm")
    assert len(raw_targets) == 1
    assert raw_targets[0].kind.name == "cuda"
    assert raw_targets[0].host.kind.name == "llvm"
Example no. 10
def test_canon_multi_target_and_host_1():
    raw_targets = Target.canon_multi_target_and_host({"kind": "llvm"})
    assert len(raw_targets) == 1
    assert raw_targets[0].kind.name == "llvm"
Example no. 11
def test_canon_multi_target_and_host_2():
    raw_targets = Target.canon_multi_target_and_host({1: "llvm", 2: "cuda"})
    assert len(raw_targets) == 2
    assert raw_targets[0].kind.name == "llvm"
    assert raw_targets[1].kind.name == "cuda"
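A minimal sketch combining the two forms exercised above, assuming the dict spec composes with an explicit host the same way the string form does:

from tvm.target import Target

raw_targets = Target.canon_multi_target_and_host({"kind": "cuda"}, "llvm")
assert len(raw_targets) == 1
assert raw_targets[0].kind.name == "cuda"
assert raw_targets[0].host.kind.name == "llvm"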
Example no. 12
def test_canon_multi_target_and_host_0():
    with pytest.raises(AssertionError):
        Target.canon_multi_target_and_host(None)
Example no. 13
def test_canon_target_and_host_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
Example no. 14
def test_canon_target_and_host_1():
    target = None
    host = "llvm"
    with pytest.raises(AssertionError, match=r"Target host is not empty when target is empty."):
        target, host = Target.canon_target_and_host(target, host)
Example no. 15
def _target() -> Target:
    return Target("cuda", host="llvm")
Example no. 16
def _target(target: Union[str, Target]) -> Target:
    if isinstance(target, str):
        target = Target(target)
    if not isinstance(target, Target):
        raise TypeError(f"Expected `target` to be str or Target, but gets: {target}")
    return target
Example no. 17
def test_fanout(algorithm, workspace_size):
    """
    The test case here represent BufferInfo objects
    that could get generated for a fanout topology
    such as :
    (Op A)
    |
    bi_a ---------
    |            |
    (Op B)     (Op C)
    |            |
    bi_b        bi_c
    |            |
    (Op D)     (Op E)
    |            |
    bi_d        bi_e
    |            |
    (Op F) ------
    |
    bi_f
    |
    (Op G)
    |
    bi_g
    """
    target = Target("c")
    global_workspace_pool = usmp_utils.PoolInfo(
        pool_name="global_workspace",
        target_access={target: usmp_utils.PoolInfo.READ_WRITE_ACCESS},
    )
    bi_a = usmp_utils.BufferInfo(name_hint="bi_a",
                                 size_bytes=10,
                                 pool_candidates=[global_workspace_pool])
    bi_b = usmp_utils.BufferInfo(name_hint="bi_b",
                                 size_bytes=20,
                                 pool_candidates=[global_workspace_pool])
    bi_c = usmp_utils.BufferInfo(name_hint="bi_c",
                                 size_bytes=100,
                                 pool_candidates=[global_workspace_pool])
    bi_d = usmp_utils.BufferInfo(name_hint="bi_d",
                                 size_bytes=40,
                                 pool_candidates=[global_workspace_pool])
    bi_e = usmp_utils.BufferInfo(name_hint="bi_e",
                                 size_bytes=50,
                                 pool_candidates=[global_workspace_pool])
    bi_f = usmp_utils.BufferInfo(name_hint="bi_f",
                                 size_bytes=60,
                                 pool_candidates=[global_workspace_pool])
    bi_g = usmp_utils.BufferInfo(name_hint="bi_g",
                                 size_bytes=70,
                                 pool_candidates=[global_workspace_pool])

    # Creating conflicts for the fanout graph shown in the docstring
    bi_a.set_conflicts([bi_b, bi_c])
    bi_b.set_conflicts([bi_a, bi_c, bi_e])
    bi_c.set_conflicts([bi_e, bi_a, bi_b, bi_d])
    bi_d.set_conflicts([bi_b, bi_f, bi_c, bi_e])
    bi_e.set_conflicts([bi_c, bi_f, bi_b, bi_d])
    bi_f.set_conflicts([bi_d, bi_e, bi_f])
    bi_g.set_conflicts([bi_f])

    buffer_info_arr = [bi_a, bi_b, bi_c, bi_d, bi_e, bi_f, bi_g]
    fusmp_algo = tvm.get_global_func(f"tir.usmp.algo.{algorithm}")
    buffer_pool_allocations = fusmp_algo(buffer_info_arr, 0)
    _check_max_workspace_size(buffer_pool_allocations, global_workspace_pool,
                              workspace_size)
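A hedged sketch of calling one concrete algorithm directly and inspecting the result. It assumes the BufferInfo objects and pool from the test above, and that the returned map values are PoolAllocation objects exposing pool_info and byte_offset:

import tvm

fusmp_algo = tvm.get_global_func("tir.usmp.algo.greedy_by_size")
allocations = fusmp_algo([bi_a, bi_b, bi_c], 0)
for buf_info, alloc in allocations.items():
    print(buf_info.name_hint, alloc.pool_info.pool_name, int(alloc.byte_offset))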
Example no. 18
def create_executor(kind="debug",
                    mod=None,
                    device=None,
                    target="llvm",
                    params=None):
    """Factory function to create an executor.

    Example
    -------
    .. code-block:: python

        import tvm.relay
        import numpy as np

        x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
        expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
        tvm.relay.create_executor(
            kind="vm", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr))
        ).evaluate()(np.array([2], dtype="float32"))
        # returns `array([3.], dtype=float32)`

    Parameters
    ----------
    kind : str
        The type of executor. Available options are `debug` for the
        interpreter, `graph` for the graph executor, and `vm` for the virtual
        machine.

    mod : :py:class:`~tvm.IRModule`
        The Relay module containing collection of functions

    device : :py:class:`Device`
        The device to execute the code.

    target : :py:class:`tvm.Target`
        The corresponding context

    params : dict of str to NDArray
         Input parameters to the graph that do not change
         during inference time.

    Returns
    -------
    executor : :py:class:`~tvm.relay.backend.interpreter.Executor`
    """
    if mod is None:
        mod = IRModule()
    if device is not None:
        assert device.device_type == _nd.device(str(target), 0).device_type
    else:
        device = _nd.device(str(target), 0)

    if params is not None:
        mod = IRModule.from_expr(bind_params_by_name(mod["main"], params))

    if isinstance(target, str):
        target = Target(target)
    if kind == "debug":
        return _interpreter.Interpreter(mod, device, target)
    if kind == "graph":
        return GraphExecutor(mod, device, target)
    if kind == "vm":
        return VMExecutor(mod, device, target)
    raise RuntimeError("unknown execution strategy: {0}".format(kind))
Example no. 19
def compile_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    package_path: Optional[str] = None,
    cross: Optional[Union[str, Callable]] = None,
    cross_options: Optional[str] = None,
    export_format: str = "so",
    dump_code: Optional[List[str]] = None,
    target_host: Optional[str] = None,
    desired_layout: Optional[str] = None,
    disabled_pass: Optional[str] = None,
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph executor.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model object that should be compiled.
    target : str
        The target for which to compile. Can be a plain string or
        a path.
    tuning_records : str
        A path to tuning records produced using tvmc.tune. When provided,
        compilation will use more optimized kernels leading to better results.
    package_path : str, optional
        The path to export the compiled model to. If not provided it will
        be saved in a temporary directory.
    cross : str or callable object, optional
        Function that performs the actual compilation
    cross_options : str, optional
        Command line options to be passed to the cross compiler.
    export_format : str
        What format to use when saving the function library. Must be one of "so" or "tar".
        When compiling for a remote device without a cross compiler, "tar" will likely work better.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code
        needs to be generated.
    desired_layout: str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    disabled_pass: str, optional
        Comma-separated list of passes which needs to be disabled
        during compilation


    Returns
    -------
    compiled_model : TVMCPackage
        The compiled TVMCModel ready to be run.

    """
    mod, params = tvmc_model.mod, tvmc_model.params

    config = {}

    if desired_layout:
        mod = common.convert_graph_layout(mod, desired_layout)

    tvm_target, extra_targets = common.target_from_cli(target)
    tvm_target, target_host = Target.check_and_update_host_consist(
        tvm_target, target_host)

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])
        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(opt_level=3,
                                               config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = relay.build(mod,
                                               target=tvm_target,
                                               params=params)
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=3,
                                               config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with tuning records")
                    graph_module = relay.build(mod,
                                               target=tvm_target,
                                               params=params)
    else:
        with tvm.transform.PassContext(opt_level=3,
                                       config=config,
                                       disabled_pass=disabled_pass):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = relay.build(mod, target=tvm_target, params=params)

    # Generate output dump files with sources
    if dump_code is None:
        dump_code = []
    if not isinstance(dump_code, list):
        dump_code = [dump_code]
    dumps = {}
    for source_type in dump_code:
        lib = graph_module.get_lib()
        # TODO lib.get_source call has inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(
            source_type)
        dumps[source_type] = source

    # Create a new tvmc model package object from the graph definition.
    package_path = tvmc_model.export_package(graph_module, package_path, cross,
                                             cross_options, export_format)

    # Write dumps to file.
    if dumps:
        save_dumps(package_path, dumps)

    return TVMCPackage(package_path)
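A hypothetical end-to-end use of this function through the tvmc Python API. The model path and options are made up, and it assumes tvm.driver.tvmc re-exports load_model/compile_model/run_module as load/compile/run.

from tvm.driver import tvmc

model = tvmc.load("my_model.onnx")                       # frontends.load_model
package = tvmc.compile(model, target="llvm", export_format="tar",
                       package_path="my_model.tar")
result = tvmc.run(package, device="cpu")
print(result)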
Example no. 20
    def build(
        self, mod, target=None, target_host=None, params=None, executor="graph", mod_name=None
    ):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is device.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            to set up the dimensions and parameters correctly.
            target_host is used to specify the host side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        executor: str[Optional]
            The type of executor to be used in order to run the model:
            - If "graph" is specified, then the graph_executor will be used
            - If "aot" is specified, then the aot_executor will be used

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False
        )

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False
        )

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json() if executor == "graph" else None

        return executor_config, mod, params
Example no. 21
def enabled():
    return "cmsis-nn" in Target.list_kinds()
Example no. 22
def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"):
    # fmt: off
    # pylint: disable=line-too-long
    """Helper function that builds a Relay function to run on TVM graph executor.

    Parameters
    ----------
    ir_mod : :py:class:`~tvm.IRModule`
        The IR module to build. Using relay.Function is deprecated.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context to
        target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles a device-specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        and set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    mod_name: Optional[str]
        The module name we will build

    Returns
    -------
    factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule
            The runtime factory for the TVM graph executor.
    """
    # pylint: enable=line-too-long
    # fmt: on

    if not isinstance(ir_mod, (IRModule, _function.Function)):
        raise ValueError("Type of input parameter mod must be tvm.IRModule")

    if isinstance(ir_mod, _function.Function):
        if params:
            ir_mod = bind_params_by_name(ir_mod, params)
        ir_mod = IRModule.from_expr(ir_mod)
        warnings.warn(
            "Please use input parameter mod (tvm.IRModule) "
            "instead of deprecated parameter mod (tvm.relay.function.Function)",
            DeprecationWarning,
        )
    target = build_target_by_device_type_map(target)
    if isinstance(target_host, (str, Target)):
        target_host = Target(target_host)
    elif target_host:
        raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None")

    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    # Retrieve the executor from the target
    executor = get_executor_from_target(target, target_host)

    # If current dispatch context is fallback context (the default root context),
    # then load pre-tuned parameters from TopHub
    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
        tophub_context = autotvm.tophub.context(list(target.values()))
    else:
        tophub_context = autotvm.utils.EmptyContext()

    with tophub_context:
        bld_mod = BuildModule()
        executor_config, runtime_mod, params = bld_mod.build(
            mod=ir_mod, target=target, params=params, executor=executor, mod_name=mod_name
        )
        func_metadata = bld_mod.get_function_metadata()

        if executor == "aot":
            executor_factory = _executor_factory.AOTExecutorFactoryModule(
                ir_mod, target, runtime_mod, mod_name, params, func_metadata
            )
        elif executor == "graph":
            executor_factory = _executor_factory.GraphExecutorFactoryModule(
                ir_mod, target, executor_config, runtime_mod, mod_name, params, func_metadata
            )
        else:
            assert False, "Executor " + executor + " not supported"

        return executor_factory
Example no. 23
def test_check_and_update_host_consist_0():
    target = None
    host = None
    target, host = Target.check_and_update_host_consist(target, host)
Example no. 24
def _target() -> Target:
    return Target("nvidia/geforce-rtx-3080")
Example no. 25
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records: str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records: str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when rpc_tracker is provided.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the default
        value is chosen as a decent average for most models, but larger models may need
        more trials to reach a good result while smaller models will converge with fewer
        trials.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
        will have their layout set to this format. Tasks will then be tuned using this
        specified layout.
    timeout : int, optional,
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the target hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones when using
        the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of tuning to file.
    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.
    """
    target, extra_targets = target_from_cli(target, additional_target_options)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source
    # model is fixed. For now, creating a clone avoids the issue.
    mod = deepcopy(tvmc_model.mod)
    params = tvmc_model.params
    if tuning_records is None:
        tuning_records = tvmc_model.default_tuning_records_path()

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if min_repeat_ms is None:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.info("Default --min-repeat-ms for this target is %s",
                    min_repeat_ms)

    if rpc_key:
        if hostname is None or port is None:
            raise TVMCException(
                "You must provide a hostname and port to connect to a remote RPC device."
            )
        if isinstance(port, str):
            port = int(port)

        logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key,
                    hostname, port)

        runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=rpc_key,
            host=hostname,
            port=port,
            number=number,
            repeat=repeat,
            n_parallel=parallel,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("Starting localhost tuning.")
        runner_ctor = (auto_scheduler.LocalRPCMeasureContext
                       if enable_autoscheduler else autotvm.LocalRunner)
        local_server = runner_ctor(
            number=number,
            repeat=repeat,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )

        # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object.
        if enable_autoscheduler:
            runner = local_server.runner
        else:
            runner = local_server

    if enable_autoscheduler:

        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=trials,
            measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
            runner=runner,
            early_stopping=early_stopping,
        )

        logger.info("Autoscheduling with configuration: %s", tuning_options)

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, prior_records,
                       log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
        )

        # In autotvm, trials is specified per task. We can convert the per-model input
        # provided to per-task trials by dividing by the number of tasks.
        trials = int(trials / len(tasks))
        logger.info("Autotuning with %d trials per task.", trials)

        tuning_options = {
            "tuner": tuner,
            "trials": trials,
            "early_stopping": early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"), runner=runner
            ),
            "tuning_records": prior_records,
        }
        logger.info("Autotuning with configuration: %s", tuning_options)

        tune_tasks(tasks, tuning_records, **tuning_options)

    return tuning_records
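A hypothetical invocation through the tvmc Python API; the path and trial count are illustrative, and tvmc.tune is assumed to be the re-exported tune_model.

from tvm.driver import tvmc

model = tvmc.load("my_model.onnx")
log_file = tvmc.tune(model, target="llvm", trials=256)
package = tvmc.compile(model, target="llvm", tuning_records=log_file)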
Example no. 26
def _build_for_device(input_mod, target, target_host):
    """Build the lowered functions for a device with the given compilation
    target.

    Parameters
    ----------
    input_mod : IRModule
        The schedule to be built.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    fhost : IRModule
        The host IRModule.

    mdev : tvm.module
        A module that contains device code.
    """
    target = Target(target)
    target_host = Target(target_host)
    device_type = ndarray.context(target.kind.name, 0).device_type

    mod_mixed = input_mod
    mod_mixed = tvm.tir.transform.Apply(
        lambda f: f.with_attr("target", target))(mod_mixed)

    opt_mixed = [tvm.tir.transform.VerifyMemory()]
    if len(mod_mixed.functions) == 1:
        opt_mixed += [
            tvm.tir.transform.Apply(
                lambda f: f.with_attr("tir.is_entry_func", True))
        ]

    if PassContext.current().config.get("tir.detect_global_barrier", False):
        opt_mixed += [tvm.tir.transform.ThreadSync("global")]
    opt_mixed += [
        tvm.tir.transform.ThreadSync("shared"),
        tvm.tir.transform.ThreadSync("warp"),
        tvm.tir.transform.InferFragment(),
        tvm.tir.transform.LowerThreadAllreduce(),
        tvm.tir.transform.MakePackedAPI(),
        tvm.tir.transform.SplitHostDevice(),
    ]
    mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed)

    # device optimizations
    opt_device = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" in f.attrs and f.attrs[
                "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.LowerWarpMemory(),
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
    ])
    mod_dev = opt_device(mod_mixed)

    # host optimizations
    opt_host = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" not in f.attrs or f.attrs[
                "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)),
        tvm.tir.transform.LowerTVMBuiltin(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
        tvm.tir.transform.CombineContextCall(),
    ])
    mod_host = opt_host(mod_mixed)

    if device_type == ndarray.cpu(0).device_type and target_host == target:
        assert len(mod_dev.functions) == 0
    if "gpu" in target.keys and len(mod_dev.functions) == 0:
        warnings.warn(
            "Specified target %s, but cannot find device code, did you do "
            "bind?" % target)

    rt_mod_dev = codegen.build_module(
        mod_dev, target) if len(mod_dev.functions) != 0 else None
    return mod_host, rt_mod_dev
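The host/device split performed above is what makes the device kernel show up as an imported module after tvm.build. A minimal sketch (requires a CUDA-enabled TVM build):

import tvm
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))

mod = tvm.build(s, [A, B], target="cuda", target_host="llvm")
print(mod.imported_modules[0].get_source())   # the generated CUDA kernel source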
Example no. 27
    def build(
        self,
        mod,
        target=None,
        target_host=None,
        executor=Executor("graph"),
        runtime=Runtime("cpp"),
        workspace_memory_pools=None,
        params=None,
        mod_name=None,
    ):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is device.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            to set up the dimensions and parameters correctly.
            target_host is used to specify the host side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        executor : Optional[Executor]
            The executor configuration with which to build the model.
            Defaults to "graph" if no executor specified.

        runtime : Optional[Runtime]
            Runtime configuration to use when building the model.
            Defaults to "cpp" if no runtime specified.

        workspace_memory_pools : Optional[WorkspaceMemoryPools]
            The object that contains an Array of PoolInfo objects
            that hold properties of workspace pools that could be
            used by the inference.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        if target_host is not None:
            warnings.warn(
                "target_host parameter is going to be deprecated. "
                "Please pass in tvm.target.Target(target, host=target_host) instead."
            )
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False
        )

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False
        )

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, runtime, workspace_memory_pools, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json() if str(executor) == "graph" else None

        return executor_config, mod, params
Example no. 28
def build(inputs,
          args=None,
          target=None,
          target_host=None,
          name="default_function",
          binds=None):
    """Build a function with arguments as signature. Code will be generated
    for devices coupled with target information.

    Parameters
    ----------
    inputs : tvm.te.Schedule, IRModule, or dict of target to IRModule
        The schedule to be built

    args : list of Buffer or Tensor or Var, optional
        The argument lists to the function.

    target : str or :any:`tvm.target.Target`, optional
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles a device-specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        and set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    name : str, optional
        The name of result function.

    binds : dict, optional
        Dictionary that maps the binding of symbolic buffer to Tensor.
        By default, a new buffer is created for each tensor in the argument.

    Returns
    -------
    ret : tvm.module
        A module that combines both host and device code.

    Examples
    --------
    There are two typical example uses of this function depending on the type
    of the argument `inputs`:
    1. it is an IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s = tvm.te.create_schedule(C.op)
        m = tvm.lower(s, [A, B, C], name="test_add")
        rt_mod = tvm.build(m, target="llvm")

    2. it is a dict of compilation target to IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s1 = tvm.te.create_schedule(C.op)
        with tvm.target.cuda() as cuda_tgt:
          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
          rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm")

    Note
    ----
    See the note on :any:`tvm.target` on target string format.
    """
    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for build from schedule")
        input_mod = lower(inputs, args, name=name, binds=binds)
    elif isinstance(inputs, (list, tuple, container.Array)):
        merged_mod = tvm.IRModule({})
        for x in inputs:
            merged_mod.update(x)
        input_mod = merged_mod
    elif isinstance(inputs, tvm.IRModule):
        input_mod = inputs
    elif not isinstance(inputs, (dict, container.Map)):
        raise ValueError(
            f"Inputs must be Schedule, IRModule or dict of target to IRModule, "
            f"but got {type(inputs)}.")

    if not isinstance(inputs, (dict, container.Map)):
        target = Target.current() if target is None else target
        target = target if target else "llvm"
        target_input_mod = {target: input_mod}
    else:
        target_input_mod = inputs

    for tar, mod in target_input_mod.items():
        if not isinstance(tar, (str, Target)):
            raise ValueError("The key of inputs must be str or "
                             "Target when inputs is dict.")
        if not isinstance(mod, tvm.IRModule):
            raise ValueError("inputs must be Schedule, IRModule, "
                             "or dict of str to IRModule.")

    if not target_host:
        for tar, _ in target_input_mod.items():
            tar = Target(tar)
            device_type = ndarray.context(tar.kind.name, 0).device_type
            if device_type == ndarray.cpu(0).device_type:
                target_host = tar
                break
    if not target_host:
        target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm"

    mod_host_all = tvm.IRModule({})

    device_modules = []
    for tar, input_mod in target_input_mod.items():
        mod_host, mdev = _build_for_device(input_mod, tar, target_host)
        mod_host_all.update(mod_host)
        device_modules.append(mdev)

    # Generate a unified host module.
    rt_mod_host = codegen.build_module(mod_host_all, target_host)

    # Import all modules.
    for mdev in device_modules:
        if mdev:
            rt_mod_host.import_module(mdev)

    if not isinstance(target_host, Target):
        target_host = Target(target_host)
    if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c"
            and target_host.attrs.get("system-lib", 0).value == 1):
        if target_host.kind.name == "c":
            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateCSourceCrtMetadataModule")
            return create_csource_crt_metadata_module([rt_mod_host],
                                                      target_host)

        if target_host.kind.name == "llvm":
            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateLLVMCrtMetadataModule")
            return create_llvm_crt_metadata_module([rt_mod_host], target_host)

    return rt_mod_host
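A small sketch of the simplest path through the function above: with a pure CPU target and no target_host, the host-inference loop finds that "llvm" is already a CPU target and reuses it as the host. The computation below is illustrative only and assumes an LLVM-enabled TVM build.

# Sketch (assumption: LLVM-enabled TVM). No target_host is passed, so the
# host-inference loop above picks the "llvm" target itself as the host.
import tvm
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
rt_mod = tvm.build(s, [A, B], target="llvm", name="add_one")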
Example no. 29
0
def build(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    target: Optional[Union[str, Target]] = None,
    target_host: Optional[Union[str, Target]] = None,
    name: Optional[str] = "default_function",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
):
    """Build a function with arguments as signature. Code will be generated
    for devices coupled with target information.

    Parameters
    ----------
    inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]]
        The input to be built.

    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]]
        The argument lists to the function.

    target : Optional[Union[str, Target]]
        The target and option of the compilation.

    target_host : Optional[Union[str, Target]]
        Host compilation target, if target is device.
        When TVM compiles device-specific programs such as CUDA,
        we also need host (CPU) side code to interact with the driver
        and set up the dimensions and parameters correctly.
        target_host is used to specify the host-side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    name : Optional[str]
        The name of the result function.

    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
        Dictionary that maps the binding of symbolic buffer to Tensor.
        By default, a new buffer is created for each tensor in the argument.

    Returns
    -------
    ret : tvm.module
        A module that combines both host and device code.

    Examples
    --------
    There are two typical example uses of this function depending on the type
    of the argument `inputs`:
    1. it is an IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s = tvm.te.create_schedule(C.op)
        m = tvm.lower(s, [A, B, C], name="test_add")
        rt_mod = tvm.build(m, target="llvm")

    2. it is a dict of compilation target to IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s1 = tvm.te.create_schedule(C.op)
        with tvm.target.cuda() as cuda_tgt:
          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
          rt_mod = tvm.build({"llvm": m1, "cuda": m2})

    Note
    ----
    See the note on :any:`tvm.target` on target string format.
    """
    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for build from schedule")
        input_mod = lower(inputs, args, name=name, binds=binds)
    elif isinstance(inputs, (list, tuple, container.Array)):
        merged_mod = tvm.IRModule({})
        for x in inputs:
            merged_mod.update(lower(x))
        input_mod = merged_mod
    elif isinstance(inputs, (tvm.IRModule, PrimFunc)):
        input_mod = lower(inputs)
    elif not isinstance(inputs, (dict, container.Map)):
        raise ValueError(
            f"Inputs must be Schedule, IRModule, PrimFunc, or dict of target "
            f"to IRModule, but got {type(inputs)}.")

    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) instead."
        )

    if not isinstance(inputs, (dict, container.Map)):
        target = Target.current() if target is None else target
        target = target if target else "llvm"
        target_input_mod = {target: input_mod}
    else:
        target_input_mod = inputs

    for tar, mod in target_input_mod.items():
        if not isinstance(tar, (str, Target)):
            raise ValueError("The key of inputs must be str or "
                             "Target when inputs is dict.")
        if not isinstance(mod, tvm.IRModule):
            raise ValueError("inputs must be Schedule, IRModule, "
                             "or dict of str to IRModule.")

    target_input_mod, target_host = Target.check_and_update_host_consist(
        target_input_mod, target_host)

    if not target_host:
        for tar, mod in target_input_mod.items():
            tar = Target(tar)
            device_type = ndarray.device(tar.kind.name, 0).device_type
            if device_type == ndarray.cpu(0).device_type:
                target_host = tar
                break
    if not target_host:
        target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm"

    target_input_mod, target_host = Target.check_and_update_host_consist(
        target_input_mod, target_host)

    rt_mod_host = _driver_ffi.preprocess_module(target_input_mod, target_host)

    target_input_mod, target_host = Target.check_and_update_host_consist(
        target_input_mod, target_host)

    if not isinstance(target_host, Target):
        target_host = Target(target_host)
    if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c"
            and target_host.attrs.get("system-lib", 0) == 1):
        if target_host.kind.name == "c":
            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateCSourceCrtMetadataModule")
            to_return = create_csource_crt_metadata_module([rt_mod_host],
                                                           target_host)

        elif target_host.kind.name == "llvm":
            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateLLVMCrtMetadataModule")
            to_return = create_llvm_crt_metadata_module([rt_mod_host],
                                                        target_host)
    else:
        to_return = rt_mod_host

    return OperatorModule.from_module(to_return,
                                      ir_module_by_target=target_input_mod,
                                      name=name)
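Both versions of build above warn that target_host is being deprecated; a hedged sketch of the suggested replacement is to fold the host into the Target object itself. The CUDA schedule below is illustrative and assumes a CUDA-enabled TVM build.

# Sketch of the replacement suggested by the deprecation warning above:
# pass Target("cuda", host="llvm") instead of the target_host keyword.
import tvm
from tvm import te
from tvm.target import Target

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] * 2.0, name="B")
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))

tgt = Target("cuda", host="llvm")  # equivalent to target="cuda", target_host="llvm"
rt_mod = tvm.build(s, [A, B], target=tgt)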
def measure_candidates(database, builder, runner):
    """Send the candidates to builder and runner for distributed measurement,
    and save the results in a new json database.

    Parameters
    ----------
    database : JSONDatabase
        The database for candidates to be measured.
    builder : Builder
        The builder for building the candidates.
    runner : Runner
        The runner for measuring the candidates.

    Returns
    -------
    None
    """
    candidates, runner_results, build_fail_indices, run_fail_indices = [], [], [], []
    context = ms.TuneContext(target=Target(args.target))
    tuning_records = database.get_all_tuning_records()
    for record in tuning_records:
        candidates.append(record.as_measure_candidate())
    with ms.Profiler() as profiler:
        for idx in range(0, len(candidates), args.batch_size):
            batch_candidates = candidates[idx:idx + args.batch_size]
            context._set_measure_candidates(batch_candidates)  # pylint: disable=protected-access
            with ms.Profiler.timeit("build"):
                context._send_to_builder(builder)  # pylint: disable=protected-access
            with ms.Profiler.timeit("run"):
                context._send_to_runner(runner)  # pylint: disable=protected-access
                batch_runner_results = context._join()  # pylint: disable=protected-access
            runner_results.extend(batch_runner_results)
            for i, result in enumerate(context.builder_results):
                if result.error_msg is None:
                    ms.utils.remove_build_dir(result.artifact_path)
                else:
                    build_fail_indices.append(i + idx)
            context._clear_measure_state()  # pylint: disable=protected-access

    model_name, workload_name = database.path_workload.split("/")[-2:]
    record_name = database.path_tuning_record.split("/")[-1]
    new_database = ms.database.JSONDatabase(
        path_workload=os.path.join(args.result_cache_dir, model_name,
                                   workload_name),
        path_tuning_record=os.path.join(args.result_cache_dir, model_name,
                                        record_name),
    )
    workload = tuning_records[0].workload
    new_database.commit_workload(workload.mod)
    for i, (record, result) in enumerate(zip(tuning_records, runner_results)):
        if result.error_msg is None:
            new_database.commit_tuning_record(
                ms.database.TuningRecord(
                    trace=record.trace,
                    workload=workload,
                    run_secs=[v.value for v in result.run_secs],
                    target=Target(args.target),
                ))
        else:
            run_fail_indices.append(i)
    fail_indices_name = workload_name.replace("_workload.json",
                                              "_failed_indices.txt")
    with open(os.path.join(args.result_cache_dir, model_name,
                           fail_indices_name),
              "w",
              encoding="utf8") as file:
        file.write(" ".join([str(n) for n in run_fail_indices]))
    print(
        f"Builder time: {profiler.get()['build']}, "
        f"Runner time: {profiler.get()['run']}\n"
        f"Failed number of builds: {len(build_fail_indices)}, "
        f"Failed number of runs: {len(run_fail_indices)}")
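A hypothetical wiring for measure_candidates: the function relies on a module-level args produced by the original script's argparse (target, batch_size, result_cache_dir), so the namespace, paths, and target string below are illustrative assumptions rather than the script's real configuration.

# Hypothetical driver for measure_candidates. The `args` attributes mirror how
# the function body uses them; every path and the target string are assumptions.
import argparse
import os

from tvm import meta_schedule as ms

args = argparse.Namespace(
    target="llvm -num-cores 8",                # illustrative target string
    batch_size=64,
    result_cache_dir="/tmp/measured_records",  # illustrative output directory
)

if __name__ == "__main__":
    database = ms.database.JSONDatabase(
        path_workload="/tmp/candidates/model/database_workload.json",  # illustrative
        path_tuning_record="/tmp/candidates/model/database_tuning_record.json",
    )
    builder = ms.builder.LocalBuilder()
    runner = ms.runner.LocalRunner()
    # The function writes its results under <result_cache_dir>/<model_name>/.
    os.makedirs(os.path.join(args.result_cache_dir, "model"), exist_ok=True)
    measure_candidates(database, builder, runner)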