Code Example #1
def is_auto_scheduler_enabled():
    """Return whether the auto-scheduler is enabled.

    Returns
    -------
    enabled: bool
        Whether the auto-scheduler or meta-schedule backend is enabled
    """
    return PassContext.current().config.get(
        "relay.backend.use_auto_scheduler",
        False,
    ) or PassContext.current().config.get(
        "relay.backend.use_meta_schedule",
        False,
    )
Code Example #2
    def transform_module(self, mod, _):
        """Invokes the pass"""
        # TODO(@jroesch): Is there a way to do one shot initialization?
        # can we have def pass_init?
        mod.import_from_std("core.rly")
        mod = InferType()(mod)

        assert isinstance(self.targets, (dict, container.Map))
        if len(self.targets) > 1:
            pass_ctx = PassContext.current()
            if "relay.fallback_device_type" in pass_ctx.config:
                fallback_ctx = nd.context(
                    pass_ctx.config["relay.fallback_device_type"])
            else:
                fallback_ctx = cpu(0)
            ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
        else:
            if isinstance(self.targets, dict):
                dev = list(self.targets.keys())[0]
            else:
                dev, _ = self.targets.items()[0]
            ca = context_analysis(mod, nd.context(dev.value))

        # The following code can be used for debugging the module after
        # annotation.
        # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca)))

        gv_funcs = mod.functions
        for gv, f in gv_funcs.items():
            ea = ManifestAllocPass(self.target_host, ca)
            f = ea.visit(f)
            mod.update_func(gv, f)
        return mod
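
transform_module above is the hook of a class-based module pass. A minimal sketch, assuming the standard tvm.transform.module_pass class decorator, of how such a pass is declared and applied; the body here is a placeholder, not the ManifestAlloc logic itself.

import tvm
from tvm import relay


@tvm.transform.module_pass(opt_level=0, name="ExampleModulePass")
class ExampleModulePass:
    """A module-level pass; transform_module receives the IRModule and the PassContext."""

    def transform_module(self, mod, ctx):
        # Placeholder body: a real pass (like the one above) rewrites functions
        # and writes them back into the module here.
        return mod


x = relay.var("x", shape=(2, 2), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.relu(x))
mod = ExampleModulePass()(mod)  # the decorated class instantiates to a ModulePass
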
Code Example #3
    def build(self, mod, target=None, target_host=None, params=None):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str (i.e.
            device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if the target is a device target.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            to set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph runtime.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        target = _update_target(target)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        self._build(mod, target, target_host)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        graph_json = self.get_json()
        mod = self.get_module()
        params = self.get_params()

        return graph_json, mod, params
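
This method backs the public relay.build entry point. A hedged sketch of driving it through that API, with the auto-scheduler flag supplied by the enclosing PassContext; the toy network and the "tuning_records.json" file name are placeholders, and tvm.auto_scheduler is assumed to be available in this TVM version.

import tvm
from tvm import auto_scheduler, relay

x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.relu(x))

# "tuning_records.json" stands in for logs produced by an earlier auto-scheduler run.
with auto_scheduler.ApplyHistoryBest("tuning_records.json"):
    with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_auto_scheduler": True}):
        # relay.build forwards to BuildModule.build; with the flag set, AutoTVM
        # "config not found" warnings are silenced as shown above.
        lib = relay.build(mod, target="llvm", params=None)
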
Code Example #4
File: test_tir_base.py   Project: yhcvb/incubator-tvm
def build_tir_func(func):
    func = func.with_attr("global_symbol", "main")
    pass_ctx = PassContext.current()
    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    mod = tvm.IRModule({"main": func})
    func = tvm.build(mod)
    return func
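
A hedged sketch of exercising the tir.noalias branch above: the option defaults to True, so the attribute is only left off when the surrounding PassContext disables it. te.create_prim_func is assumed to be available, i.e. a reasonably recent TVM.

import tvm
from tvm import te

A = te.placeholder((16,), name="A")
B = te.compute((16,), lambda i: A[i] + 1.0, name="B")
prim_func = te.create_prim_func([A, B])

# Default context: tir.noalias is True, so build_tir_func attaches the attribute.
runtime_mod = build_tir_func(prim_func)

# Explicitly disabling the option skips the with_attr("tir.noalias", ...) call above.
with tvm.transform.PassContext(config={"tir.noalias": False}):
    runtime_mod_noalias_off = build_tir_func(prim_func)
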
Code Example #5
def form_irmodule(sch, args, name, binds):
    """According to the given schedule, form a function.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The given schedule to form the raw body

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        The binds information

    Returns
    -------
    mod : tvm.IRModule
        The IRModule formed according to the given schedule
    """
    # normalize schedule first
    pass_ctx = PassContext.current()
    sch = sch.normalize()
    bounds = schedule.InferBound(sch)
    stmt = schedule.ScheduleOps(sch, bounds)

    compact = schedule.VerifyCompactBuffer(stmt)
    binds, arg_list = get_binds(args, compact, binds)

    stmt = schedule.SchedulePostProcRewriteForTensorCore(stmt, sch, binds)
    func = schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds)

    func = func.with_attr("global_symbol", name)

    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    return tvm.IRModule({name: func})
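
form_irmodule is an internal helper of the TE build flow (tvm.driver.build_module in the TVM version these snippets come from). A minimal sketch of the kind of schedule it consumes, assuming the helper above is in scope.

import tvm
from tvm import te

A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] * 2.0, name="B")
sch = te.create_schedule(B.op)

# form_irmodule as defined above; name and binds follow its signature.
mod = form_irmodule(sch, [A, B], name="double", binds=None)
print(mod)  # an IRModule holding a single PrimFunc named "double"
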
Code Example #6
def _build_for_device(input_mod, target, target_host):
    """Build the lowered functions for a device with the given compilation
    target.

    Parameters
    ----------
    input_mod : IRModule
        The input IRModule to be built.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    mod_host : IRModule
        The host IRModule.

    rt_mod_dev : tvm.runtime.Module
        A runtime module that contains device code, or None if the input has
        no device functions.
    """
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    device_type = ndarray.device(target.kind.name, 0).device_type

    mod_mixed = input_mod
    mod_mixed = tvm.tir.transform.Apply(
        lambda f: f.with_attr("target", target))(mod_mixed)

    opt_mixed = [tvm.tir.transform.VerifyMemory()]
    if len(mod_mixed.functions) == 1:
        opt_mixed += [
            tvm.tir.transform.Apply(
                lambda f: f.with_attr("tir.is_entry_func", True))
        ]

    if PassContext.current().config.get("tir.detect_global_barrier", False):
        opt_mixed += [tvm.tir.transform.ThreadSync("global")]
    opt_mixed += [
        tvm.tir.transform.ThreadSync("shared"),
        tvm.tir.transform.ThreadSync("warp"),
        tvm.tir.transform.InferFragment(),
        tvm.tir.transform.LowerThreadAllreduce(),
        tvm.tir.transform.MakePackedAPI(),
        tvm.tir.transform.SplitHostDevice(),
    ]
    mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed)

    # device optimizations
    opt_device = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" in f.attrs and f.attrs[
                "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.LowerWarpMemory(),
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
    ])
    mod_dev = opt_device(mod_mixed)

    # host optimizations
    opt_host = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" not in f.attrs or f.attrs[
                "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)),
        tvm.tir.transform.LowerTVMBuiltin(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
        tvm.tir.transform.CombineContextCall(),
    ])
    mod_host = opt_host(mod_mixed)

    if device_type == ndarray.cpu(0).device_type and target_host == target:
        assert len(mod_dev.functions) == 0
    if "gpu" in target.keys and len(mod_dev.functions) == 0:
        warnings.warn(
            "Specified target %s, but cannot find device code, did you do "
            "bind?" % target)

    rt_mod_dev = codegen.build_module(
        mod_dev, target) if len(mod_dev.functions) != 0 else None
    return mod_host, rt_mod_dev
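
The tir.detect_global_barrier branch above is driven by a registered PassContext option. A hedged sketch of turning it on around an ordinary tvm.build call, shown with an llvm target so it runs without a GPU.

import tvm
from tvm import te

A = te.placeholder((256,), name="A")
B = te.compute((256,), lambda i: A[i] + 1.0, name="B")
sch = te.create_schedule(B.op)

with tvm.transform.PassContext(config={"tir.detect_global_barrier": True}):
    # tvm.build lowers the schedule and calls _build_for_device per target;
    # with the option set, ThreadSync("global") is appended to the mixed pipeline.
    lib = tvm.build(sch, [A, B], target="llvm")
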
Code Example #7
def lower(sch, args, name="main", binds=None, simple_mode=False):
    """Lowering step before build into target.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The schedule to be built

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str, optional
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool, optional
        Whether to only output a simple and compact statement; this skips
        LoopPartition, API wrapper generation, and unrolling.

    Returns
    -------
    m : IRModule
       The result IRModule. With simple_mode=True, LoopPartition is skipped,
       but an IRModule is still returned.
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    if isinstance(sch, schedule.Schedule):
        mod = form_irmodule(sch, args, name, binds)
    else:
        mod = sch

    pass_list = lower_phase0
    # Phase 1
    pass_list += [
        tvm.tir.transform.InjectPrefetch(),
        tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
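
The tir.add_lower_pass entries consumed above are (phase, pass) pairs. A minimal sketch of injecting a custom (here no-op) pass at phase 1 through the PassContext, following the documented custom lowering-pass pattern.

import tvm
from tvm import te

A = te.placeholder((128,), name="A")
B = te.compute((128,), lambda i: A[i] + 1.0, name="B")
sch = te.create_schedule(B.op)


@tvm.tir.transform.prim_func_pass(opt_level=0)
def noop_pass(func, mod, ctx):
    # Placeholder custom pass: return the PrimFunc unchanged.
    return func


with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, noop_pass)]}):
    ir_mod = tvm.lower(sch, [A, B], name="add_one")
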
Code Example #8
    def build(self,
              mod,
              target=None,
              target_host=None,
              params=None,
              executor="graph",
              mod_name=None):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str (i.e.
            device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if the target is a device target.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            to set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        executor : Optional[str]
            The type of executor to be used in order to run the model:
            - If "graph" is specified, then the graph_executor will be used
            - If "aot" is specified, then the aot_executor will be used

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json(
        ) if executor == "graph" else None

        return executor_config, mod, params
Code Example #9
def select_implementation(op,
                          attrs,
                          inputs,
                          out_type,
                          target,
                          use_autotvm=True):
    """Select the best implementation from the op strategy.

    If use_autotvm is True, it'll first try to find the best implementation
    based on AutoTVM profile results. If no AutoTVM profile result is found,
    it'll choose the implementation with highest plevel.

    If use_autotvm is False, it'll directly choose the implementation with
    highest plevel.

    Note that this function doesn't support ops with symbolic input shapes.

    Parameters
    ----------
    op : tvm.ir.Op
        Relay operator.

    attrs : object
        The op attribute.

    inputs : List[tvm.te.Tensor]
        Input tensors to the op.

    out_type : relay.Type
        The output type.

    target : tvm.target.Target
        The target to compile the op.

    use_autotvm : bool
        Whether to query AutoTVM to pick the best implementation.

    Returns
    -------
    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
    # (i.e., always return the implementation with the highest priority for auto-scheduler).
    if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                        False):
        use_autotvm = False

    # If not use autotvm, always return the implementation with the highest priority
    if not use_autotvm:
        logger.info(
            "Using %s for %s based on highest priority (%d)",
            best_plevel_impl.name,
            op.name,
            best_plevel_impl.plevel,
        )
        outs = best_plevel_impl.compute(attrs, inputs, out_type)
        return best_plevel_impl, outs

    # Otherwise, try autotvm templates
    outputs = {}
    workloads = {}
    best_autotvm_impl = None
    best_cfg = None
    dispatch_ctx = autotvm.task.DispatchContext.current
    old_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = True
    for impl in all_impls:
        outs = impl.compute(attrs, inputs, out_type)
        outputs[impl] = outs
        workload = autotvm.task.get_workload(outs)
        workloads[impl] = workload
        if workload is None:
            # Not an AutoTVM tunable implementation
            continue
        cfg = dispatch_ctx.query(target, workload)
        if cfg.is_fallback:
            # Skip fallback config
            continue
        logger.info("Implementation %s for %s has cost %.2e", impl.name,
                    op.name, cfg.cost)
        if best_cfg is None or best_cfg.cost > cfg.cost:
            best_autotvm_impl = impl
            best_cfg = cfg
    autotvm.GLOBAL_SCOPE.silent = old_silent

    if best_autotvm_impl:
        # The best autotvm implementation definitely doesn't use fallback config
        logger.info(
            "Using %s for %s based on lowest cost (%.2e)",
            best_autotvm_impl.name,
            op.name,
            best_cfg.cost,
        )
        return best_autotvm_impl, outputs[best_autotvm_impl]

    # Use the implementation with highest plevel
    if workloads[best_plevel_impl] is not None:
        msg = (
            "Cannot find tuning records for:\n    target=%s\n    key=%s\n"
            "TVM will apply a default schedule which may negatively impact performance."
            % (target, workloads[best_plevel_impl]))
        if (not autotvm.env.GLOBAL_SCOPE.silent
                and msg not in autotvm.task.DispatchContext.warning_messages):
            autotvm.task.DispatchContext.warning_messages.add(msg)
            global _first_warning
            if _first_warning:
                _first_warning = False
                info_msg = (
                    "One or more operators have not been tuned. Please tune your model "
                    "for better performance. Use DEBUG logging level to see more details."
                )
                autotvm_logger.warning(info_msg)
            autotvm_logger.debug(msg)

    logger.info(
        "Using %s for %s based on highest priority (%s)",
        best_plevel_impl.name,
        op.name,
        best_plevel_impl.plevel,
    )
    return best_plevel_impl, outputs[best_plevel_impl]
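
dispatch_ctx.query only returns a non-fallback config when tuning records are loaded. A hedged sketch of supplying them before compilation; the "tuning.log" file name and the toy conv2d module are placeholders.

import tvm
from tvm import autotvm, relay

x = relay.var("x", shape=(1, 3, 32, 32), dtype="float32")
w = relay.var("w", shape=(8, 3, 3, 3), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.conv2d(x, w, padding=(1, 1)))

# "tuning.log" stands in for records produced by an earlier AutoTVM run;
# inside this context, select_implementation can find a non-fallback config.
with autotvm.apply_history_best("tuning.log"):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm")
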
Code Example #10
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.
                schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and judge_winograd_tensorcore):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                            False):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
Code Example #11
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8",
                                                                    "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False)
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )

        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if ((N % 16 == 0 and in_channels % 16 == 0
                 and out_channels % 16 == 0)
                    or (N % 8 == 0 and in_channels % 16 == 0
                        and out_channels % 32 == 0)
                    or (N % 32 == 0 and in_channels % 16 == 0
                        and out_channels % 8 == 0) and
                (data.dtype in tensorcore_dtypes
                 and kernel.dtype in tensorcore_dtypes)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Code Example #12
File: build_module.py   Project: lfengad/incubator-tvm
    def build(
        self,
        mod,
        target=None,
        target_host=None,
        executor=Executor("graph"),
        runtime=Runtime("cpp"),
        workspace_memory_pools=None,
        params=None,
        mod_name=None,
    ):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str (i.e.
            device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if the target is a device target.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            to set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        executor : Optional[Executor]
            The executor configuration with which to build the model.
            Defaults to "graph" if no executor specified.

        runtime : Optional[Runtime]
            Runtime configuration to use when building the model.
            Defaults to "cpp" if no runtime specified.

        workspace_memory_pools : Optional[WorkspaceMemoryPools]
            The object that contains an Array of PoolInfo objects
            that hold properties of workspace pools that could be
            used by the inference.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        if target_host is not None:
            warnings.warn(
                "target_host parameter is going to be deprecated. "
                "Please pass in tvm.target.Target(target, host=target_host) instead."
            )
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler or old_autotvm_silent

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, runtime,
                    workspace_memory_pools, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json() if str(
            executor) == "graph" else None

        return executor_config, mod, params
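
A hedged sketch of driving this newer interface through relay.build, where Executor and Runtime objects replace the earlier executor string; the toy module is a placeholder.

import tvm
from tvm import relay
from tvm.relay.backend import Executor, Runtime

x = relay.var("x", shape=(1, 16), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.relu(x))

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(
        mod,
        target="llvm",
        executor=Executor("graph"),
        runtime=Runtime("cpp"),
    )
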
Code Example #13
def lower(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    name: str = "main",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
    simple_mode: bool = False,
) -> IRModule:
    """Lowering step before build into target.

    Parameters
    ----------
    inputs : Union[schedule.Schedule, PrimFunc, IRModule]
        The TE schedule or TensorIR PrimFunc/IRModule to be built

    args : Optional[List[Union[Buffer, tensor.Tensor, Var]]]
        The argument lists to the function for TE schedule.
        It should be None if we want to lower TensorIR.

    name : str
        The name of result function.

    binds : Optional[Mapping[tensor.Tensor, Buffer]]
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool
        Whether to only output a simple and compact statement; this skips
        LoopPartition, API wrapper generation, and unrolling.

    Returns
    -------
    m : IRModule
       The result IRModule
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    pass_list = lower_phase0
    is_legacy_te_schedule: bool = False

    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError(
                "args must be given for lowering from TE schedule")
        mod = form_irmodule(inputs, args, name, binds)
        is_legacy_te_schedule = True
    elif isinstance(inputs, PrimFunc):
        func = inputs.with_attr("global_symbol", name)
        if pass_ctx.config.get("tir.noalias", True):
            func = func.with_attr("tir.noalias", True)
        mod = tvm.IRModule({name: func})
    elif isinstance(inputs, IRModule):
        mod = inputs
    else:
        raise TypeError(
            f"tvm.lower expected te.Schedule, PrimFunc or IRModule, but got {type(inputs)}"
        )

    # Phase 1
    if is_legacy_te_schedule:
        pass_list += [
            tvm.tir.transform.InjectPrefetch(),
            tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        ]
    else:
        pass_list += [
            tvm.tir.transform.LowerInitBlock(),
            tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(),
            tvm.tir.transform.ConvertBlocksToOpaque(),
            tvm.tir.transform.CompactBufferAllocation(),
            tvm.tir.transform.FlattenBuffer(),
        ]
    pass_list += [
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]

    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
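
A hedged sketch of the two main input kinds accepted by this lower overload: a TE schedule (args required) and a PrimFunc (args left as None). te.create_prim_func is assumed to be available in this TVM version.

import tvm
from tvm import te

A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] * 2.0, name="B")

# Legacy TE-schedule path: args must be given.
sch = te.create_schedule(B.op)
mod_from_te = tvm.lower(sch, [A, B], name="scale")

# TensorIR PrimFunc path: args stays None; tir.noalias is attached from the
# PassContext default as in the branch above.
prim_func = te.create_prim_func([A, B])
mod_from_func = tvm.lower(prim_func, name="scale")
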