def is_auto_scheduler_enabled():
    """Return whether the auto-scheduler is enabled.

    Returns
    -------
    enabled: bool
        Whether the auto-scheduler is enabled
    """
    return PassContext.current().config.get(
        "relay.backend.use_auto_scheduler",
        False,
    ) or PassContext.current().config.get(
        "relay.backend.use_meta_schedule",
        False,
    )
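# Illustrative usage sketch (not part of the original file): the helper above
# reads from the innermost PassContext, so the flag is enabled simply by
# entering a context with the corresponding config key set.
def _example_is_auto_scheduler_enabled():
    """Hedged example: toggle the auto-scheduler flag via PassContext."""
    import tvm

    assert not is_auto_scheduler_enabled()  # neither key set in default config
    with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_auto_scheduler": True}):
        assert is_auto_scheduler_enabled()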
def transform_module(self, mod, _):
    """Invoke the pass: infer a device context for each expression and
    manifest explicit memory allocations."""
    # TODO(@jroesch): Is there a way to do one shot initialization?
    # can we have def pass_init?
    mod.import_from_std("core.rly")
    mod = InferType()(mod)

    assert isinstance(self.targets, (dict, container.Map))
    if len(self.targets) > 1:
        # Heterogeneous compilation: expressions without an explicit device
        # annotation fall back to the configured device type, or CPU.
        pass_ctx = PassContext.current()
        if "relay.fallback_device_type" in pass_ctx.config:
            fallback_ctx = nd.context(
                pass_ctx.config["relay.fallback_device_type"])
        else:
            fallback_ctx = cpu(0)
        ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
    else:
        if isinstance(self.targets, dict):
            dev = list(self.targets.keys())[0]
        else:
            dev, _ = self.targets.items()[0]
        ca = context_analysis(mod, nd.context(dev.value))

    # The following code can be used for debugging the module after
    # annotation.
    # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca)))

    gv_funcs = mod.functions
    for gv, f in gv_funcs.items():
        ea = ManifestAllocPass(self.target_host, ca)
        f = ea.visit(f)
        mod.update_func(gv, f)
    return mod
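# Illustrative sketch (not part of the original file): transform_module()
# runs InferType() before analysis; the pass can also be applied standalone
# to any Relay module, as below.
def _example_infer_type():
    """Hedged example: type-check a small Relay module with InferType."""
    import tvm
    from tvm import relay
    from tvm.relay.transform import InferType

    x = relay.var("x", shape=(2, 2), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], x + x))
    return InferType()(mod)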
def build(self, mod, target=None, target_host=None, params=None):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        to setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph runtime.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    target = _update_target(target)

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

    self._build(mod, target, target_host)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    graph_json = self.get_json()
    mod = self.get_module()
    params = self.get_params()

    return graph_json, mod, params
def build_tir_func(func):
    """Annotate a single TIR PrimFunc and build it into a runtime module."""
    func = func.with_attr("global_symbol", "main")
    pass_ctx = PassContext.current()
    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    mod = tvm.IRModule({"main": func})
    func = tvm.build(mod)
    return func
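# Illustrative sketch (not part of the original file): one way to obtain a
# PrimFunc for build_tir_func() is to lower a small TE compute first. The
# helper name is hypothetical; tvm.te and tvm.lower are standard APIs.
def _example_build_tir_func():
    """Hedged example: lower a TE schedule, then build its "main" PrimFunc."""
    import tvm
    from tvm import te

    A = te.placeholder((16,), name="A")
    B = te.compute((16,), lambda i: A[i] + 1.0, name="B")
    sch = te.create_schedule(B.op)
    prim_func = tvm.lower(sch, [A, B])["main"]
    return build_tir_func(prim_func)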
def form_irmodule(sch, args, name, binds):
    """According to the given schedule, form a function.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The given schedule from which to form the raw body

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str
        The name of the result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        The binds information

    Returns
    -------
    mod : tvm.IRModule
        The IRModule containing the function formed according to the given
        schedule
    """
    # normalize schedule first
    pass_ctx = PassContext.current()
    sch = sch.normalize()
    bounds = schedule.InferBound(sch)
    stmt = schedule.ScheduleOps(sch, bounds)

    compact = schedule.VerifyCompactBuffer(stmt)
    binds, arg_list = get_binds(args, compact, binds)

    stmt = schedule.SchedulePostProcRewriteForTensorCore(stmt, sch, binds)
    func = schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds)

    func = func.with_attr("global_symbol", name)

    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    return tvm.IRModule({name: func})
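# Illustrative sketch (not part of the original file): form_irmodule() can be
# exercised directly on a trivial element-wise schedule; passing binds=None
# lets get_binds() create a compact buffer for each tensor argument.
def _example_form_irmodule():
    """Hedged example: form an IRModule from a small TE schedule."""
    from tvm import te

    A = te.placeholder((8,), name="A")
    B = te.compute((8,), lambda i: A[i] * 2.0, name="B")
    sch = te.create_schedule(B.op)
    return form_irmodule(sch, [A, B], "main", None)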
def _build_for_device(input_mod, target, target_host):
    """Build the lowered functions for a device with the given compilation
    target.

    Parameters
    ----------
    input_mod : IRModule
        The input IRModule to be built.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    mod_host : IRModule
        The host IRModule.

    rt_mod_dev : tvm.module
        A module that contains device code.
    """
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    device_type = ndarray.device(target.kind.name, 0).device_type

    mod_mixed = input_mod
    mod_mixed = tvm.tir.transform.Apply(
        lambda f: f.with_attr("target", target))(mod_mixed)

    opt_mixed = [tvm.tir.transform.VerifyMemory()]
    if len(mod_mixed.functions) == 1:
        opt_mixed += [
            tvm.tir.transform.Apply(
                lambda f: f.with_attr("tir.is_entry_func", True))
        ]

    if PassContext.current().config.get("tir.detect_global_barrier", False):
        opt_mixed += [tvm.tir.transform.ThreadSync("global")]
    opt_mixed += [
        tvm.tir.transform.ThreadSync("shared"),
        tvm.tir.transform.ThreadSync("warp"),
        tvm.tir.transform.InferFragment(),
        tvm.tir.transform.LowerThreadAllreduce(),
        tvm.tir.transform.MakePackedAPI(),
        tvm.tir.transform.SplitHostDevice(),
    ]
    mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed)

    # device optimizations
    opt_device = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" in f.attrs and f.attrs[
                "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.LowerWarpMemory(),
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
    ])
    mod_dev = opt_device(mod_mixed)

    # host optimizations
    opt_host = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" not in f.attrs or f.attrs[
                "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)),
        tvm.tir.transform.LowerTVMBuiltin(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
        tvm.tir.transform.CombineContextCall(),
    ])
    mod_host = opt_host(mod_mixed)

    if device_type == ndarray.cpu(0).device_type and target_host == target:
        assert len(mod_dev.functions) == 0
    if "gpu" in target.keys and len(mod_dev.functions) == 0:
        warnings.warn(
            "Specified target %s, but cannot find device code; did you "
            "forget to bind?" % target)

    rt_mod_dev = codegen.build_module(
        mod_dev, target) if len(mod_dev.functions) != 0 else None
    return mod_host, rt_mod_dev
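# Illustrative sketch (not part of the original file): _build_for_device() is
# normally reached through tvm.build. Binding loops to GPU thread axes is
# what produces DEVICE_KERNEL_LAUNCH functions for the device-side filter
# above; assumes a CUDA-enabled build of TVM.
def _example_build_cuda():
    """Hedged example: a bound schedule whose build exercises the
    host/device split."""
    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    sch = te.create_schedule(B.op)
    bx, tx = sch[B].split(B.op.axis[0], factor=64)
    sch[B].bind(bx, te.thread_axis("blockIdx.x"))
    sch[B].bind(tx, te.thread_axis("threadIdx.x"))
    return tvm.build(sch, [A, B], target="cuda")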
def lower(sch, args, name="main", binds=None, simple_mode=False):
    """Lowering step before build into target.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The schedule to be built

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str, optional
        The name of the result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        Dictionary that maps the Tensor to Buffer which specified the
        data layout requirement of the function. By default, a new compact
        buffer is created for each tensor in the argument.

    simple_mode : bool, optional
        Whether to output only a simple and compact statement; this skips
        LoopPartition, API wrapper generation, and unrolling.

    Returns
    -------
    m : IRModule
        The result IRModule. When simple_mode=True, the module is lowered
        without LoopPartition, the API wrapper, and unrolling.
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    if isinstance(sch, schedule.Schedule):
        mod = form_irmodule(sch, args, name, binds)
    else:
        mod = sch

    pass_list = lower_phase0

    # Phase 1
    pass_list += [
        tvm.tir.transform.InjectPrefetch(),
        tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [tvm.tir.transform.LoopPartition()]
    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]
    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
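# Illustrative sketch (not part of the original file): lower() on a tiny
# symbolic-shape schedule. With simple_mode=True the LoopPartition and
# unrolling phase is skipped, so the printed TIR stays compact.
def _example_lower():
    """Hedged example: lower a TE schedule and print the resulting TIR."""
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    sch = te.create_schedule(B.op)
    mod = lower(sch, [A, B], name="add_one", simple_mode=True)
    print(mod.astext(show_meta_data=False))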
def build(self,
          mod,
          target=None,
          target_host=None,
          params=None,
          executor="graph",
          mod_name=None):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        to setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    executor : Optional[str]
        The type of executor to be used in order to run the model:
        - If "graph" is specified, then the graph_executor will be used
        - If "aot" is specified, then the aot_executor will be used

    mod_name : Optional[str]
        The module name we will build

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph executor.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    target = build_target_by_device_type_map(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False)

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

    mod_name = mangle_module_name(mod_name)

    self._build(mod, target, target_host, executor, mod_name)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    mod = self.get_module()
    params = self.get_params()
    executor_config = self.get_graph_json() if executor == "graph" else None

    return executor_config, mod, params
def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True):
    """Select the best implementation from the op strategy.

    If use_autotvm is True, it'll first try to find the best implementation
    based on AutoTVM profile results. If no AutoTVM profile result is found,
    it'll choose the implementation with highest plevel.

    If use_autotvm is False, it'll directly choose the implementation with
    highest plevel.

    Note that this function doesn't support op with symbolic input shapes.

    Parameters
    ----------
    op : tvm.ir.Op
        Relay operator.

    attrs : object
        The op attribute.

    inputs : List[tvm.te.Tensor]
        Input tensors to the op.

    out_type : relay.Type
        The output type.

    target : tvm.target.Target
        The target to compile the op.

    use_autotvm : bool
        Whether to query AutoTVM to pick the best implementation.

    Returns
    -------
    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
    # (i.e., always return the implementation with the highest priority for
    # the auto-scheduler).
    if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                        False):
        use_autotvm = False

    # If autotvm is not in use, always return the implementation with the
    # highest priority.
    if not use_autotvm:
        logger.info(
            "Using %s for %s based on highest priority (%d)",
            best_plevel_impl.name,
            op.name,
            best_plevel_impl.plevel,
        )
        outs = best_plevel_impl.compute(attrs, inputs, out_type)
        return best_plevel_impl, outs

    # Otherwise, try autotvm templates
    outputs = {}
    workloads = {}
    best_autotvm_impl = None
    best_cfg = None
    dispatch_ctx = autotvm.task.DispatchContext.current
    old_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = True
    for impl in all_impls:
        outs = impl.compute(attrs, inputs, out_type)
        outputs[impl] = outs
        workload = autotvm.task.get_workload(outs)
        workloads[impl] = workload
        if workload is None:
            # Not an AutoTVM tunable implementation
            continue
        cfg = dispatch_ctx.query(target, workload)
        if cfg.is_fallback:
            # Skip fallback config
            continue
        logger.info("Implementation %s for %s has cost %.2e", impl.name,
                    op.name, cfg.cost)
        if best_cfg is None or best_cfg.cost > cfg.cost:
            best_autotvm_impl = impl
            best_cfg = cfg
    autotvm.GLOBAL_SCOPE.silent = old_silent

    if best_autotvm_impl:
        # The best autotvm implementation definitely doesn't use a fallback
        # config.
        logger.info(
            "Using %s for %s based on lowest cost (%.2e)",
            best_autotvm_impl.name,
            op.name,
            best_cfg.cost,
        )
        return best_autotvm_impl, outputs[best_autotvm_impl]

    # Use the implementation with highest plevel
    if workloads[best_plevel_impl] is not None:
        msg = (
            "Cannot find tuning records for:\n target=%s\n key=%s\n"
            "TVM will apply a default schedule which may negatively impact "
            "performance." % (target, workloads[best_plevel_impl]))
        if (not autotvm.env.GLOBAL_SCOPE.silent
                and msg not in autotvm.task.DispatchContext.warning_messages):
            autotvm.task.DispatchContext.warning_messages.add(msg)
            global _first_warning
            if _first_warning:
                _first_warning = False
                info_msg = (
                    "One or more operators have not been tuned. Please tune "
                    "your model for better performance. Use DEBUG logging "
                    "level to see more details.")
                autotvm_logger.warning(info_msg)
            autotvm_logger.debug(msg)

    logger.info(
        "Using %s for %s based on highest priority (%s)",
        best_plevel_impl.name,
        op.name,
        best_plevel_impl.plevel,
    )
    return best_plevel_impl, outputs[best_plevel_impl]
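# Illustrative sketch (not part of the original file): the AutoTVM branch of
# select_implementation() consults the current DispatchContext, so tuned
# records are surfaced by wrapping compilation in apply_history_best().
# "tuning_records.log" is a placeholder path and must exist for this to run.
def _example_apply_tuning_records():
    """Hedged example: make tuned configs visible to select_implementation."""
    from tvm import autotvm

    with autotvm.apply_history_best("tuning_records.log"):
        # Within this scope, dispatch_ctx.query() above returns tuned
        # configs rather than fallbacks, enabling cost-based selection.
        pass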
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilation now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.
                schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and judge_winograd_tensorcore):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be a positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8",
                                                                    "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )

            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )

            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False)
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWNC":
            assert kernel_layout in [
"HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i" ] _, _, N, in_channels = get_const_tuple(data.shape) pre_computed = len(kernel.shape) == 6 if pre_computed: _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple( kernel.shape) out_channels = oc_chunk * oc_block_factor else: _, _, out_channels, _ = get_const_tuple(kernel.shape) tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"] if ((N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0) or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0) or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0) and (data.dtype in tensorcore_dtypes and kernel.dtype in tensorcore_dtypes)): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore), wrap_topi_schedule( topi.cuda.schedule_conv2d_hwnc_tensorcore), name="conv2d_hwnc_tensorcore_direct.cuda", plevel=20, ) else: raise RuntimeError("Unsupported shape for conv2d HWNC.\ Need to satisfy tensor core schedule.") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda", ) else: raise RuntimeError( "Unsupported conv2d layout {} for CUDA".format(layout)) # add cudnn implementation if target.kind.name == "cuda" and "cudnn" in target.libs: if layout in [ "NCHW", "NHWC" ] and padding[0] == padding[2] and padding[1] == padding[3]: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25, ) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda", ) elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda", ) else: raise RuntimeError( "Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d # add cudnn implementation, if any cudnn_impl = False if target.kind.name == "cuda" and "cudnn" in target.libs: if layout in [ "NCHW", "NHWC" ] and padding[0] == padding[2] and padding[1] == padding[3]: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25, ) cudnn_impl = True if layout == "NCHW": # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda", ) elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda", ) elif not cudnn_impl: raise RuntimeError( "Unsupported group_conv2d layout {}".format(layout)) return strategy
def build(
    self,
    mod,
    target=None,
    target_host=None,
    executor=Executor("graph"),
    runtime=Runtime("cpp"),
    workspace_memory_pools=None,
    params=None,
    mod_name=None,
):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        to setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    executor : Optional[Executor]
        The executor configuration with which to build the model.
        Defaults to "graph" if no executor is specified.

    runtime : Optional[Runtime]
        Runtime configuration to use when building the model.
        Defaults to "cpp" if no runtime is specified.

    workspace_memory_pools : Optional[WorkspaceMemoryPools]
        The object that contains an Array of PoolInfo objects
        that hold properties of workspace pools that could be
        used during inference.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    mod_name : Optional[str]
        The module name we will build

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph executor.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) "
            "instead.")

    target = build_target_by_device_type_map(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False)

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler or old_autotvm_silent

    mod_name = mangle_module_name(mod_name)

    self._build(mod, target, target_host, executor, runtime,
                workspace_memory_pools, mod_name)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    mod = self.get_module()
    params = self.get_params()
    executor_config = self.get_graph_json() if str(
        executor) == "graph" else None

    return executor_config, mod, params
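# Illustrative sketch (not part of the original file): the method above is
# normally reached through tvm.relay.build. Executor/Runtime objects select
# the executor codegen and the runtime to link against; assumes an
# LLVM-enabled build of TVM.
def _example_relay_build():
    """Hedged example: build a one-op Relay module with explicit executor
    and runtime configurations."""
    import tvm
    from tvm import relay
    from tvm.relay.backend import Executor, Runtime

    x = relay.var("x", shape=(1, 4), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
    return relay.build(
        mod,
        target="llvm",
        executor=Executor("graph"),
        runtime=Runtime("cpp"),
    )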
def lower(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    name: str = "main",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
    simple_mode: bool = False,
) -> IRModule:
    """Lowering step before build into target.

    Parameters
    ----------
    inputs : Union[schedule.Schedule, PrimFunc, IRModule]
        The TE schedule or TensorIR PrimFunc/IRModule to be built

    args : Optional[List[Union[Buffer, tensor.Tensor, Var]]]
        The argument lists to the function for a TE schedule.
        It should be None if we want to lower TensorIR.

    name : str
        The name of the result function.

    binds : Optional[Mapping[tensor.Tensor, Buffer]]
        Dictionary that maps the Tensor to Buffer which specified the
        data layout requirement of the function. By default, a new compact
        buffer is created for each tensor in the argument.

    simple_mode : bool
        Whether to output only a simple and compact statement; this skips
        LoopPartition, API wrapper generation, and unrolling.

    Returns
    -------
    m : IRModule
        The result IRModule
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    pass_list = lower_phase0
    is_legacy_te_schedule: bool = False

    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError(
                "args must be given for lowering from TE schedule")
        mod = form_irmodule(inputs, args, name, binds)
        is_legacy_te_schedule = True
    elif isinstance(inputs, PrimFunc):
        func = inputs.with_attr("global_symbol", name)
        if pass_ctx.config.get("tir.noalias", True):
            func = func.with_attr("tir.noalias", True)
        mod = tvm.IRModule({name: func})
    elif isinstance(inputs, IRModule):
        mod = inputs
    else:
        raise TypeError(
            f"tvm.lower expected te.Schedule, PrimFunc or IRModule, "
            f"but got {type(inputs)}")

    # Phase 1
    if is_legacy_te_schedule:
        pass_list += [
            tvm.tir.transform.InjectPrefetch(),
            tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        ]
    else:
        pass_list += [
            tvm.tir.transform.LowerInitBlock(),
            tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(),
            tvm.tir.transform.ConvertBlocksToOpaque(),
            tvm.tir.transform.CompactBufferAllocation(),
            tvm.tir.transform.FlattenBuffer(),
        ]
    pass_list += [
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [tvm.tir.transform.LoopPartition()]
    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]
    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
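# Illustrative sketch (not part of the original file): the unified lower()
# accepts a TensorIR PrimFunc directly, in which case args must stay None.
# te.create_prim_func is the standard way to obtain one from a TE compute.
def _example_lower_primfunc():
    """Hedged example: lower a PrimFunc instead of a TE schedule."""
    from tvm import te

    A = te.placeholder((32,), name="A")
    B = te.compute((32,), lambda i: A[i] + 1.0, name="B")
    func = te.create_prim_func([A, B])
    return lower(func, name="add_one")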