def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs):
    """Tune tasks whose name contains op_name, build the module with the tuned database,
    and check numerical correctness against the VM reference."""
    tgt = "cuda" if "nvidia" in target else target
    dev = tvm.device(tgt, 0)

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=tgt)
        .evaluate()(*[data_np, weight_np])
        .numpy()
    )

    params = {"weight": weight_np}

    extracted_tasks = extract_task_from_relay(relay_mod, target, params)

    tune_tasks = list(
        filter(
            lambda task: op_name in task.task_name,
            extracted_tasks,
        )
    )

    with tempfile.TemporaryDirectory() as work_dir:
        database = tune_extracted_tasks(
            tune_tasks,
            CONFIG,
            work_dir=work_dir,
            sch_rules=lambda: sch_rules,
            postprocs=lambda: postprocs,
        )

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    if "cascadelake" in target:
        asm = lib.lib.get_source("asm")
        assert "vpdpbusd" in asm

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data_np)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
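# Hedged usage sketch for tune_and_test above: it builds the kind of uint8 x int8 dense
# workload that the VNNI ("vpdpbusd") assembly check expects. The shapes, the cascadelake
# target string, and the reliance on the module-level CONFIG used inside tune_and_test are
# illustrative assumptions; the sch_rules/postprocs lists are expected to be supplied by the
# surrounding test file. Treat this as an illustration, not the test suite's actual entry point.
def _example_dense_vnni(sch_rules, postprocs):
    m, n, k = 1024, 1024, 1024
    data_shape = (m, k)
    weight_shape = (n, k)

    data = relay.var("data", shape=data_shape, dtype="uint8")
    weight = relay.var("weight", shape=weight_shape, dtype="int8")
    dense = relay.nn.dense(data, weight, out_dtype="int32")
    relay_mod = tvm.IRModule.from_expr(dense)

    data_np = np.random.uniform(1, 10, size=data_shape).astype("uint8")
    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")

    tune_and_test(
        relay_mod,
        data_np,
        weight_np,
        "dense",
        "llvm -mcpu=cascadelake -num-cores 4",
        sch_rules,
        postprocs,
    )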
def tune_relay_auto(
    mod: IRModule,
    target: Union[str, Target],
    config: TuneConfig,
    work_dir: str,
    backend: str = "graph",
    params: Optional[Dict[str, NDArray]] = None,
) -> Union[Module, vm.Executable]:
    """A wrapper of `tune_relay` that provides a default setting for the config.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : TuneConfig
        The search strategy config. If None, a default config is created based on
        the number of extracted tasks.
    work_dir : str
        The working directory to save intermediate results.
    backend : str = "graph"
        The backend to use for relay compilation (graph / vm).
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program.

    Returns
    -------
    lib : Union[Module, tvm.runtime.vm.Executable]
        The built runtime module or vm Executable for the given relay workload.
    """
    target = default_config.target(target)
    extracted_tasks = extract_task_from_relay(mod, target, params)
    if config is None:
        config = TuneConfig(
            num_trials_per_iter=16,
            max_trials_global=16 * len(extracted_tasks),
        )
    database = tune_extracted_tasks(extracted_tasks, config, work_dir)
    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={
                "relay.backend.use_meta_schedule": True,
                "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
            },
        ):
            return relay_build(mod, target=target, params=params)
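# Hedged usage sketch for tune_relay_auto above. The small dense module, its shapes, and the
# temporary work_dir are illustrative assumptions (tempfile is assumed to be imported, as in
# the tests in this file); passing config=None exercises the default-TuneConfig branch that
# scales max_trials_global with the number of extracted tasks.
def _example_tune_relay_auto():
    data = relay.var("data", shape=(128, 128), dtype="float32")
    weight = relay.var("weight", shape=(128, 128), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight))

    weight_np = np.random.uniform(-1, 1, size=(128, 128)).astype("float32")
    params = {"weight": tvm.nd.array(weight_np)}

    with tempfile.TemporaryDirectory() as work_dir:
        lib = tune_relay_auto(
            mod=mod,
            target="llvm -num-cores 4",
            config=None,
            work_dir=work_dir,
            backend="graph",
            params=params,
        )
    return lib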
def _test_bert_int8(target, sch_rules, postprocs):
    """Tune the int8 dense / batch_matmul tasks extracted from quantized BERT-base,
    build with the tuned database, and benchmark the resulting module."""
    relay_mod, params, input_info = load_quantized_bert_base()

    relay_mod = relay.transform.FastMath()(relay_mod)

    extracted_tasks = extract_task_from_relay(relay_mod, target, params)

    tune_tasks = []

    for task in filter(
        lambda task: "dense" in task.task_name or "batch_matmul" in task.task_name,
        extracted_tasks,
    ):
        relay_func = list(task.mod.functions.values())[0]
        out_type = relay_func.body.checked_type

        if out_type.dtype != "float32":
            tune_tasks.append(task)

    with tempfile.TemporaryDirectory() as work_dir:
        database = tune_extracted_tasks(
            tune_tasks,
            config,
            work_dir=work_dir,
            sch_rules=lambda: sch_rules,
            postprocs=lambda: postprocs,
        )

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    dev = tvm.device("cuda" if "nvidia" in target else target, 0)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    inputs = []

    for name, shape in input_info:
        arr = np.random.uniform(1, 10, size=shape).astype("int64")
        runtime.set_input(name, arr)
        inputs.append(arr)

    print(runtime.benchmark(dev, number=1, repeat=50).mean)
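# Hedged driver sketch for _test_bert_int8 above: the two target strings illustrate the CPU
# (VNNI) and GPU paths that the "nvidia" check inside the function distinguishes. The GPU
# target tag and the use_gpu flag are assumptions for illustration; the sch_rules/postprocs
# lists are expected to come from the surrounding test file.
def _example_bert_int8(sch_rules, postprocs, use_gpu=False):
    if use_gpu:
        target = "nvidia/geforce-rtx-3070"  # assumed target tag registered in this TVM build
    else:
        target = "llvm -mcpu=cascadelake -num-cores 4"
    _test_bert_int8(target, sch_rules, postprocs)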
def manual_tir_common(do_tune=False):
    """Common test body: build an int8 dense + batch_matmul workload where dense is
    scheduled with the manual VNNI TIR schedule (tuned, or applied as a fixed schedule),
    then check correctness against the VM reference."""
    M, N, K = 1024, 1024, 1024  # pylint: disable=invalid-name
    data_shape = (M, K)
    weight_shape = (N, K)

    data_dtype = "uint8"
    data = relay.var("data", shape=data_shape, dtype=data_dtype)
    weight = relay.var("weight", shape=weight_shape, dtype="int8")
    bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")

    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
    dense = relay.nn.dense(data, weight, out_dtype="int32")
    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
    out = relay.nn.batch_matmul(
        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
        out_dtype="int32",
    )

    relay_mod = tvm.IRModule.from_expr(out)

    target = "llvm -mcpu=cascadelake -num-cores 4"
    dev = tvm.device(target, 0)

    data = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=target)
        .evaluate()(*[data, weight_np, bias_np])
        .numpy()
    )

    params = {"weight": weight_np, "bias": bias_np}

    if do_tune:
        extracted_tasks = extract_task_from_relay(relay_mod, target, params)

        # Filter out tasks that we don't intend to schedule / tune with TIR.
        tune_tasks = list(
            filter(
                lambda task: "dense" in task.task_name,
                extracted_tasks,
            )
        )
        config = TuneConfig(
            strategy="replay_trace",
            num_trials_per_iter=64,
            max_trials_per_task=20000,
            max_trials_global=20000,
        )

        with tempfile.TemporaryDirectory() as work_dir:
            # postprocs=lambda: [] is important to prevent default post processors from
            # tampering with the manual schedule.
            database = tune_extracted_tasks(
                tune_tasks,
                config,
                work_dir=work_dir,
                postprocs=lambda: [],
            )
    else:

        def schedule_fn(task, sch):
            if "dense" not in task.task_name:
                return False

            block = sch.get_block("compute")

            # Looks up schedule_rule annotation.
            # See the comment in test_tune_relay_manual_tir_vnni().
            schedule_rule = sch.get(block).annotations["schedule_rule"]

            assert "dense_vnni" in schedule_rule

            schedule_dense(block, M, False, sch)

            return True

        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            # pylint: disable=W0105
            """
            The log should say
            Warning: Cannot find workload: tvmgen_default_fused_expand_dims
            Warning: Cannot find workload: tvmgen_default_fused_cast
            Warning: Cannot find workload: tvmgen_default_fused_cast_1
            Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul

            This means batch matmul and others are scheduled by TE, and dense (the one not
            warned) is found in the meta schedule tuning database during ApplyHistoryBest.
            """
            # pylint: enable=W0105
            lib = relay.build(relay_mod, target=target, params=params)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
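# Hedged sketch of the two ways manual_tir_common above is meant to be driven. The name
# test_tune_relay_manual_tir_vnni comes from the comment inside schedule_fn; the tuned
# variant's name is an assumption. Both presume the schedule_dense / apply_fixed_schedules
# helpers referenced above are available in this module.
def test_tune_relay_manual_tir_vnni():
    # Apply the hand-written TIR schedule as a fixed schedule (no search).
    manual_tir_common(do_tune=False)


def _example_manual_tir_with_tuning():
    # Let meta schedule's "replay_trace" strategy tune the dense task instead.
    manual_tir_common(do_tune=True)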