def test_tuple_passing():
    x = relay.var(
        "x",
        type_annotation=relay.ty.TupleType(
            [relay.ty.TensorType((), "int64"), relay.ty.TensorType((), "int64")]
        ),
    )

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = tvm.IRModule({})
    gv = relay.GlobalVar("main")
    mod[gv] = fn
    mod = relay.transform.InferType()(mod)

    dev = tvm.cpu()
    target = tvm.target.Target("llvm")
    exec = relay.create_executor(mod=mod, device=dev, target=target)
    f = exec.evaluate(gv)
    # First use a Python tuple.
    out = f((10, 8))
    tvm.testing.assert_allclose(out.numpy(), np.array(10))
    # Second use a tuple value.
    value_tuple = container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))])
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.numpy(), np.array(11))
def test_tuple_passing():
    x = relay.var('x',
                  type_annotation=relay.ty.TupleType([
                      relay.ty.TensorType((), 'int64'),
                      relay.ty.TensorType((), 'int64')]))

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = relay.Module({})
    gv = relay.GlobalVar('main')
    mod[gv] = fn
    mod = relay.transform.InferType()(mod)

    ctx = tvm.cpu()
    target = tvm.target.create('llvm')
    exec = relay.create_executor(mod=mod, ctx=ctx, target=target)
    f = exec.evaluate(gv)
    # First use a Python tuple.
    out = f((10, 8))
    tvm.testing.assert_allclose(out.asnumpy(), np.array(10))
    # Second use a tuple value.
    value_tuple = container.tuple_object([nd.array(np.array(11)),
                                          nd.array(np.array(12))])
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.asnumpy(), np.array(11))
def adaptive_evaluator(epsilon, remote, build_result, measure_input, ref_input,
                       number, repeat, min_repeat_ms):
    func = remote.load_module(os.path.split(build_result.filename)[1])
    ctx = remote.context(str(measure_input.target), 0)
    flop = measure_input.task.flop

    # set input
    if ref_input:
        args = [nd.array(x, ctx=ctx) for x in ref_input]
    else:
        # Create empty arrays on the remote device and copy them once.
        # This can avoid some memory issues that make the measurement results unreliable.
        args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
        args = [nd.array(x, ctx=ctx) for x in args]
        ctx.sync()

    # Break number * repeat into several batches.
    if repeat * number < 300:
        # Too few runs; no need for the adaptive evaluator.
        time_f = func.time_evaluator(func.entry_name, ctx, number=number,
                                     repeat=repeat, min_repeat_ms=min_repeat_ms)
        eva_res = time_f(*args)
        costs = eva_res.results
    else:
        b_size = 50
        costs = []
        sum_num = 0
        max_iter = number * repeat
        pis = []
        bi = 1
        flag = True
        while flag and sum_num < max_iter:
            time_f = func.time_evaluator(func.entry_name, ctx, number=b_size,
                                         repeat=1, min_repeat_ms=min_repeat_ms)
            b_mean = time_f(*args).mean
            costs.append(b_mean)
            sum_num = sum_num + b_size
            # Throughput of this batch: flop / mean time of batch i
            pi = flop / b_mean
            pis.append(pi)
            pis_array = np.array(pis)
            if len(pis_array) > 4:
                # Remove the min and max to reduce variance.
                pis_array.sort()
                pis_array = pis_array[1:-1]
            # Coefficient of variation of the per-batch throughput.
            cv = pis_array.std() / pis_array.mean()
            if bi > 2 and cv < epsilon:
                flag = False
            bi = bi + 1
    return costs
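# Standalone sketch (not from the original code) of the stopping rule used by
# adaptive_evaluator above: keep timing fixed-size batches and stop early once the
# coefficient of variation (std / mean) of the per-batch throughput falls below epsilon.
# The "measurement" here is a synthetic random latency, only to show the control flow.
import numpy as np


def adaptive_stop_demo(epsilon=0.01, batch=50, max_runs=3000, seed=0):
    rng = np.random.default_rng(seed)
    throughputs = []
    runs = 0
    batch_index = 1
    while runs < max_runs:
        # Stand-in for one timed batch of `batch` kernel invocations.
        batch_mean_latency = 1e-3 * (1.0 + 0.05 * rng.standard_normal())
        throughputs.append(1.0 / batch_mean_latency)  # proxy for the batch's FLOPS
        runs += batch
        samples = np.sort(np.array(throughputs))
        if len(samples) > 4:
            samples = samples[1:-1]  # drop min/max to reduce variance
        cv = samples.std() / samples.mean()
        if batch_index > 2 and cv < epsilon:
            break
        batch_index += 1
    return runs, cv


print(adaptive_stop_demo())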
def broadcast_to(children, attrs, odtype='float32'):
    # TODO(@jroesch) export broadcast to?
    data = children[0]
    shape = attrs.get_int_tuple('shape')
    array = numpy.zeros(shape).astype(odtype)
    rconst = relay.Constant(nd.array(array))
    return op.broadcast_to_like(data, rconst)
def set_task(self, task):
    self.task = task

    if check_remote(task.target, self.key, self.host, self.port):
        logger.info("Get devices for measurement successfully!")
    else:
        raise RuntimeError(
            "Cannot get remote devices from the tracker. "
            "Please check the status of tracker by "
            "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
            "and make sure you have free devices on the queue status.")

    if self.check_correctness:
        # Use llvm cpu to generate a reference input/output.
        # This option works for tuning topi, but might not work for your custom op.
        with _target.create("llvm"):
            s, arg_bufs = task.instantiate(task.config_space.get(0))
        self.ref_input = [
            np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
            for x in arg_bufs
        ]
        func = build(s, arg_bufs, "llvm")
        tvm_buf = [nd.array(x) for x in self.ref_input]
        func(*tvm_buf)
        self.ref_output = [x.asnumpy() for x in tvm_buf]
def test_adt_constructor():
    arr = nd.array([1, 2, 3])
    fields = [arr, arr]
    y = _container.ADT(0, fields)
    assert len(y) == 2
    assert isinstance(y, _container.ADT)
    # Slicing should hand back the stored field values.
    tvm.testing.assert_allclose(y[0:1][-1].asnumpy(), arr.asnumpy())
    assert y.tag == 0
    assert isinstance(arr, nd.NDArray)
def test_tuple_object():
    x = relay.var(
        'x',
        type_annotation=relay.ty.TupleType([
            relay.ty.TensorType((), 'int32'),
            relay.ty.TensorType((), 'int32')]))

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = tvm.IRModule.from_expr(fn)

    exe = relay.create_executor(kind="vm", mod=mod, ctx=nd.cpu(), target="llvm")
    f = exe.evaluate()
    value_tuple = _container.tuple_object(
        [nd.array(np.array(11)), nd.array(np.array(12))])
    # pass an ADT object to evaluate
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.asnumpy(), np.array(11))
def test_function_taking_adt_ref_tuple():
    mod = tvm.IRModule()
    prelude = relay.prelude.Prelude(mod)
    intrp = create_executor("debug", mod)

    _, cons, nil = prelude.mod.get_type("List")
    nil_value = ConstructorValue(nil.tag, [], nil)
    cons_value = ConstructorValue(
        cons.tag,
        [nd.array(np.random.rand(1, 10).astype("float32")), nil_value],
        cons,
    )

    ref_value = RefValue(nd.array(np.random.rand(1, 10).astype("float32")))
    tuple_value = container.tuple_object(
        [nd.array(np.random.rand(1, 10).astype("float32")) for _ in range(10)])

    id_func = intrp.evaluate(prelude.id)

    res_nil = id_func(nil_value)
    assert res_nil.tag == nil_value.tag
    assert len(res_nil.fields) == 0

    res_cons = id_func(cons_value)
    assert res_cons.tag == cons_value.tag
    assert len(res_cons.fields) == len(cons_value.fields)
    tvm.testing.assert_allclose(res_cons.fields[0].asnumpy(),
                                cons_value.fields[0].asnumpy())
    assert isinstance(res_cons.fields[1], ConstructorValue)
    assert res_cons.fields[1].tag == nil.tag
    assert len(res_cons.fields[1].fields) == 0

    res_ref = id_func(ref_value)
    tvm.testing.assert_allclose(res_ref.value.asnumpy(), ref_value.value.asnumpy())

    res_tuple = id_func(tuple_value)
    for i in range(10):
        tvm.testing.assert_allclose(res_tuple[i].asnumpy(), tuple_value[i].asnumpy())
def run_through_rpc(measure_input, build_result,
                    number, repeat, min_repeat_ms, cooldown_interval,
                    remote_args, ref_input=None, ref_output=None):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if hasattr(measure_input.target, 'device_name') and \
                measure_input.target.device_name == 'vta':
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime
            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)
        time_f = func.time_evaluator(func.entry_name, ctx, number=number,
                                     repeat=repeat, min_repeat_ms=min_repeat_ms)

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            # Create empty arrays on the remote device and copy them once.
            # This can avoid some memory issues that make the measurement results unreliable.
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            args = [nd.array(x, ctx=ctx) for x in args]
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
        remote.remove('')

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[:msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[:msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def extract_task_from_relay(
    mod: IRModule,
    target: Target,
    params: Optional[Dict[str, NDArray]] = None,
    *,
    opt_level: int = 3,
    pass_config: Optional[Dict[str, Any]] = None,
    disabled_pass: Optional[List[str]] = None,
) -> List[ExtractedTask]:
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod : IRModule
        The module or function to tune
    target : tvm.target.Target
        The compilation target
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    opt_level : int
        The optimization level of the compiler
    pass_config : Optional[Dict[str, Any]]
        The pass config of the compiler
    disabled_pass : Optional[List[str]]
        The list of disabled passes of the compiler

    Returns
    -------
    tasks: List[ExtractedTask]
        The tasks extracted from this network
    """
    # pylint: disable=import-outside-toplevel
    from tvm.relay import Function as RelayFunc

    # pylint: enable=import-outside-toplevel
    extract_task_func = get_global_func(
        "relay.backend.MetaScheduleExtractTask",
        allow_missing=False,
    )
    if isinstance(mod, RelayFunc):
        mod = IRModule.from_expr(mod)
    if not isinstance(target, Target):
        target = Target(target)
    if disabled_pass is None:
        disabled_pass = []
    if pass_config is None:
        pass_config = {"relay.backend.use_meta_schedule": True}
    if params is None:
        params = {}
    relay_params = {}
    for name, param in params.items():
        if isinstance(param, np.ndarray):
            param = nd.array(param)
        relay_params[name] = param

    with autotvm_silencer(), target, transform.PassContext(
        opt_level=opt_level,
        config=pass_config,
        disabled_pass=disabled_pass,
    ):
        return list(extract_task_func(mod, target, relay_params))
def extract_task_from_relay(
    mod: IRModule,
    target: Target,
    params: Optional[Dict[str, NDArray]] = None,
    *,
    opt_level: int = 3,
    pass_config: Optional[Dict[str, Any]] = None,
    disabled_pass: Optional[List[str]] = None,
    te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None,
) -> List[ExtractedTask]:
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod : IRModule
        The module or function to tune
    target : tvm.target.Target
        The compilation target
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    opt_level : int
        The optimization level of the compiler
    pass_config : Optional[Dict[str, Any]]
        The pass config of the compiler
    disabled_pass : Optional[List[str]]
        The list of disabled passes of the compiler
    te_filter_func : Callable[[List[tvm.te.Tensor]], bool]
        The filter function to filter out the extracted tasks.
        If it's a string, it's the name of the filtering function. Built-in functions are
          - "meta_schedule.DefaultTaskFilter"
          - "meta_schedule.DefaultTaskFilterAllowExtern"
        If it's None, it's the default filtering function.
        If it's a callable, it's the filtering function.

    Returns
    -------
    tasks: List[ExtractedTask]
        The tasks extracted from this network
    """
    # pylint: disable=import-outside-toplevel
    from tvm import autotvm
    from tvm.relay import Function as RelayFunc

    # pylint: enable=import-outside-toplevel
    if isinstance(te_filter_func, str):
        te_filter_func = get_global_func(te_filter_func)
    extract_task_func = get_global_func(
        "relay.backend.MetaScheduleExtractTask",
        allow_missing=False,
    )
    if isinstance(mod, RelayFunc):
        mod = IRModule.from_expr(mod)
    if not isinstance(target, Target):
        target = Target(target)
    if disabled_pass is None:
        disabled_pass = []
    if pass_config is None:
        pass_config = {"relay.backend.use_meta_schedule": True}
    if params is None:
        params = {}
    relay_params = {}
    for name, param in params.items():
        if isinstance(param, np.ndarray):
            param = nd.array(param)
        relay_params[name] = param

    with target, autotvm_silencer(), transform.PassContext(
        opt_level=opt_level,
        config=pass_config,
        disabled_pass=disabled_pass,
    ):
        if target.kind.name != "cuda" and isinstance(
            autotvm.DispatchContext.current, autotvm.FallbackContext
        ):
            tophub_context = autotvm.tophub.context(target)
        else:
            tophub_context = autotvm.utils.EmptyContext()
        with tophub_context:
            return list(extract_task_func(mod, target, relay_params, te_filter_func))
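# Usage sketch (not from the original corpus) showing how extract_task_from_relay might be
# called on a small Relay workload. The import path and the ExtractedTask attribute name
# `task_name` are assumptions and may differ between TVM versions.
import tvm
import tvm.relay.testing
from tvm.meta_schedule.integration import extract_task_from_relay  # assumed module path

relay_mod, relay_params = tvm.relay.testing.mlp.get_workload(batch_size=1)
extracted = extract_task_from_relay(relay_mod, target="llvm", params=relay_params)
for task in extracted:
    # Each extracted task wraps one fused Relay subgraph to be tuned independently.
    print(task.task_name)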
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_args,
    ref_input=None,
    ref_output=None,
    enable_cpu_cache_flush=False,
):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to its
        actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This only has effect on CPU tasks.
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if (
            hasattr(measure_input.target, "device_name")
            and measure_input.target.device_name == "vta"
        ):
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime

            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)

        # Limitation:
        # We can not get PackFunction directly in the remote mode as it is wrapped
        # under the std::function. We could lift the restriction later once we fold
        # the PackedFunc as an object. Currently, we pass function name to work
        # around it.
        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
        time_f = func.time_evaluator(
            func.entry_name,
            ctx,
            number=number,
            repeat=repeat,
            min_repeat_ms=min_repeat_ms,
            f_preproc=f_prepare,
        )

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            try:
                random_fill = remote.get_function("tvm.contrib.random.random_fill")
            except AttributeError:
                raise AttributeError(
                    "Please make sure USE_RANDOM is ON in the config.cmake "
                    "on the remote devices"
                )
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            for arg in args:
                random_fill(arg)
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
        remote.remove("")

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[: msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[: msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_kwargs,
    ref_input,
    enable_cpu_cache_flush=False,
    module_loader=None,
):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_kwargs: dict
        Passed to module_loader(). Ultimately, keyword args to request_remote().
    ref_input: List of np.ndarray
        The reference input used for tuning. Empty for randomly filled input.
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to its
        actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This only has effect on CPU tasks.
    module_loader: ModuleLoader
        A function that returns a ContextManager used to establish and teardown the remote
        session.
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        with module_loader(remote_kwargs, build_result) as (remote, mod):
            dev = remote.device(str(measure_input.target), 0)

            # Limitation:
            # We can not get PackFunction directly in the remote mode as it is wrapped
            # under the std::function. We could lift the restriction later once we fold
            # the PackedFunc as an object. Currently, we pass function name to work
            # around it.
            f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
            time_f = mod.time_evaluator(
                mod.entry_name,
                dev,
                number=number,
                repeat=repeat,
                min_repeat_ms=min_repeat_ms,
                f_preproc=f_prepare,
            )

            if ref_input:
                args = [nd.array(x, device=dev) for x in ref_input]
            else:
                try:
                    random_fill = remote.get_function("tvm.contrib.random.random_fill")
                except AttributeError:
                    raise AttributeError(
                        "Please make sure USE_RANDOM is ON in the config.cmake "
                        "on the remote devices")
                args = [nd.empty(x[0], x[1], dev) for x in build_result.arg_info]
                if "scatter" not in measure_input.task.name:
                    # the index tensor of scatter op cannot be randomly initialized
                    for arg in args:
                        random_fill(arg)
                dev.sync()

            costs = time_f(*args).results

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[:msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[:msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)