def test_tuple_passing():
    x = relay.var(
        "x",
        type_annotation=relay.ty.TupleType(
            [relay.ty.TensorType((), "int64"), relay.ty.TensorType((), "int64")]
        ),
    )

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = tvm.IRModule({})
    gv = relay.GlobalVar("main")
    mod[gv] = fn
    mod = relay.transform.InferType()(mod)

    dev = tvm.cpu()
    target = tvm.target.Target("llvm")
    exec = relay.create_executor(mod=mod, device=dev, target=target)
    f = exec.evaluate(gv)
    # First use a Python tuple.
    out = f((10, 8))
    tvm.testing.assert_allclose(out.numpy(), np.array(10))
    # Second use a tuple value.
    value_tuple = container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))])
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.numpy(), np.array(11))
def test_tuple_passing():
    x = relay.var('x',
                  type_annotation=relay.ty.TupleType([
                      relay.ty.TensorType((), 'int64'),
                      relay.ty.TensorType((), 'int64')]))

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = relay.Module({})
    gv = relay.GlobalVar('main')
    mod[gv] = fn
    mod = relay.transform.InferType()(mod)

    ctx = tvm.cpu()
    target = tvm.target.create('llvm')
    exec = relay.create_executor(mod=mod, ctx=ctx, target=target)
    f = exec.evaluate(gv)
    # First use a Python tuple.
    out = f((10, 8))
    tvm.testing.assert_allclose(out.asnumpy(), np.array(10))
    # Second use a tuple value.
    value_tuple = container.tuple_object([nd.array(np.array(11)),
                                          nd.array(np.array(12))])
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.asnumpy(), np.array(11))
def adaptive_evaluator(epsilon, remote, build_result, measure_input, ref_input,
                       number, repeat, min_repeat_ms):
    func = remote.load_module(os.path.split(build_result.filename)[1])
    ctx = remote.context(str(measure_input.target), 0)
    flop = measure_input.task.flop

    # set input
    if ref_input:
        args = [nd.array(x, ctx=ctx) for x in ref_input]
    else:
        # Create empty arrays on the remote device and copy them once.
        # This can avoid some memory issues that make the measurement results unreliable.
        args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
        args = [nd.array(x, ctx=ctx) for x in args]
        ctx.sync()

    # Break number * repeat into several batches.
    if repeat * number < 300:
        # Too few runs; no need for the adaptive evaluator.
        time_f = func.time_evaluator(func.entry_name, ctx, number=number,
                                     repeat=repeat, min_repeat_ms=min_repeat_ms)
        eva_res = time_f(*args)
        costs = eva_res.results
    else:
        b_size = 50
        costs = []
        sum_num = 0
        max_iter = number * repeat
        pis = []
        bi = 1
        flag = True
        while flag and sum_num < max_iter:
            time_f = func.time_evaluator(func.entry_name, ctx, number=b_size,
                                         repeat=1, min_repeat_ms=min_repeat_ms)
            b_mean = time_f(*args).mean
            costs.append(b_mean)
            sum_num = sum_num + b_size
            # Throughput of this batch: flop / mean time of batch i
            pi = flop / b_mean
            pis.append(pi)
            pis_array = np.array(pis)
            if len(pis_array) > 4:
                # Remove the min and max to reduce variance.
                pis_array.sort()
                pis_array = pis_array[1:-1]
            # Coefficient of variation of the per-batch throughput.
            cv = pis_array.std() / pis_array.mean()
            if bi > 2 and cv < epsilon:
                flag = False
            bi = bi + 1
    return costs
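# Standalone sketch (not from the original code) of the stopping rule used by
# adaptive_evaluator above: keep timing fixed-size batches and stop early once the
# coefficient of variation (std / mean) of the per-batch throughput falls below epsilon.
# The "measurement" here is a synthetic random latency, only to show the control flow.
import numpy as np


def adaptive_stop_demo(epsilon=0.01, batch=50, max_runs=3000, seed=0):
    rng = np.random.default_rng(seed)
    throughputs = []
    runs = 0
    batch_index = 1
    while runs < max_runs:
        # Stand-in for one timed batch of `batch` kernel invocations.
        batch_mean_latency = 1e-3 * (1.0 + 0.05 * rng.standard_normal())
        throughputs.append(1.0 / batch_mean_latency)  # proxy for the batch's FLOPS
        runs += batch
        samples = np.sort(np.array(throughputs))
        if len(samples) > 4:
            samples = samples[1:-1]  # drop min/max to reduce variance
        cv = samples.std() / samples.mean()
        if batch_index > 2 and cv < epsilon:
            break
        batch_index += 1
    return runs, cv


print(adaptive_stop_demo())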
def broadcast_to(children, attrs, odtype='float32'):
    # TODO(@jroesch) export broadcast to?
    data = children[0]
    shape = attrs.get_int_tuple('shape')
    array = numpy.zeros(shape).astype(odtype)
    rconst = relay.Constant(nd.array(array))
    return op.broadcast_to_like(data, rconst)
def set_task(self, task):
    self.task = task

    if check_remote(task.target, self.key, self.host, self.port):
        logger.info("Get devices for measurement successfully!")
    else:
        raise RuntimeError(
            "Cannot get remote devices from the tracker. "
            "Please check the status of tracker by "
            "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
            "and make sure you have free devices on the queue status.")

    if self.check_correctness:
        # Use llvm cpu to generate a reference input/output.
        # This option works for tuning topi, but might not work for your custom op.
        with _target.create("llvm"):
            s, arg_bufs = task.instantiate(task.config_space.get(0))
        self.ref_input = [
            np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
            for x in arg_bufs
        ]
        func = build(s, arg_bufs, "llvm")
        tvm_buf = [nd.array(x) for x in self.ref_input]
        func(*tvm_buf)
        self.ref_output = [x.asnumpy() for x in tvm_buf]
def test_adt_constructor():
    arr = nd.array([1, 2, 3])
    fields = [arr, arr]
    y = _container.ADT(0, fields)
    assert len(y) == 2
    assert isinstance(y, _container.ADT)
    # Slicing should hand back the stored field values.
    tvm.testing.assert_allclose(y[0:1][-1].asnumpy(), arr.asnumpy())
    assert y.tag == 0
    assert isinstance(arr, nd.NDArray)
def test_tuple_object():
    x = relay.var(
        'x',
        type_annotation=relay.ty.TupleType([
            relay.ty.TensorType((), 'int32'),
            relay.ty.TensorType((), 'int32')]))

    fn = relay.Function([x], relay.expr.TupleGetItem(x, 0))
    mod = tvm.IRModule.from_expr(fn)

    exe = relay.create_executor(kind="vm", mod=mod, ctx=nd.cpu(), target="llvm")
    f = exe.evaluate()
    value_tuple = _container.tuple_object(
        [nd.array(np.array(11)), nd.array(np.array(12))])
    # pass an ADT object to evaluate
    out = f(value_tuple)
    tvm.testing.assert_allclose(out.asnumpy(), np.array(11))
def test_function_taking_adt_ref_tuple():
    mod = tvm.IRModule()
    prelude = relay.prelude.Prelude(mod)
    intrp = create_executor("debug", mod)

    _, cons, nil = prelude.mod.get_type("List")
    nil_value = ConstructorValue(nil.tag, [], nil)
    cons_value = ConstructorValue(
        cons.tag,
        [nd.array(np.random.rand(1, 10).astype("float32")), nil_value],
        cons,
    )

    ref_value = RefValue(nd.array(np.random.rand(1, 10).astype("float32")))
    tuple_value = container.tuple_object(
        [nd.array(np.random.rand(1, 10).astype("float32")) for _ in range(10)])

    id_func = intrp.evaluate(prelude.id)

    res_nil = id_func(nil_value)
    assert res_nil.tag == nil_value.tag
    assert len(res_nil.fields) == 0

    res_cons = id_func(cons_value)
    assert res_cons.tag == cons_value.tag
    assert len(res_cons.fields) == len(cons_value.fields)
    tvm.testing.assert_allclose(res_cons.fields[0].asnumpy(),
                                cons_value.fields[0].asnumpy())
    assert isinstance(res_cons.fields[1], ConstructorValue)
    assert res_cons.fields[1].tag == nil.tag
    assert len(res_cons.fields[1].fields) == 0

    res_ref = id_func(ref_value)
    tvm.testing.assert_allclose(res_ref.value.asnumpy(), ref_value.value.asnumpy())

    res_tuple = id_func(tuple_value)
    for i in range(10):
        tvm.testing.assert_allclose(res_tuple[i].asnumpy(), tuple_value[i].asnumpy())
def run_through_rpc(measure_input, build_result,
                    number, repeat, min_repeat_ms, cooldown_interval,
                    remote_args, ref_input=None, ref_output=None):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if hasattr(measure_input.target, 'device_name') and \
                measure_input.target.device_name == 'vta':
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime
            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)
        time_f = func.time_evaluator(func.entry_name, ctx, number=number,
                                     repeat=repeat, min_repeat_ms=min_repeat_ms)

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            # Create empty arrays on the remote device and copy them once.
            # This can avoid some memory issues that make the measurement results unreliable.
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            args = [nd.array(x, ctx=ctx) for x in args]
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
        remote.remove('')

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[:msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[:msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def extract_task_from_relay(
    mod: IRModule,
    target: Target,
    params: Optional[Dict[str, NDArray]] = None,
    *,
    opt_level: int = 3,
    pass_config: Optional[Dict[str, Any]] = None,
    disabled_pass: Optional[List[str]] = None,
) -> List[ExtractedTask]:
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod : IRModule
        The module or function to tune
    target : tvm.target.Target
        The compilation target
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    opt_level : int
        The optimization level of the compiler
    pass_config : Optional[Dict[str, Any]]
        The pass config of the compiler
    disabled_pass : Optional[List[str]]
        The list of disabled passes of the compiler

    Returns
    -------
    tasks: List[ExtractedTask]
        The tasks extracted from this network
    """
    # pylint: disable=import-outside-toplevel
    from tvm.relay import Function as RelayFunc

    # pylint: enable=import-outside-toplevel
    extract_task_func = get_global_func(
        "relay.backend.MetaScheduleExtractTask",
        allow_missing=False,
    )
    if isinstance(mod, RelayFunc):
        mod = IRModule.from_expr(mod)
    if not isinstance(target, Target):
        target = Target(target)
    if disabled_pass is None:
        disabled_pass = []
    if pass_config is None:
        pass_config = {"relay.backend.use_meta_schedule": True}
    if params is None:
        params = {}
    relay_params = {}
    for name, param in params.items():
        if isinstance(param, np.ndarray):
            param = nd.array(param)
        relay_params[name] = param

    with autotvm_silencer(), target, transform.PassContext(
        opt_level=opt_level,
        config=pass_config,
        disabled_pass=disabled_pass,
    ):
        return list(extract_task_func(mod, target, relay_params))
def extract_task_from_relay(
    mod: IRModule,
    target: Target,
    params: Optional[Dict[str, NDArray]] = None,
    *,
    opt_level: int = 3,
    pass_config: Optional[Dict[str, Any]] = None,
    disabled_pass: Optional[List[str]] = None,
    te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None,
) -> List[ExtractedTask]:
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod : IRModule
        The module or function to tune
    target : tvm.target.Target
        The compilation target
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    opt_level : int
        The optimization level of the compiler
    pass_config : Optional[Dict[str, Any]]
        The pass config of the compiler
    disabled_pass : Optional[List[str]]
        The list of disabled passes of the compiler
    te_filter_func : Callable[[List[tvm.te.Tensor]], bool]
        The filter function to filter out the extracted tasks.
        If it's a string, it's the name of the filtering function. Built-in functions are
          - "meta_schedule.DefaultTaskFilter"
          - "meta_schedule.DefaultTaskFilterAllowExtern"
        If it's None, it's the default filtering function.
        If it's a callable, it's the filtering function.

    Returns
    -------
    tasks: List[ExtractedTask]
        The tasks extracted from this network
    """
    # pylint: disable=import-outside-toplevel
    from tvm import autotvm
    from tvm.relay import Function as RelayFunc

    # pylint: enable=import-outside-toplevel
    if isinstance(te_filter_func, str):
        te_filter_func = get_global_func(te_filter_func)
    extract_task_func = get_global_func(
        "relay.backend.MetaScheduleExtractTask",
        allow_missing=False,
    )
    if isinstance(mod, RelayFunc):
        mod = IRModule.from_expr(mod)
    if not isinstance(target, Target):
        target = Target(target)
    if disabled_pass is None:
        disabled_pass = []
    if pass_config is None:
        pass_config = {"relay.backend.use_meta_schedule": True}
    if params is None:
        params = {}
    relay_params = {}
    for name, param in params.items():
        if isinstance(param, np.ndarray):
            param = nd.array(param)
        relay_params[name] = param

    with target, autotvm_silencer(), transform.PassContext(
        opt_level=opt_level,
        config=pass_config,
        disabled_pass=disabled_pass,
    ):
        if target.kind.name != "cuda" and isinstance(
            autotvm.DispatchContext.current, autotvm.FallbackContext
        ):
            tophub_context = autotvm.tophub.context(target)
        else:
            tophub_context = autotvm.utils.EmptyContext()
        with tophub_context:
            return list(extract_task_func(mod, target, relay_params, te_filter_func))
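# Usage sketch (not from the original corpus) showing how extract_task_from_relay might be
# called on a small Relay workload. The import path and the ExtractedTask attribute name
# `task_name` are assumptions and may differ between TVM versions.
import tvm
import tvm.relay.testing
from tvm.meta_schedule.integration import extract_task_from_relay  # assumed module path

relay_mod, relay_params = tvm.relay.testing.mlp.get_workload(batch_size=1)
extracted = extract_task_from_relay(relay_mod, target="llvm", params=relay_params)
for task in extracted:
    # Each extracted task wraps one fused Relay subgraph to be tuned independently.
    print(task.task_name)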
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_args,
    ref_input=None,
    ref_output=None,
    enable_cpu_cache_flush=False,
):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to its
        actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This only has effect on CPU tasks.
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if (
            hasattr(measure_input.target, "device_name")
            and measure_input.target.device_name == "vta"
        ):
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime

            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)

        # Limitation:
        # We can not get PackFunction directly in the remote mode as it is wrapped
        # under the std::function. We could lift the restriction later once we fold
        # the PackedFunc as an object. Currently, we pass function name to work
        # around it.
        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
        time_f = func.time_evaluator(
            func.entry_name,
            ctx,
            number=number,
            repeat=repeat,
            min_repeat_ms=min_repeat_ms,
            f_preproc=f_prepare,
        )

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            try:
                random_fill = remote.get_function("tvm.contrib.random.random_fill")
            except AttributeError:
                raise AttributeError(
                    "Please make sure USE_RANDOM is ON in the config.cmake "
                    "on the remote devices"
                )
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            for arg in args:
                random_fill(arg)
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
        remote.remove("")

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[: msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[: msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_kwargs,
    ref_input,
    enable_cpu_cache_flush=False,
    module_loader=None,
):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameter `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., when the run time of one `repeat` falls below this time,
        the `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_kwargs: dict
        Passed to module_loader(). Ultimately, keyword args to request_remote().
    ref_input: List of np.ndarray
        The reference input used for tuning. Empty for randomly filled input.
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to its
        actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This only has effect on CPU tasks.
    module_loader: ModuleLoader
        A function that returns a ContextManager used to establish and teardown the remote
        session.
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        with module_loader(remote_kwargs, build_result) as (remote, mod):
            dev = remote.device(str(measure_input.target), 0)

            # Limitation:
            # We can not get PackFunction directly in the remote mode as it is wrapped
            # under the std::function. We could lift the restriction later once we fold
            # the PackedFunc as an object. Currently, we pass function name to work
            # around it.
            f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
            time_f = mod.time_evaluator(
                mod.entry_name,
                dev,
                number=number,
                repeat=repeat,
                min_repeat_ms=min_repeat_ms,
                f_preproc=f_prepare,
            )

            if ref_input:
                args = [nd.array(x, device=dev) for x in ref_input]
            else:
                try:
                    random_fill = remote.get_function("tvm.contrib.random.random_fill")
                except AttributeError:
                    raise AttributeError(
                        "Please make sure USE_RANDOM is ON in the config.cmake "
                        "on the remote devices")
                args = [nd.empty(x[0], x[1], dev) for x in build_result.arg_info]
                if "scatter" not in measure_input.task.name:
                    # the index tensor of scatter op cannot be randomly initialized
                    for arg in args:
                        random_fill(arg)
                dev.sync()

            costs = time_f(*args).results

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[:msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[:msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)