# Example 1
def schedule_softmax(outs):
    """Schedule for softmax op.

    Parameters
    ----------
    outs: Array of Tensor
          The computation graph description of softmax in the format
          of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.

    Raises
    ------
    ValueError
        If the tag of the first output op is neither ``softmax_output``
        nor ``log_softmax_output``.
    """
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])
    softmax = outs[0]
    tgt = Target.current(allow_none=False)

    # Locate the intermediate stages; their position among the input tensors
    # depends on which of the two supported ops produced the output.
    op_tag = softmax.op.tag
    if op_tag == "softmax_output":
        expsum = softmax.op.input_tensors[1]
        exp = softmax.op.input_tensors[0]
        max_elem = s[exp].op.input_tensors[1]
    elif op_tag == "log_softmax_output":
        exp = None
        max_elem = softmax.op.input_tensors[1]
        expsum = softmax.op.input_tensors[2]
    else:
        raise ValueError(
            "Tag is expected to be softmax_output or log_softmax_output. \
                         Got {0}".format(
                op_tag
            )
        )

    # The nvptx and rocm backends only supports 32-bits warp shuffle
    # instructions.
    # TODO(tvm-team) Fix nvptx codegen or deprecate nvptx backend.
    def sched_warp_softmax():
        if tgt.kind.name == "nvptx" or tgt.kind.name == "rocm":
            return softmax.dtype == "float32" or softmax.dtype == "int32"
        if tgt.kind.name != "cuda":
            # this is used as the gpu schedule for other arches which may not have warp reductions
            return False
        return True

    if len(softmax.shape) > 2:
        # More than two dimensions: fall back to an injective schedule for
        # every stage that exists.
        ops = [max_elem.op, expsum.op, softmax.op]
        if exp is not None:
            ops.append(exp.op)

        for op in ops:
            s = schedule_injective_from_existing(s, op.output(0))

    elif sched_warp_softmax():
        # A warp of 32 threads performs a row reduction.
        num_thread = tgt.thread_warp_size
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        # (4) softmax
        xo, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread)
        _, xii = s[softmax].split(xi, factor=4)
        s[softmax].vectorize(xii)
        s[softmax].bind(xo, thread_x)
        s[softmax].bind(softmax.op.axis[0], block_x)

        # (3) expsum
        k = expsum.op.reduce_axis[0]
        ko, _ = s[expsum].split(k, nparts=num_thread)
        s[expsum].bind(ko, thread_x)
        s[expsum].compute_at(s[softmax], xo)

        # (2) exp
        if exp is not None:
            xo, xi = s[exp].split(exp.op.axis[1], nparts=num_thread)
            _, xii = s[exp].split(xi, factor=4)
            s[exp].vectorize(xii)
            s[exp].bind(xo, thread_x)
            s[exp].compute_at(s[expsum], expsum.op.axis[0])
            s[exp].compute_at(s[softmax], softmax.op.axis[0])
            # NOTE(review): restored from the upstream schedule; this line was
            # not present in the damaged text - confirm against upstream.
            s[exp].set_scope("warp")

        # (1) max_elem
        k = max_elem.op.reduce_axis[0]
        ko, _ = s[max_elem].split(k, nparts=num_thread)
        s[max_elem].bind(ko, thread_x)
        if exp is not None:
            s[max_elem].compute_at(s[exp], xo)
        else:
            s[max_elem].bind(ko, thread_x)
            s[max_elem].bind(max_elem.op.axis[0], block_x)

    else:
        # Generic GPU fallback: one block per row, 64 threads per block.
        num_thread = 64
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        if exp is not None:
            s[exp].bind(exp.op.axis[0], block_x)

        s[max_elem].bind(max_elem.op.axis[0], block_x)
        k = expsum.op.reduce_axis[0]
        ko, ki = s[expsum].split(k, factor=num_thread)
        EF = s.rfactor(expsum, ki)
        s[expsum].bind(s[expsum].op.axis[0], block_x)
        s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x)
        s[EF].compute_at(s[expsum], s[expsum].op.reduce_axis[0])
        # NOTE(review): restored from the upstream schedule; this line was
        # not present in the damaged text - confirm against upstream.
        s[expsum].set_store_predicate(thread_x.var.equal(0))
        tx, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread)
        s[softmax].bind(softmax.op.axis[0], block_x)
        s[softmax].bind(tx, thread_x)

    return s
# Example 2
def compile_cuda(code, target_format="ptx", arch=None, options=None, path_target=None):
    """Compile cuda code with NVCC from env.

    Parameters
    ----------
    code : str
        The cuda code.

    target_format : str
        The target format of nvcc compiler.

    arch : str
        The cuda architecture.

    options : str or list of str
        The additional options.

    path_target : str, optional
        Output file.

    Return
    ------
    cubin : bytearray
        The bytearray of the cubin

    Raises
    ------
    ValueError
        If ``target_format`` is not one of cubin, ptx, fatbin, or if
        ``options`` is neither a str nor a list.
    RuntimeError
        If nvcc exits with a non-zero status or produces an empty output file.
    """
    if arch is None:
        # If None, then it will use `tvm.target.Target.current().arch`.
        # Target arch could be a str like "sm_xx", or a list, such as
        # [
        #   "-gencode", "arch=compute_52,code=sm_52",
        #   "-gencode", "arch=compute_70,code=sm_70"
        # ]
        # NOTE(review): both expressions below were truncated in the damaged
        # text and are restored from upstream - confirm against upstream.
        compute_version = "".join(
            get_target_compute_version(Target.current(allow_none=True)).split(".")
        )
        arch = ["-gencode", f"arch=compute_{compute_version},code=sm_{compute_version}"]

    temp = utils.tempdir()
    if target_format not in ["cubin", "ptx", "fatbin"]:
        raise ValueError("target_format must be in cubin, ptx, fatbin")
    temp_code = temp.relpath("my_kernel.cu")
    temp_target = temp.relpath("my_kernel.%s" % target_format)

    with open(temp_code, "w") as out_file:
        out_file.write(code)

    file_target = path_target if path_target else temp_target
    cmd = ["nvcc"]
    cmd += ["--%s" % target_format, "-O3"]
    if isinstance(arch, list):
        cmd += arch
    elif isinstance(arch, str):
        cmd += ["-arch", arch]

    if options:
        if isinstance(options, str):
            cmd += [options]
        elif isinstance(options, list):
            cmd += options
        else:
            raise ValueError("options must be str or list of str")

    cmd += ["-o", file_target]
    cmd += [temp_code]

    # NOTE: ccbin option can be used to tell nvcc where to find the c++ compiler
    # just in case it is not in the path. On Windows it is not in the path by default.
    # However, we cannot use TVM_CXX_COMPILER_PATH because the runtime env.
    # Because it is hard to do runtime compiler detection, we require nvcc is configured
    # correctly by default.
    # if cxx_compiler_path != "":
    #    cmd += ["-ccbin", cxx_compiler_path]

    # stderr is folded into stdout so the error message carries the full nvcc log.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    (out, _) = proc.communicate()

    if proc.returncode != 0:
        msg = code
        msg += "\nCompilation error:\n"
        msg += py_str(out)
        raise RuntimeError(msg)

    # Read through a context manager so the file handle is always closed.
    with open(file_target, "rb") as target_file:
        data = bytearray(target_file.read())
    if not data:
        raise RuntimeError("Compilation error: empty result is generated")
    return data
# Example 3
def build(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    target: Optional[Union[str, Target]] = None,
    target_host: Optional[Union[str, Target]] = None,
    name: Optional[str] = "default_function",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
):
    """Build a function with arguments as signature. Code will be generated
    for devices coupled with target information.

    Parameters
    ----------
    inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]]
        The input to be built

    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]]
        The argument lists to the function.

    target : Optional[Union[str, Target]]
        The target and option of the compilation.

    target_host : Optional[Union[str, Target]]
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    name : Optional[str]
        The name of result function.

    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
        Dictionary that maps the binding of symbolic buffer to Tensor.
        By default, a new buffer is created for each tensor in the argument.

    Returns
    -------
    ret : tvm.module
        A module that combines both host and device code.

    Raises
    ------
    ValueError
        If ``inputs`` is a schedule and ``args`` is None, if ``inputs`` has an
        unsupported type, or if a target/module pair has the wrong types.

    Examples
    --------
    There are two typical example uses of this function depending on the type
    of the argument `inputs`:
    1. it is an IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s = tvm.te.create_schedule(C.op)
        m = tvm.lower(s, [A, B, C], name="test_add")
        rt_mod = tvm.build(m, target="llvm")

    2. it is a dict of compilation target to IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s1 = tvm.te.create_schedule(C.op)
        with tvm.target.cuda() as cuda_tgt:
          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
          rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm")

    Note
    ----
    See the note on :any:`tvm.target` on target string format.
    """
    # Normalize every accepted input form into an IRModule (dict inputs are
    # already target -> IRModule and are handled below).
    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for build from schedule")
        input_mod = lower(inputs, args, name=name, binds=binds)
    elif isinstance(inputs, (list, tuple, container.Array)):
        merged_mod = tvm.IRModule({})
        for x in inputs:
            merged_mod.update(lower(x))
        input_mod = merged_mod
    elif isinstance(inputs, (tvm.IRModule, PrimFunc)):
        input_mod = lower(inputs)
    elif not isinstance(inputs, (dict, container.Map)):
        raise ValueError(
            f"Inputs must be Schedule, IRModule or dict of target to IRModule, "
            f"but got {type(inputs)}.")

    if not isinstance(inputs, (dict, container.Map)):
        target = Target.current() if target is None else target
        target = target if target else "llvm"
        target_input_mod = {target: input_mod}
    else:
        target_input_mod = inputs

    for tar, mod in target_input_mod.items():
        if not isinstance(tar, (str, Target)):
            raise ValueError("The key of inputs must be str or "
                             "Target when inputs is dict.")
        if not isinstance(mod, tvm.IRModule):
            raise ValueError("inputs must be Schedule, IRModule,"
                             "or dict of str to IRModule.")

    target_input_mod, target_host = Target.check_and_update_host_consist(
        target_input_mod, target_host)

    # Without an explicit host, use the first target that maps to the CPU
    # device type; otherwise fall back to llvm or stackvm.
    if not target_host:
        for tar, mod in target_input_mod.items():
            tar = Target(tar)
            device_type = ndarray.device(tar.kind.name, 0).device_type
            if device_type == ndarray.cpu(0).device_type:
                target_host = tar
                # NOTE(review): `break` restored from upstream; it is absent
                # from the damaged text - confirm against upstream.
                break
    if not target_host:
        target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm"

    target_input_mod, target_host = Target.check_and_update_host_consist(
        target_input_mod, target_host)

    mod_host_all = tvm.IRModule({})

    device_modules = []
    for tar, input_mod in target_input_mod.items():
        mod_host, mdev = _build_for_device(input_mod, tar, target_host)
        mod_host_all.update(mod_host)
        device_modules.append(mdev)

    # Generate a unified host module.
    rt_mod_host = codegen.build_module(mod_host_all, target_host)

    # Import all modules.
    for mdev in device_modules:
        if mdev:
            rt_mod_host.import_module(mdev)

    if not isinstance(target_host, Target):
        target_host = Target(target_host)
    if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c"
            and target_host.attrs.get("system-lib", 0) == 1):
        # NOTE(review): the global function names below were lost in the
        # damaged text and are restored from upstream - confirm them.
        # NOTE(review): to_return stays unbound when the host kind is neither
        # "c" nor "llvm" in this branch - confirm that cannot happen.
        if target_host.kind.name == "c":
            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateCSourceCrtMetadataModule")
            to_return = create_csource_crt_metadata_module([rt_mod_host],
                                                           target_host)

        elif target_host.kind.name == "llvm":
            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
                "runtime.CreateLLVMCrtMetadataModule")
            to_return = create_llvm_crt_metadata_module([rt_mod_host],
                                                        target_host)
    else:
        to_return = rt_mod_host

    return OperatorModule.from_module(to_return,
                                      ir_module_by_target=target_input_mod,
                                      name=name)
# Example 4
def build(inputs,
          args=None,
          target=None,
          target_host=None,
          name="default_function",
          binds=None):
    """Build a function with arguments as signature. Code will be generated
    for devices coupled with target information.

    Parameters
    ----------
    inputs : tvm.te.Schedule, IRModule, or dict of target to IRModule
        The schedule to be built

    args : list of Buffer or Tensor or Var, optional
        The argument lists to the function.

    target : str or :any:`tvm.target.Target`, optional
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target` optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    name : str, optional
        The name of result function.

    binds : dict, optional
        Dictionary that maps the binding of symbolic buffer to Tensor.
        By default, a new buffer is created for each tensor in the argument.

    Returns
    -------
    ret : tvm.module
        A module that combines both host and device code.

    Raises
    ------
    ValueError
        If ``inputs`` is a schedule and ``args`` is None, if ``inputs`` has an
        unsupported type, or if a target/module pair has the wrong types.

    Examples
    --------
    There are two typical example uses of this function depending on the type
    of the argument `inputs`:
    1. it is an IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s = tvm.te.create_schedule(C.op)
        m = tvm.lower(s, [A, B, C], name="test_add")
        rt_mod = tvm.build(m, target="llvm")

    2. it is a dict of compilation target to IRModule.

    .. code-block:: python

        n = 2
        A = te.placeholder((n,), name='A')
        B = te.placeholder((n,), name='B')
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s1 = tvm.te.create_schedule(C.op)
        with tvm.target.cuda() as cuda_tgt:
          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
          rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm")

    Note
    ----
    See the note on :any:`tvm.target` on target string format.
    """
    # Normalize every accepted input form into an IRModule (dict inputs are
    # already target -> IRModule and are handled below).
    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for build from schedule")
        input_mod = lower(inputs, args, name=name, binds=binds)
    elif isinstance(inputs, (list, tuple, container.Array)):
        merged_mod = tvm.IRModule({})
        for x in inputs:
            # NOTE(review): loop body was lost in the damaged text; restored
            # from upstream - confirm against upstream.
            merged_mod.update(x)
        input_mod = merged_mod
    elif isinstance(inputs, tvm.IRModule):
        input_mod = inputs
    elif not isinstance(inputs, (dict, container.Map)):
        raise ValueError(
            "inputs must be Schedule, IRModule or dict of target to IRModule")

    if not isinstance(inputs, (dict, container.Map)):
        target = Target.current() if target is None else target
        target = target if target else "llvm"
        target_input_mod = {target: input_mod}
    else:
        target_input_mod = inputs

    for tar, mod in target_input_mod.items():
        if not isinstance(tar, (str, Target)):
            raise ValueError("The key of inputs must be str or "
                             "Target when inputs is dict.")
        if not isinstance(mod, tvm.IRModule):
            raise ValueError("inputs must be Schedule, IRModule,"
                             "or dict of str to IRModule.")

    # Without an explicit host, use the first target that maps to the CPU
    # device type; otherwise fall back to llvm or stackvm.
    if not target_host:
        for tar, _ in target_input_mod.items():
            tar = Target(tar)
            device_type = ndarray.context(tar.kind.name, 0).device_type
            if device_type == ndarray.cpu(0).device_type:
                target_host = tar
                # NOTE(review): `break` restored from upstream; it is absent
                # from the damaged text - confirm against upstream.
                break
    if not target_host:
        target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm"

    mod_host_all = tvm.IRModule({})

    device_modules = []
    for tar, input_mod in target_input_mod.items():
        mod_host, mdev = _build_for_device(input_mod, tar, target_host)
        mod_host_all.update(mod_host)
        device_modules.append(mdev)

    # Generate a unified host module.
    rt_mod_host = codegen.build_module(mod_host_all, target_host)

    # Import all modules.
    for mdev in device_modules:
        if mdev:
            rt_mod_host.import_module(mdev)
    return rt_mod_host
# Example 5
# File: (name lost in extraction) - Project: a1nc/tvm
    def _get_node_default_compute_dtype():
        """Choose the default compute dtype from the currently active target.

        Returns "float32" when a target is active, its kind stringifies to
        "llvm", and ``target_has_sse41`` reports true for its ``mcpu``;
        returns "int64" in every other case, including when no target is set.
        """
        active = Target.current(True)
        # Short-circuit order matters: `active` may be None, so it is checked
        # before any attribute access.
        llvm_with_sse41 = (
            active
            and str(active.kind) == "llvm"
            and target_has_sse41(active.mcpu)
        )
        return "float32" if llvm_with_sse41 else "int64"