Beispiel #1
0
def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=[1],
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(vm_exec,
                                       sm,
                                       tmp_dir,
                                       lib_path,
                                       vmcode_path,
                                       use_fast_math=use_fast_math)
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
Beispiel #2
0
def profile_and_build(
    mod,
    params,
    sm,
    split_k_slices=[1],
    tmp_dir="./tmp",
    lib_path="compile.so",
    use_fast_math=False,
    use_3xtf32=True,
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        use_3xtf32=use_3xtf32,
        split_k_slices=split_k_slices,
        profile_all_alignments=False,
        find_first_valid=True,
        use_multiprocessing=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="cuda", params=params)
    lib = build_cutlass_kernels(lib,
                                sm,
                                tmp_dir,
                                lib_path,
                                use_fast_math=use_fast_math)
    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition
Beispiel #3
0
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm, tmp_dir=tmp_dir)
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(vm_exec, sm, tmp_dir, lib_path, vmcode_path)
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
Beispiel #4
0
def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so"):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod, sm, profile_all=False, use_multiprocessing=False, tmp_dir=tmp_dir)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="cuda", params=params)
    lib = build_cutlass_kernels(lib, sm, tmp_dir, lib_path)
    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition
    return lib


###########################################################
# Run the two subgraphs in pipeline with pipeline executor.
# ---------------------------------------------------------
# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build

#########################################
# Create subgraph pipeline configuration.
# Associate a subgraph module with a target.
# Use CUTLASS BYOC to build the second subgraph module.
mod0, mod1 = subgraphs[0], subgraphs[1]
# Use cutlass as the codegen.
mod1 = partition_for_cutlass(mod1)
#################################################
# Get the pipeline executor configuration object.
pipe_config = pipeline_executor_build.PipelineConfig()
###########################################################################
# Set the compile target of the subgraph module.
pipe_config[mod0].target = "llvm"
pipe_config[mod0].dev = tvm.cpu(0)
##############################################################
# Set the compile target of the second subgraph module as cuda.
pipe_config[mod1].target = "cuda"
pipe_config[mod1].dev = tvm.device("cuda", 0)
pipe_config[mod1].build_func = cutlass_build
pipe_config[mod1].export_cc = "nvcc"
# Create the pipeline by connecting the subgraph modules.
# The global input will be forwarded to the input interface of the first module named mod0