def profile_and_build(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition ``mod`` for CUTLASS, tune and build it, and load it on CUDA device 0.

    The module is passed through ``partition_for_cutlass`` and
    ``tune_cutlass_kernels``, compiled with ``relay.build`` for the ``"cuda"``
    target at ``opt_level=3``, post-processed by ``build_cutlass_kernels`` and
    finally wrapped in a graph executor.

    Parameters
    ----------
    mod
        Module to offload; handed to ``partition_for_cutlass``.
    params
        Forwarded to ``relay.build`` as ``params``.
    sm
        Forwarded to ``tune_cutlass_kernels`` and ``build_cutlass_kernels``
        (presumably the CUDA compute capability, e.g. 80 -- confirm against
        those helpers).
    split_k_slices
        Forwarded to ``tune_cutlass_kernels``. ``None`` (the default) means
        ``[1]``.
    tmp_dir
        Forwarded to ``tune_cutlass_kernels`` and ``build_cutlass_kernels``.
    lib_path
        Forwarded to ``build_cutlass_kernels``.
    use_fast_math
        Forwarded to ``build_cutlass_kernels``.
    use_3xtf32
        Forwarded to ``tune_cutlass_kernels``.

    Returns
    -------
    tuple
        ``(rt_mod, dev, num_cutlass_partition)``: the graph executor module,
        the device it was created on, and the second value returned by
        ``tune_cutlass_kernels``.
    """
    if split_k_slices is None:
        # Build a fresh list on every call: a list literal in the signature is
        # created once and shared, so any mutation downstream would leak into
        # later calls.
        split_k_slices = [1]

    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        use_3xtf32=use_3xtf32,
        split_k_slices=split_k_slices,
        profile_all_alignments=False,
        find_first_valid=True,
        use_multiprocessing=True,
        tmp_dir=tmp_dir,
    )

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="cuda", params=params)

    # Replaces the freshly built library with the result of the CUTLASS build
    # step, which also receives tmp_dir and lib_path.
    lib = build_cutlass_kernels(lib, sm, tmp_dir, lib_path, use_fast_math=use_fast_math)

    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition
def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so"):
    """Offload ``mod`` to CUTLASS, compile it for CUDA and return a runnable module.

    Steps, in order: ``partition_for_cutlass`` -> ``tune_cutlass_kernels``
    (``profile_all=False``, ``use_multiprocessing=False``) -> ``relay.build``
    for target ``"cuda"`` under ``PassContext(opt_level=3)`` ->
    ``build_cutlass_kernels`` -> graph executor on ``tvm.device("cuda", 0)``.

    Parameters
    ----------
    mod
        Module passed to ``partition_for_cutlass``.
    params
        Passed to ``relay.build`` as ``params``.
    sm
        Passed to both ``tune_cutlass_kernels`` and ``build_cutlass_kernels``
        (presumably the GPU compute capability -- confirm).
    tmp_dir
        Passed to both ``tune_cutlass_kernels`` and ``build_cutlass_kernels``.
    lib_path
        Passed to ``build_cutlass_kernels``.

    Returns
    -------
    tuple
        ``(rt_mod, dev, num_cutlass_partition)``.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, num_cutlass_partition = tune_cutlass_kernels(
        partitioned,
        sm,
        profile_all=False,
        use_multiprocessing=False,
        tmp_dir=tmp_dir,
    )

    build_ctx = tvm.transform.PassContext(opt_level=3)
    with build_ctx:
        relay_lib = relay.build(tuned, target="cuda", params=params)

    cutlass_lib = build_cutlass_kernels(relay_lib, sm, tmp_dir, lib_path)

    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(cutlass_lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition