def warmup_estimate(ins, outs):
    """Run ``func`` twice on wrapped tensors and time the second run.

    Every element of ``ins`` and ``outs`` is converted with
    ``runtime.array(x, ctx)``. ``func`` is then called once without being
    timed and a second time with wall-clock timing; ``sync()`` is called on
    ``runtime.gpu(visible_dev_id)`` after each call, so the measured
    interval includes that sync.

    Parameters
    ----------
    ins : iterable
        Input values, each passed to ``runtime.array``.

    outs : iterable
        Output values, each passed to ``runtime.array``.

    Returns
    -------
    result : tuple
        ``(ins, outs, tensors, t_diff)``: the converted inputs, the
        converted outputs, their concatenation (inputs first), and the
        elapsed ``time.time()`` seconds of the second call.
    """

    def _wrap(values):
        return [runtime.array(value, ctx) for value in values]

    def _invoke_and_sync(args):
        func(*args)
        runtime.gpu(visible_dev_id).sync()

    wrapped_ins = _wrap(ins)
    wrapped_outs = _wrap(outs)
    all_tensors = wrapped_ins + wrapped_outs

    # First invocation is deliberately left out of the measurement.
    _invoke_and_sync(all_tensors)

    started_at = time.time()
    _invoke_and_sync(all_tensors)
    elapsed = time.time() - started_at

    return wrapped_ins, wrapped_outs, all_tensors, elapsed
def compile_cuda(code, target="ptx", arch=None, options=None, path_target=None):
    """Compile cuda code with NVCC from env.

    Parameters
    ----------
    code : str
        The cuda code.

    target : str
        The target format; one of "cubin", "ptx" or "fatbin".

    arch : str or list of str, optional
        The architecture. A string is passed to nvcc as ``-arch <arch>``;
        a list is appended to the nvcc command line unchanged. When
        omitted, ``sm_<xy>`` is derived from the compute version reported
        by ``nd.gpu(0)``.

    options : str or list of str, optional
        The additional options.

    path_target : str, optional
        Output file. When omitted, the output goes to a path obtained
        from ``util.tempdir()``.

    Returns
    -------
    data : bytearray
        The contents of the output file written by nvcc.

    Raises
    ------
    ValueError
        If ``target`` is not supported, if ``arch`` is not given and
        cannot be detected, or if ``options`` is neither a str nor a list.

    RuntimeError
        If nvcc exits with a non-zero status or the output file is empty.
    """
    # Check the target first so invalid input fails before any temporary
    # directory or source file is created.
    if target not in ["cubin", "ptx", "fatbin"]:
        raise ValueError("target must be in cubin, ptx, fatbin")

    temp = util.tempdir()
    temp_code = temp.relpath("my_kernel.cu")
    temp_target = temp.relpath("my_kernel.%s" % target)

    with open(temp_code, "w") as out_file:
        out_file.write(code)

    if arch is None:
        if nd.gpu(0).exist:
            # auto detect the compute arch argument
            arch = "sm_" + "".join(nd.gpu(0).compute_version.split('.'))
        else:
            raise ValueError(
                "arch(sm_xy) is not passed, and we cannot detect it from env")

    file_target = path_target if path_target else temp_target

    cmd = ["nvcc"]
    cmd += ["--%s" % target, "-O3"]
    if isinstance(arch, list):
        cmd += arch
    else:
        cmd += ["-arch", arch]

    if options:
        if isinstance(options, str):
            cmd += [options]
        elif isinstance(options, list):
            cmd += options
        else:
            raise ValueError("options must be str or list of str")

    cmd += ["-o", file_target]
    cmd += [temp_code]

    # stderr is merged into stdout so the raised error carries the complete
    # compiler output.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    (out, _) = proc.communicate()

    if proc.returncode != 0:
        msg = code
        msg += "\nCompilation error:\n"
        msg += py_str(out)
        raise RuntimeError(msg)

    # Read through a context manager so the file handle is always closed.
    with open(file_target, "rb") as result_file:
        data = bytearray(result_file.read())
    if not data:
        raise RuntimeError("Compilation error: empty result is generated")
    return data
def compile_cuda(code, target="ptx", arch=None, options=None, path_target=None):
    """Compile cuda code with NVCC from env.

    Parameters
    ----------
    code : str
        The cuda code.

    target : str
        The target format; one of "cubin", "ptx" or "fatbin".

    arch : str or list of str, optional
        The architecture. A string is passed to nvcc as ``-arch <arch>``;
        a list is appended to the nvcc command line unchanged. When
        omitted, ``sm_<xy>`` is derived from the compute version reported
        by ``nd.gpu(0)``.

    options : str or list of str, optional
        The additional options.

    path_target : str, optional
        Output file. When omitted, the output goes to a path obtained
        from ``utils.tempdir()``.

    Returns
    -------
    data : bytearray
        The contents of the output file written by nvcc.

    Raises
    ------
    ValueError
        If ``target`` is not supported, if ``arch`` is not given and
        cannot be detected, or if ``options`` is neither a str nor a list.

    RuntimeError
        If nvcc exits with a non-zero status or the output file is empty.
    """
    # Check the target first so invalid input fails before any temporary
    # directory or source file is created.
    if target not in ["cubin", "ptx", "fatbin"]:
        raise ValueError("target must be in cubin, ptx, fatbin")

    temp = utils.tempdir()
    temp_code = temp.relpath("my_kernel.cu")
    temp_target = temp.relpath("my_kernel.%s" % target)

    with open(temp_code, "w") as out_file:
        out_file.write(code)

    if arch is None:
        if nd.gpu(0).exist:
            # auto detect the compute arch argument
            arch = "sm_" + "".join(nd.gpu(0).compute_version.split("."))
        else:
            raise ValueError("arch(sm_xy) is not passed, and we cannot detect it from env")

    file_target = path_target if path_target else temp_target

    cmd = ["nvcc"]
    cmd += ["--%s" % target, "-O3"]
    if isinstance(arch, list):
        cmd += arch
    else:
        cmd += ["-arch", arch]

    if options:
        if isinstance(options, str):
            cmd += [options]
        elif isinstance(options, list):
            cmd += options
        else:
            raise ValueError("options must be str or list of str")

    cmd += ["-o", file_target]
    cmd += [temp_code]

    # NOTE: ccbin option can be used to tell nvcc where to find the c++ compiler
    # just in case it is not in the path. On Windows it is not in the path by default.
    # However, we cannot use TVM_CXX_COMPILER_PATH because of the runtime env.
    # Because it is hard to do runtime compiler detection, we require nvcc is configured
    # correctly by default.
    # if cxx_compiler_path != "":
    #    cmd += ["-ccbin", cxx_compiler_path]

    # stderr is merged into stdout so the raised error carries the complete
    # compiler output.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    (out, _) = proc.communicate()

    if proc.returncode != 0:
        msg = code
        msg += "\nCompilation error:\n"
        msg += py_str(out)
        raise RuntimeError(msg)

    # Read through a context manager so the file handle is always closed.
    with open(file_target, "rb") as result_file:
        data = bytearray(result_file.read())
    if not data:
        raise RuntimeError("Compilation error: empty result is generated")
    return data