def compile_with_cache(source, options=(), arch=None, cache_dir=None):
    """Compile a CUDA ``source`` string with NVRTC, caching the cubin on disk.

    The cache key is an MD5 of the compilation environment (arch, options,
    NVRTC version) combined with the (pre)processed source.  Cache files hold
    a 32-char MD5 hex digest followed by the cubin payload so that corrupted
    files can be detected and ignored.

    Args:
        source (str): CUDA source code.
        options (tuple of str): Extra compiler options; ``-ftz=true`` is
            always appended.
        arch (str or None): Target architecture; autodetected when ``None``.
        cache_dir (str or None): Cache directory; defaults to
            ``get_cache_dir()`` when ``None``.

    Returns:
        function.Module: Module loaded with the compiled cubin.
    """
    global _empty_file_preprocess_cache
    if cache_dir is None:
        cache_dir = get_cache_dir()
    if arch is None:
        arch = _get_arch()

    options += ('-ftz=true',)

    env = (arch, options, _get_nvrtc_version())
    if '#include' in source:
        # Sources with includes must be fully preprocessed so that header
        # changes invalidate the cache key.
        pp_src = '%s %s' % (env, _preprocess(source, options))
    else:
        # No includes: preprocessing an empty file captures the compiler's
        # internal state; memoize it per environment.
        base = _empty_file_preprocess_cache.get(env, None)
        if base is None:
            base = _empty_file_preprocess_cache[env] = _preprocess('', options)
        pp_src = '%s %s %s' % (env, base, source)

    pp_src = pp_src.encode('utf-8')
    name = '%s_2.cubin' % hashlib.md5(pp_src).hexdigest()

    if not os.path.isdir(cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Another process may have created it concurrently; only
            # re-raise if the directory still does not exist.
            if not os.path.isdir(cache_dir):
                raise

    mod = function.Module()
    # To handle conflicts in concurrent situation, we adopt lock-free method
    # to avoid performance degradation.
    path = os.path.join(cache_dir, name)
    if os.path.exists(path):
        # Renamed from `file`/`hash` to avoid shadowing the builtins.
        with open(path, 'rb') as cache_file:
            data = cache_file.read()
        if len(data) >= 32:
            stored_hash = data[:32]
            cubin = data[32:]
            cubin_hash = six.b(hashlib.md5(cubin).hexdigest())
            # Only trust the cached cubin if its embedded digest matches.
            if stored_hash == cubin_hash:
                mod.load(cubin)
                return mod

    # Cache miss (or corrupted entry): compile and link from scratch.
    ptx = compile_using_nvrtc(source, options, arch)
    ls = function.LinkState()
    ls.add_ptr_data(ptx, six.u('cupy.ptx'))
    cubin = ls.complete()
    cubin_hash = six.b(hashlib.md5(cubin).hexdigest())

    # shutil.move is not atomic operation, so it could result in a corrupted
    # file. We detect it by appending md5 hash at the beginning of each cache
    # file. If the file is corrupted, it will be ignored next time it is read.
    with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
        tf.write(cubin_hash)
        tf.write(cubin)
        temp_path = tf.name
    shutil.move(temp_path, path)

    mod.load(cubin)
    return mod
def _cuda_get_module(self):
    """Return the lazily-initialized CUDA module holding the SRU kernels.

    The module is built once and memoized on the ``SRUFunction`` class;
    subsequent calls return the cached instance.  Loading differs by the
    detected CuPy major version; unsupported versions raise
    ``NotImplementedError``.
    """
    cached = SRUFunction._cuda_module
    if cached is not None:
        return cached

    mod = function.Module()
    SRUFunction._cuda_module = mod

    if cupy_version == 1:
        # CuPy v1 loads the PTX directly.
        mod.load(CUDA_SRU_PTX)
    elif cupy_version == 2:
        # CuPy v2 requires linking the PTX into a cubin first.
        link_state = function.LinkState()
        link_state.add_ptr_data(CUDA_SRU_PTX, u"cupy.ptx")
        mod.load(link_state.complete())
    else:
        raise NotImplementedError()

    return mod
def _compile_with_cache_cuda(
        source, options, arch, cache_dir, extra_source=None, backend='nvrtc',
        enable_cooperative_groups=False):
    """Compile CUDA ``source`` via NVRTC or NVCC with an on-disk cubin cache.

    Args:
        source (str): CUDA source code to compile.
        options (tuple of str): Compiler options; ``-ftz=true`` is always
            appended, plus debug/RDC flags depending on environment/flags.
        arch (str or None): Target architecture; autodetected when ``None``.
        cache_dir (str or None): Cache directory; ``get_cache_dir()`` when
            ``None``.
        extra_source: Folded into the cache key only (NVRTC does not
            compile it).
        backend (str): ``'nvrtc'`` or ``'nvcc'``; anything else raises
            ``ValueError``.
        enable_cooperative_groups (bool): Adds relocatable-device-code and
            linker-workaround flags.

    Returns:
        function.Module: Module loaded with the compiled (or cached) cubin.
    """
    # NVRTC does not use extra_source. extra_source is used for cache key.
    global _empty_file_preprocess_cache
    if cache_dir is None:
        cache_dir = get_cache_dir()
    if arch is None:
        arch = _get_arch()

    options += ('-ftz=true',)

    if enable_cooperative_groups:
        # `cooperative_groups` requires `-rdc=true`.
        # The three latter flags are to resolve linker error.
        # (https://devtalk.nvidia.com/default/topic/1023604/linker-error/)
        options += ('-rdc=true', '-Xcompiler', '-fPIC', '-shared')

    if _get_bool_env_variable('CUPY_CUDA_COMPILE_WITH_DEBUG', False):
        options += ('--device-debug', '--generate-line-info')

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is checking of NVRTC compiler internal version
        base = _preprocess('', options, arch, backend)
        _empty_file_preprocess_cache[env] = base

    # Cache key: environment + compiler fingerprint + sources.
    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = '%s_2.cubin' % hashlib.md5(key_src).hexdigest()

    if not os.path.isdir(cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Lost a creation race with another process; re-raise only if
            # the directory really is missing.
            if not os.path.isdir(cache_dir):
                raise

    mod = function.Module()
    # To handle conflicts in concurrent situation, we adopt lock-free method
    # to avoid performance degradation.
    path = os.path.join(cache_dir, name)
    if os.path.exists(path):
        with open(path, 'rb') as file:
            data = file.read()
        # Cache file layout: 32-char md5 hex digest + cubin payload.
        if len(data) >= 32:
            hash = data[:32]
            cubin = data[32:]
            cubin_hash = six.b(hashlib.md5(cubin).hexdigest())
            # Accept the cached cubin only if its digest matches (guards
            # against files truncated by a non-atomic move; see below).
            if hash == cubin_hash:
                mod.load(cubin)
                return mod

    if backend == 'nvrtc':
        ptx = compile_using_nvrtc(source, options, arch, name + '.cu')
        ls = function.LinkState()
        ls.add_ptr_data(ptx, 'cupy.ptx')
        # for separate compilation
        if _is_cudadevrt_needed(options):
            global _cudadevrt
            # cudadevrt path is resolved once and memoized module-wide.
            if _cudadevrt is None:
                _cudadevrt = _get_cudadevrt_path()
            ls.add_ptr_file(_cudadevrt)
        cubin = ls.complete()
    elif backend == 'nvcc':
        rdc = _is_cudadevrt_needed(options)
        cubin = compile_using_nvcc(source, options, arch, name + '.cu',
                                   code_type='cubin',
                                   separate_compilation=rdc)
    else:
        raise ValueError('Invalid backend %s' % backend)

    cubin_hash = six.b(hashlib.md5(cubin).hexdigest())

    # shutil.move is not atomic operation, so it could result in a corrupted
    # file. We detect it by appending md5 hash at the beginning of each cache
    # file. If the file is corrupted, it will be ignored next time it is read.
    with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
        tf.write(cubin_hash)
        tf.write(cubin)
        temp_path = tf.name
    shutil.move(temp_path, path)

    # Save .cu source file along with .cubin
    if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
        with open(path + '.cu', 'w') as f:
            f.write(source)

    mod.load(cubin)
    return mod
def _compile_with_cache_cuda(
        source, options, arch, cache_dir, extra_source=None, backend='nvrtc',
        enable_cooperative_groups=False, name_expressions=None,
        log_stream=None, cache_in_memory=False, jitify=False):
    """Compile CUDA ``source`` via NVRTC or NVCC, with optional disk caching.

    Args:
        source (str): CUDA source code to compile.
        options (tuple of str): Compiler options; ``-ftz=true`` is always
            appended, plus debug/RDC/jitify flags as applicable.
        arch (str or None): Target architecture; autodetected when ``None``.
        cache_dir (str or None): Cache directory; ``get_cache_dir()`` when
            ``None``.
        extra_source: Folded into the cache key only (NVRTC does not
            compile it).
        backend (str): ``'nvrtc'`` or ``'nvcc'``; anything else raises
            ``ValueError``.
        enable_cooperative_groups (bool): Adds ``--device-c`` (relocatable
            device code).
        name_expressions: C++ name expressions; when given, the disk cache
            is bypassed so mangled-name mapping can be recovered.
        log_stream: Passed through to the compiler for log output.
        cache_in_memory (bool): When True, skip all disk I/O (the caller is
            expected to cache the kernel elsewhere).
        jitify (bool): Use Jitify preprocessing (NVRTC only; ``ValueError``
            otherwise).

    Returns:
        function.Module: Module loaded with the compiled (or cached) cubin.
    """
    # NVRTC does not use extra_source. extra_source is used for cache key.
    global _empty_file_preprocess_cache
    if cache_dir is None:
        cache_dir = get_cache_dir()
    if arch is None:
        arch = _get_arch()

    options += ('-ftz=true',)

    if enable_cooperative_groups:
        # `cooperative_groups` requires relocatable device code.
        options += ('--device-c',)

    if _get_bool_env_variable('CUPY_CUDA_COMPILE_WITH_DEBUG', False):
        options += ('--device-debug', '--generate-line-info')

    is_jitify_requested = ('-DCUPY_USE_JITIFY' in options)
    if jitify and not is_jitify_requested:
        # jitify is set in RawKernel/RawModule, translate it to an option
        # that is useless to the compiler, but can be used as part of the
        # hash key
        options += ('-DCUPY_USE_JITIFY',)
    elif is_jitify_requested and not jitify:
        # jitify is requested internally, just set the flag
        jitify = True
    if jitify and backend != 'nvrtc':
        raise ValueError('jitify only works with NVRTC')

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is for checking NVRTC/NVCC compiler internal version
        base = _preprocess('', options, arch, backend)
        _empty_file_preprocess_cache[env] = base

    # Cache key: environment + compiler fingerprint + sources.
    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = '%s_2.cubin' % hashlib.md5(key_src).hexdigest()

    mod = function.Module()

    if not cache_in_memory:
        # Read from disk cache
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)

        # To handle conflicts in concurrent situation, we adopt lock-free
        # method to avoid performance degradation.
        # We force recompiling to retrieve C++ mangled names if so desired.
        path = os.path.join(cache_dir, name)
        if os.path.exists(path) and not name_expressions:
            with open(path, 'rb') as file:
                data = file.read()
            # Cache file layout: 32-char md5 hex digest + cubin payload.
            if len(data) >= 32:
                hash = data[:32]
                cubin = data[32:]
                cubin_hash = hashlib.md5(cubin).hexdigest().encode('ascii')
                # Accept only if the embedded digest matches (guards against
                # files truncated by a non-atomic move; see below).
                if hash == cubin_hash:
                    mod.load(cubin)
                    return mod
    else:
        # Enforce compiling -- the resulting kernel will be cached elsewhere,
        # so we do nothing
        pass

    if backend == 'nvrtc':
        cu_name = '' if cache_in_memory else name + '.cu'
        ptx, mapping = compile_using_nvrtc(
            source, options, arch, cu_name, name_expressions,
            log_stream, cache_in_memory, jitify)
        if _is_cudadevrt_needed(options):
            # for separate compilation
            ls = function.LinkState()
            ls.add_ptr_data(ptx, 'cupy.ptx')
            _cudadevrt = _get_cudadevrt_path()
            ls.add_ptr_file(_cudadevrt)
            cubin = ls.complete()
        else:
            cubin = ptx
        # Record the name-expression -> mangled-name mapping on the module.
        mod._set_mapping(mapping)
    elif backend == 'nvcc':
        rdc = _is_cudadevrt_needed(options)
        cubin = compile_using_nvcc(source, options, arch, name + '.cu',
                                   code_type='cubin',
                                   separate_compilation=rdc,
                                   log_stream=log_stream)
    else:
        raise ValueError('Invalid backend %s' % backend)

    if not cache_in_memory:
        # Write to disk cache
        cubin_hash = hashlib.md5(cubin).hexdigest().encode('ascii')

        # shutil.move is not atomic operation, so it could result in a
        # corrupted file. We detect it by appending md5 hash at the beginning
        # of each cache file. If the file is corrupted, it will be ignored
        # next time it is read.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
            tf.write(cubin_hash)
            tf.write(cubin)
            temp_path = tf.name
        shutil.move(temp_path, path)

        # Save .cu source file along with .cubin
        if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
            with open(path + '.cu', 'w') as f:
                f.write(source)
    else:
        # we don't do any disk I/O
        pass

    mod.load(cubin)
    return mod
def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       extra_source=None):
    """Compile a CUDA ``source`` string with NVRTC, caching the cubin on disk.

    The cache key is an MD5 of the compilation environment (arch, options,
    NVRTC version), the compiler fingerprint from preprocessing an empty
    file, the source, and ``extra_source``.  Cache files hold a 32-char MD5
    hex digest followed by the cubin payload so corrupted files can be
    detected and ignored.

    Args:
        source (str): CUDA source code.
        options (tuple of str): Extra compiler options; ``-ftz=true`` is
            always appended, plus debug flags when
            ``CUPY_CUDA_COMPILE_WITH_DEBUG`` is set.
        arch (str or None): Target architecture; autodetected when ``None``.
        cache_dir (str or None): Cache directory; defaults to
            ``get_cache_dir()`` when ``None``.
        extra_source: Folded into the cache key only (NVRTC does not
            compile it).

    Returns:
        function.Module: Module loaded with the compiled cubin.
    """
    # NVRTC does not use extra_source. extra_source is used for cache key.
    global _empty_file_preprocess_cache
    if cache_dir is None:
        cache_dir = get_cache_dir()
    if arch is None:
        arch = _get_arch()

    options += ('-ftz=true', )

    if _get_bool_env_variable('CUPY_CUDA_COMPILE_WITH_DEBUG', False):
        options += ('--device-debug', '--generate-line-info')

    env = (arch, options, _get_nvrtc_version())
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is checking of NVRTC compiler internal version
        base = _preprocess('', options, arch)
        _empty_file_preprocess_cache[env] = base

    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = '%s_2.cubin' % hashlib.md5(key_src).hexdigest()

    if not os.path.isdir(cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Another process may have created it concurrently; only
            # re-raise if the directory still does not exist.
            if not os.path.isdir(cache_dir):
                raise

    mod = function.Module()
    # To handle conflicts in concurrent situation, we adopt lock-free method
    # to avoid performance degradation.
    path = os.path.join(cache_dir, name)
    if os.path.exists(path):
        # Renamed from `file`/`hash` to avoid shadowing the builtins.
        with open(path, 'rb') as cache_file:
            data = cache_file.read()
        if len(data) >= 32:
            stored_hash = data[:32]
            cubin = data[32:]
            cubin_hash = six.b(hashlib.md5(cubin).hexdigest())
            # Only trust the cached cubin if its embedded digest matches.
            if stored_hash == cubin_hash:
                mod.load(cubin)
                return mod

    # Cache miss (or corrupted entry): compile and link from scratch.
    ptx = compile_using_nvrtc(source, options, arch, name + '.cu')
    ls = function.LinkState()
    ls.add_ptr_data(ptx, u'cupy.ptx')
    cubin = ls.complete()
    cubin_hash = six.b(hashlib.md5(cubin).hexdigest())

    # shutil.move is not atomic operation, so it could result in a corrupted
    # file. We detect it by appending md5 hash at the beginning of each cache
    # file. If the file is corrupted, it will be ignored next time it is read.
    with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
        tf.write(cubin_hash)
        tf.write(cubin)
        temp_path = tf.name
    shutil.move(temp_path, path)

    # Save .cu source file along with .cubin
    if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
        with open(path + '.cu', 'w') as f:
            f.write(source)

    mod.load(cubin)
    return mod