def _store(self, obj, form, bcs, tsfc_parameters):
    """Insert an assembled object into the cache.

    :arg obj: the assembled object to cache.
    :arg form: the form it was assembled from (used to derive the cache key).
    :arg bcs: the boundary conditions applied during assembly.
    :arg tsfc_parameters: TSFC parameters; their string form disambiguates
        entries assembled with different compiler options.

    Forms that have been invalidated more than
    ``parameters["assembly_cache"]["max_misses"]`` times are excluded
    from the cache entirely.
    """
    key = self._cache_key(form)
    miss_limit = parameters["assembly_cache"]["max_misses"]
    misses = self.invalid_count[key]
    if misses > miss_limit:
        # Excluded from caching.  Log only once, on the first miss
        # past the limit, to avoid spamming the debug output.
        if misses == miss_limit + 1:
            debug("form %s missed too many times, excluding from cache." % form)
        return
    entry = _CacheEntry(obj, form, bcs)
    self.cache[key] = str(tsfc_parameters), entry
    self.evict()
def _store(self, obj, form, bcs, ffc_parameters):
    """Insert an assembled object into the cache, keyed on the form
    signature.

    :arg obj: the assembled object to cache.
    :arg form: the form it was assembled from; its ``signature()`` is
        the cache key.
    :arg bcs: the boundary conditions applied during assembly.
    :arg ffc_parameters: FFC parameters; stored (stringified) alongside
        the entry so differing compile options do not collide.

    A form that has missed more than
    ``parameters["assembly_cache"]["max_misses"]`` times is never cached.
    """
    form_sig = form.signature()
    limit = parameters["assembly_cache"]["max_misses"]
    count = self.invalid_count[form_sig]
    if count > limit:
        # Only announce the exclusion the first time we cross the limit.
        if count == limit + 1:
            debug("form %s missed too many times, excluding from cache." % form)
        return
    self.cache[form_sig] = str(ffc_parameters), _CacheEntry(obj, form, bcs)
    self.evict()
def _store(self, obj, form, bcs, ffc_parameters):
    """Insert an assembled object into the cache.

    :arg obj: the assembled object to cache.
    :arg form: the form it was assembled from (used to derive the cache key).
    :arg bcs: the boundary conditions applied during assembly.
    :arg ffc_parameters: FFC parameters, stored (stringified) with the entry.

    Forms that have been invalidated more than
    ``parameters["assembly_cache"]["max_misses"]`` times are not cached.
    """
    key = self._cache_key(form)
    threshold = parameters["assembly_cache"]["max_misses"]
    if self.invalid_count[key] <= threshold:
        # Still cacheable: store the entry and trim the cache if needed.
        self.cache[key] = str(ffc_parameters), _CacheEntry(obj, form, bcs)
        self.evict()
    elif self.invalid_count[key] == threshold + 1:
        # First miss past the threshold: log the exclusion exactly once.
        debug("form %s missed too many times, excluding from cache." % form)
def compilation_comm(comm):
    """Get a communicator for compilation.

    :arg comm: The input communicator.
    :returns: A communicator used for compilation (may be smaller).
    :raises CompilationError: if node-sharedness of the filesystem
        cannot be determined in the pre-MPI-3 fallback path.
    """
    # Should we try and do node-local compilation?
    if not configuration["node_local_compilation"]:
        return comm
    retcomm = get_compilation_comm(comm)
    if retcomm is not None:
        debug("Found existing compilation communicator")
        return retcomm
    if MPI.VERSION >= 3:
        # MPI-3 can split directly by shared-memory domain (i.e. node).
        debug("Creating compilation communicator using MPI_Split_type")
        retcomm = comm.Split_type(MPI.COMM_TYPE_SHARED)
        set_compilation_comm(comm, retcomm)
        return retcomm
    # Pre-MPI-3 fallback: work out which ranks share a node by touching
    # rank-named files in a (hopefully shared) cache directory.
    debug("Creating compilation communicator using MPI_Split + filesystem")
    import tempfile
    if comm.rank == 0:
        if not os.path.exists(configuration["cache_dir"]):
            # exist_ok=True guards against a race with another process
            # creating the directory between the exists() check and this
            # call (the cache dir lives on a shared filesystem).
            os.makedirs(configuration["cache_dir"], exist_ok=True)
        tmpname = tempfile.mkdtemp(prefix="rank-determination-",
                                   dir=configuration["cache_dir"])
    else:
        tmpname = None
    tmpname = comm.bcast(tmpname, root=0)
    if tmpname is None:
        raise CompilationError("Cannot determine sharedness of filesystem")
    # Touch file named after our rank; ranks on the same node see each
    # other's files.
    with open(os.path.join(tmpname, str(comm.rank)), "wb"):
        pass
    comm.barrier()
    import glob
    ranks = sorted(int(os.path.basename(name))
                   for name in glob.glob("%s/[0-9]*" % tmpname))
    # All ranks that saw the same file set get the same color (the
    # smallest rank among them) and hence the same sub-communicator.
    retcomm = comm.Split(color=min(ranks), key=comm.rank)
    set_compilation_comm(comm, retcomm)
    return retcomm
def compilation_comm(comm):
    """Get a communicator for compilation.

    :arg comm: The input communicator.
    :returns: A communicator used for compilation (may be smaller)
    """
    # Node-local compilation disabled: compile on the input communicator.
    if not configuration["node_local_compilation"]:
        return comm
    cached = get_compilation_comm(comm)
    if cached is not None:
        debug("Found existing compilation communicator")
        return cached
    if MPI.VERSION >= 3:
        # MPI-3 can split by shared-memory domain (node) directly.
        debug("Creating compilation communicator using MPI_Split_type")
        newcomm = comm.Split_type(MPI.COMM_TYPE_SHARED)
        set_compilation_comm(comm, newcomm)
        return newcomm
    # Pre-MPI-3 fallback: discover node co-location by touching
    # rank-named files in a shared directory.
    debug("Creating compilation communicator using MPI_Split + filesystem")
    import tempfile
    tmpname = None
    if comm.rank == 0:
        if not os.path.exists(configuration["cache_dir"]):
            os.makedirs(configuration["cache_dir"], exist_ok=True)
        tmpname = tempfile.mkdtemp(prefix="rank-determination-",
                                   dir=configuration["cache_dir"])
    tmpname = comm.bcast(tmpname, root=0)
    if tmpname is None:
        raise CompilationError("Cannot determine sharedness of filesystem")
    # Touch a file named after this rank.
    with open(os.path.join(tmpname, str(comm.rank)), "wb"):
        pass
    comm.barrier()
    import glob
    visible = [int(os.path.basename(name))
               for name in glob.glob("%s/[0-9]*" % tmpname)]
    # Ranks that see the same files share a node: color by the smallest.
    newcomm = comm.Split(color=min(visible), key=comm.rank)
    set_compilation_comm(comm, newcomm)
    return newcomm
def get_so(self, src, extension):
    """Build a shared library and load it.

    :arg src: The source string to compile.
    :arg extension: extension of the source file (c, cpp).
    :raises CompilationError: if generated code differs across ranks,
        or if compilation/linking fails.

    Returns a :class:`ctypes.CDLL` object of the resulting shared
    library."""
    # Determine cache key: hash of the source plus the full compiler,
    # linker and flag configuration, so a toolchain change misses the cache.
    hsh = md5(src.encode())
    hsh.update(self._cc.encode())
    if self._ld:
        hsh.update(self._ld.encode())
    hsh.update("".join(self._cppargs).encode())
    hsh.update("".join(self._ldargs).encode())
    basename = hsh.hexdigest()
    cachedir = configuration['cache_dir']
    # Per-process names for intermediate files (avoid clashes between
    # ranks compiling concurrently); the final .so name is shared.
    pid = os.getpid()
    cname = os.path.join(cachedir, "%s_p%d.%s" % (basename, pid, extension))
    oname = os.path.join(cachedir, "%s_p%d.o" % (basename, pid))
    soname = os.path.join(cachedir, "%s.so" % basename)
    # Link into temporary file, then rename to shared library
    # atomically (avoiding races).
    tmpname = os.path.join(cachedir, "%s_p%d.so.tmp" % (basename, pid))
    if configuration['check_src_hashes'] or configuration['debug']:
        # NOTE(review): _check_op presumably reduces to a sentinel when
        # hashes differ across ranks — confirm its semantics.
        matching = self.comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
            if self.comm.rank == 0:
                if not os.path.exists(output):
                    os.makedirs(output, exist_ok=True)
            # Barrier so no rank writes before the directory exists.
            self.comm.barrier()
            with open(srcfile, "w") as f:
                f.write(src)
            self.comm.barrier()
            raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
    try:
        # Are we in the cache?
        return ctypes.CDLL(soname)
    except OSError:
        # No, let's go ahead and build
        if self.comm.rank == 0:
            # No need to do this on all ranks
            if not os.path.exists(cachedir):
                os.makedirs(cachedir, exist_ok=True)
            logfile = os.path.join(cachedir, "%s_p%d.log" % (basename, pid))
            errfile = os.path.join(cachedir, "%s_p%d.err" % (basename, pid))
            with progress(INFO, 'Compiling wrapper'):
                with open(cname, "w") as f:
                    f.write(src)
                # Compiler also links (no separate linker configured).
                if self._ld is None:
                    cc = [self._cc] + self._cppargs + \
                         ['-o', tmpname, cname] + self._ldargs
                    debug('Compilation command: %s', ' '.join(cc))
                    with open(logfile, "w") as log:
                        with open(errfile, "w") as err:
                            log.write("Compilation command:\n")
                            log.write(" ".join(cc))
                            log.write("\n\n")
                            try:
                                if configuration['no_fork_available']:
                                    # Cannot fork a subprocess: run via the
                                    # shell with redirections appended to
                                    # the command line instead.
                                    cc += ["2>", errfile, ">", logfile]
                                    cmd = " ".join(cc)
                                    status = os.system(cmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, cmd)
                                else:
                                    subprocess.check_call(cc, stderr=err, stdout=log)
                            except subprocess.CalledProcessError as e:
                                raise CompilationError(
                                    """Command "%s" return error status %d.
Unable to compile code
Compile log in %s
Compile errors in %s""" % (e.cmd, e.returncode, logfile, errfile))
                else:
                    # Separate compile and link steps.
                    cc = [self._cc] + self._cppargs + \
                         ['-c', '-o', oname, cname]
                    ld = self._ld.split() + ['-o', tmpname, oname] + self._ldargs
                    debug('Compilation command: %s', ' '.join(cc))
                    debug('Link command: %s', ' '.join(ld))
                    with open(logfile, "w") as log:
                        with open(errfile, "w") as err:
                            log.write("Compilation command:\n")
                            log.write(" ".join(cc))
                            log.write("\n\n")
                            log.write("Link command:\n")
                            log.write(" ".join(ld))
                            log.write("\n\n")
                            try:
                                if configuration['no_fork_available']:
                                    cc += ["2>", errfile, ">", logfile]
                                    ld += ["2>", errfile, ">", logfile]
                                    cccmd = " ".join(cc)
                                    ldcmd = " ".join(ld)
                                    status = os.system(cccmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, cccmd)
                                    status = os.system(ldcmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, ldcmd)
                                else:
                                    subprocess.check_call(cc, stderr=err, stdout=log)
                                    subprocess.check_call(ld, stderr=err, stdout=log)
                            except subprocess.CalledProcessError as e:
                                raise CompilationError(
                                    """Command "%s" return error status %d.
Unable to compile code
Compile log in %s
Compile errors in %s""" % (e.cmd, e.returncode, logfile, errfile))
            # Atomically ensure soname exists
            os.rename(tmpname, soname)
        # Wait for compilation to complete
        self.comm.barrier()
        # Load resulting library
        return ctypes.CDLL(soname)
def evict(self):
    """Run the cache eviction algorithm.

    This works out the permitted cache size and deletes objects until
    it is achieved.  Cache values are assumed to have a
    :attr:`value` attribute and eviction occurs in increasing
    :attr:`value` order.  Currently :attr:`value` is an index of the
    assembly operation, so older operations are evicted first.

    The cache will be evicted down to 90% of permitted size.

    The permitted size is either the explicit
    :data:`parameters["assembly_cache"]["max_bytes"]` or it is the
    amount of memory per core scaled by
    :data:`parameters["assembly_cache"]["max_factor"]` (by default the
    scale factor is 0.6).

    In MPI parallel, the nbytes of each cache entry is set to the
    maximum over all processes, while the available memory is set to
    the minimum.  This produces a conservative caching policy which is
    guaranteed to result in the same evictions on each processor.
    """
    if not parameters["assembly_cache"]["eviction"]:
        return
    max_cache_size = min(parameters["assembly_cache"]["max_bytes"] or float("inf"),
                         (memory or float("inf"))
                         * parameters["assembly_cache"]["max_factor"])
    if max_cache_size == float("inf"):
        if not self.evictwarned:
            warning("No maximum assembly cache size. Install psutil >= 2.0.0 or risk leaking memory!")
            self.evictwarned = True
        return
    cache_size = self.nbytes
    if cache_size < max_cache_size:
        return
    debug("Cache eviction triggered. %s bytes in cache, %s bytes allowed" % (cache_size, max_cache_size))
    # Evict down to 90% full.
    bytes_to_evict = cache_size - 0.9 * max_cache_size

    def entry_nbytes(item):
        # item is a (key, (params_string, cache_entry)) pair.
        return item[1][1].nbytes

    # Oldest entries (smallest .value) first.
    sorted_cache = sorted(self.cache.items(), key=lambda item: item[1][1].value)
    candidates = []
    # Take entries until enough bytes would be freed.  Also stop when the
    # list is exhausted: if self.nbytes disagrees with the per-entry
    # accounting, the unguarded pop(0) would raise IndexError.
    while bytes_to_evict > 0 and sorted_cache:
        item = sorted_cache.pop(0)
        candidates.append(item)
        bytes_to_evict -= entry_nbytes(item)
    for c in reversed(candidates):
        if bytes_to_evict + entry_nbytes(c) < 0:
            # We may have been overzealous: keep this entry.
            bytes_to_evict += entry_nbytes(c)
        else:
            del self.cache[c[0]]
from pyop2.mpi import MPI from pyop2.logger import warning, debug from pyop2.utils import flatten try: from pyslope import slope backend = os.environ.get('SLOPE_BACKEND') if backend not in ['SEQUENTIAL', 'OMP']: backend = 'SEQUENTIAL' if MPI.COMM_WORLD.size > 1: if backend == 'SEQUENTIAL': backend = 'ONLY_MPI' if backend == 'OMP': backend = 'OMP_MPI' slope.set_exec_mode(backend) debug("SLOPE backend set to %s" % backend) except ImportError: slope = None lazy_trace_name = 'lazy_trace' """The default name for sequences of lazily evaluated :class:`ParLoop`s.""" from pyop2.fusion.transformer import Inspector from pyop2.fusion import extended def fuse(name, loop_chain, **kwargs): """Apply fusion (and possibly tiling) to an iterator of :class:`ParLoop` obecjts, which we refer to as ``loop_chain``. Return an iterator of :class:`ParLoop` objects, in which some loops may have been fused or tiled. If fusion could not be applied, return the unmodified ``loop_chain``.
def get_so(self, jitmodule, extension):
    """Build a shared library and load it.

    :arg jitmodule: The JIT Module which can generate the code to compile.
    :arg extension: extension of the source file (c, cpp).
    :raises CompilationError: if generated code differs across ranks,
        or if compilation/linking fails.

    Returns a :class:`ctypes.CDLL` object of the resulting shared
    library."""
    # Determine cache key: hash of the module's cache key plus the full
    # compiler, linker and flag configuration, so a toolchain change
    # misses the cache.
    hsh = md5(str(jitmodule.cache_key).encode())
    hsh.update(self._cc.encode())
    if self._ld:
        hsh.update(self._ld.encode())
    hsh.update("".join(self._cppargs).encode())
    hsh.update("".join(self._ldargs).encode())
    basename = hsh.hexdigest()
    cachedir = configuration['cache_dir']
    # Shard the cache directory on the first two hex digits to keep
    # individual directories small.
    dirpart, basename = basename[:2], basename[2:]
    cachedir = os.path.join(cachedir, dirpart)
    # Per-process names for intermediate files (avoid clashes between
    # ranks compiling concurrently); the final .so name is shared.
    pid = os.getpid()
    cname = os.path.join(cachedir, "%s_p%d.%s" % (basename, pid, extension))
    oname = os.path.join(cachedir, "%s_p%d.o" % (basename, pid))
    soname = os.path.join(cachedir, "%s.so" % basename)
    # Link into temporary file, then rename to shared library
    # atomically (avoiding races).
    tmpname = os.path.join(cachedir, "%s_p%d.so.tmp" % (basename, pid))
    if configuration['check_src_hashes'] or configuration['debug']:
        # NOTE(review): _check_op presumably reduces to a sentinel when
        # hashes differ across ranks — confirm its semantics.
        matching = self.comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
            if self.comm.rank == 0:
                os.makedirs(output, exist_ok=True)
            # Barrier so no rank writes before the directory exists.
            self.comm.barrier()
            with open(srcfile, "w") as f:
                f.write(jitmodule.code_to_compile)
            self.comm.barrier()
            raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
    try:
        # Are we in the cache?
        return ctypes.CDLL(soname)
    except OSError:
        # No, let's go ahead and build
        if self.comm.rank == 0:
            # No need to do this on all ranks
            os.makedirs(cachedir, exist_ok=True)
            logfile = os.path.join(cachedir, "%s_p%d.log" % (basename, pid))
            errfile = os.path.join(cachedir, "%s_p%d.err" % (basename, pid))
            with progress(INFO, 'Compiling wrapper'):
                with open(cname, "w") as f:
                    f.write(jitmodule.code_to_compile)
                # Compiler also links (no separate linker configured).
                if self._ld is None:
                    cc = [self._cc] + self._cppargs + \
                         ['-o', tmpname, cname] + self._ldargs
                    debug('Compilation command: %s', ' '.join(cc))
                    with open(logfile, "w") as log:
                        with open(errfile, "w") as err:
                            log.write("Compilation command:\n")
                            log.write(" ".join(cc))
                            log.write("\n\n")
                            try:
                                if configuration['no_fork_available']:
                                    # Cannot fork a subprocess: run via the
                                    # shell with redirections appended to
                                    # the command line instead.
                                    cc += ["2>", errfile, ">", logfile]
                                    cmd = " ".join(cc)
                                    status = os.system(cmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, cmd)
                                else:
                                    subprocess.check_call(cc, stderr=err, stdout=log)
                            except subprocess.CalledProcessError as e:
                                raise CompilationError(
                                    """Command "%s" return error status %d.
Unable to compile code
Compile log in %s
Compile errors in %s""" % (e.cmd, e.returncode, logfile, errfile))
                else:
                    # Separate compile and link steps.
                    cc = [self._cc] + self._cppargs + \
                         ['-c', '-o', oname, cname]
                    ld = self._ld.split() + ['-o', tmpname, oname] + self._ldargs
                    debug('Compilation command: %s', ' '.join(cc))
                    debug('Link command: %s', ' '.join(ld))
                    with open(logfile, "w") as log:
                        with open(errfile, "w") as err:
                            log.write("Compilation command:\n")
                            log.write(" ".join(cc))
                            log.write("\n\n")
                            log.write("Link command:\n")
                            log.write(" ".join(ld))
                            log.write("\n\n")
                            try:
                                if configuration['no_fork_available']:
                                    cc += ["2>", errfile, ">", logfile]
                                    ld += ["2>", errfile, ">", logfile]
                                    cccmd = " ".join(cc)
                                    ldcmd = " ".join(ld)
                                    status = os.system(cccmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, cccmd)
                                    status = os.system(ldcmd)
                                    if status != 0:
                                        raise subprocess.CalledProcessError(status, ldcmd)
                                else:
                                    subprocess.check_call(cc, stderr=err, stdout=log)
                                    subprocess.check_call(ld, stderr=err, stdout=log)
                            except subprocess.CalledProcessError as e:
                                raise CompilationError(
                                    """Command "%s" return error status %d.
Unable to compile code
Compile log in %s
Compile errors in %s""" % (e.cmd, e.returncode, logfile, errfile))
            # Atomically ensure soname exists
            os.rename(tmpname, soname)
        # Wait for compilation to complete
        self.comm.barrier()
        # Load resulting library
        return ctypes.CDLL(soname)
def evict(self):
    """Run the cache eviction algorithm.

    This works out the permitted cache size and deletes objects until
    it is achieved.  Cache values are assumed to have a
    :attr:`value` attribute and eviction occurs in increasing
    :attr:`value` order.  Currently :attr:`value` is an index of the
    assembly operation, so older operations are evicted first.

    The cache will be evicted down to 90% of permitted size.

    The permitted size is either the explicit
    :data:`parameters["assembly_cache"]["max_bytes"]` or it is the
    amount of memory per core scaled by
    :data:`parameters["assembly_cache"]["max_factor"]` (by default the
    scale factor is 0.6).

    In MPI parallel, the nbytes of each cache entry is set to the
    maximum over all processes, while the available memory is set to
    the minimum.  This produces a conservative caching policy which is
    guaranteed to result in the same evictions on each processor.
    """
    if not parameters["assembly_cache"]["eviction"]:
        return
    max_cache_size = min(
        parameters["assembly_cache"]["max_bytes"] or float("inf"),
        (memory or float("inf")) * parameters["assembly_cache"]["max_factor"])
    if max_cache_size == float("inf"):
        if not self.evictwarned:
            warning(
                "No maximum assembly cache size. Install psutil >= 2.0.0 or risk leaking memory!"
            )
            self.evictwarned = True
        return
    cache_size = self.nbytes
    if cache_size < max_cache_size:
        return
    debug("Cache eviction triggered. %s bytes in cache, %s bytes allowed"
          % (cache_size, max_cache_size))
    # Evict down to 90% full.
    bytes_to_evict = cache_size - 0.9 * max_cache_size

    def entry_nbytes(item):
        # item is a (key, (params_string, cache_entry)) pair.
        return item[1][1].nbytes

    # Oldest entries (smallest .value) first.  Index-based scan avoids
    # both quadratic pop(0) behavior and the IndexError the unguarded
    # pop(0) loop raises if self.nbytes disagrees with the per-entry
    # accounting and the list runs out.
    victims = sorted(self.cache.items(), key=lambda item: item[1][1].value)
    candidates = []
    idx = 0
    while bytes_to_evict > 0 and idx < len(victims):
        item = victims[idx]
        candidates.append(item)
        bytes_to_evict -= entry_nbytes(item)
        idx += 1
    for c in reversed(candidates):
        if bytes_to_evict + entry_nbytes(c) < 0:
            # We may have been overzealous: keep this entry.
            bytes_to_evict += entry_nbytes(c)
        else:
            del self.cache[c[0]]