def __init__(
        self,
        identifier=None,         #name under which the cache file is stored. defaults to modulename.objname
        environment=None,        #object containing information regarding the dependencies on global state of the cached operation
        operation=None,          #function to be cached. note that the order of arguments is significant for key reuse
        hierarchy=None,          #key hierarchy; if and how args and kwargs are hierarchically ordered
        validate=False,          #validation mode. if enabled, all cache retrievals are checked against a recomputed function call.
        deferred_timeout=30,     #time to wait before a deferred object is considered obsolete. compilation may take a long time; that said, it may also crash your process...
        lock_timeout=1,          #time to wait before a lock is considered obsolete. the lock is needed for pure db transactions only; this makes one second a long time
        environment_clear=True,  #clear the cache upon connection with a novel environment key
        connect_clear=False,     #clear the cache upon every connection
        ):
    """
    Bind the cached operation, build its environment key, open the backing
    shelve database, and register the rowid of the environment key so that
    later lookups can use it as a prebound prefix.

    if environment_clear is set to true, the cache is cleared when connecting
    with a previously unseen environment key.
    """
    if identifier:
        self.identifier = identifier
    if operation:
        self.operation = operation
    self.hierarchy = hierarchy

    #add some essentials to the environment; interpreter/architecture and the
    #signature plus source of the cached operation are part of the key, so a
    #changed function invalidates its previously cached results
    import platform
    globalenv = platform.architecture(), platform.python_version()
    funcenv = inspect.getargspec(self.operation), inspect.getsource(self.operation)
    self.environment = globalenv, funcenv, (environment if environment else self.environment())
    estr, ehash = process_key(self.environment)

    self.validate = validate
    self.lock_timeout = lock_timeout
    self.deferred_timeout = deferred_timeout

    self.filename = os.path.join(cachepath, self.identifier)
    self.shelve = Shelve(self.filename, autocommit=True)
    self.lock = threading.Lock()
    self.lock_file = lockfile.MkdirLockFile(self.filename, timeout=lock_timeout)

    with self.lock, self.lock_file:
        if connect_clear:
            #FIXME: this isnt right; we are now invalidating the precomputed envrowid of other processes...
            self.shelve.clear()  #need write lock here
        #write environment key to database and obtain its unique rowid
        try:
            self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)
        except Exception:
            #narrowed from a bare except: a bare clause would also swallow
            #KeyboardInterrupt/SystemExit and clear the cache on a Ctrl-C
            #connect to the db with a novel environment; probably wont change back again
            if environment_clear:
                self.shelve.clear()  #need write lock here
            self.shelve.setitem(self.environment, None, estr, ehash)
            self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)
class AbstractCache(object): """ abstract base class of a cache object which gracefully handles large arbitrary key objects """ def __init__( self, identifier = None, #name under which the cache file is stored. defaults to modulename.objname environment = None, #object containing information regarding the dependencies on global state of the cached operation operation = None, #function to be cached. note that the order of arguments is significant for key reuse hierarchy = None, #key hierarchy; if and how args and kwargs are hierarchically ordered validate = False, #validation mode. if enabled, all cache retrievals are checked against a recomputed function call. deferred_timeout = 30, #time to wait before a deferred object is considered obsolete. compilation may take a long time; that said, it may also crash your process... lock_timeout = 1, #time to wait before a lock is considered obsolete. the lock is needed for pure db transactions only; this makes once second a long time environment_clear = True, #clear the cache upon connection with a novel environment key connect_clear = False #clear the cache upon every connection ): """ if environment_clear is set to true, the cache is cleared """ if identifier: self.identifier = identifier if operation: self.operation = operation self.hierarchy = hierarchy #add some essentials to the environment import platform globalenv = platform.architecture(), platform.python_version() funcenv = inspect.getargspec(self.operation), inspect.getsource(self.operation) self.environment = globalenv, funcenv, (environment if environment else self.environment()) estr, ehash = process_key(self.environment) self.validate = validate self.lock_timeout = lock_timeout self.deferred_timeout = deferred_timeout self.filename = os.path.join(cachepath, self.identifier) self.shelve = Shelve(self.filename, autocommit = True) self.lock = threading.Lock() self.lock_file = lockfile.MkdirLockFile(self.filename, timeout = lock_timeout) with self.lock, 
self.lock_file: if connect_clear: #this isnt right; we are now invalidating the precomputed envrowid of other processes... self.shelve.clear() #need write lock here #write environment key to database and obtain its unique rowid try: self.envrowid = self.shelve.getrowid(self.environment, estr, ehash) except: #connect to the db with a novel environment; probably wont change back again if environment_clear: self.shelve.clear() #need write lock here self.shelve.setitem(self.environment, None, estr, ehash) self.envrowid = self.shelve.getrowid(self.environment, estr, ehash) def __call__(self, *args, **kwargs): """ look up a hierachical key object fill in the missing parts, and perform the computation at the leaf if so required """ if self.hierarchy: #apply the structure in hierarchy to the arguments fkey = kwargs.copy() fkey.update(enumerate(args)) hkey = [[fkey.pop(a) for a in level] for level in self.hierarchy] if fkey: hkey.append(fkey) #any arguments not part of the hierarchy spec are placed at the end else: hkey = [args + ((kwargs,) if kwargs else ())] #put all args in a single key #preprocess subkeys. this minimizes time spent in locked state hkey = map(as_deterministic, hkey) with self.lock: #fairly stupid thread locking. dont use threads; how about that? while True: try: with self.lock_file: #hierarchical key lookup; first key is prebound environment key previouskey = Partial(self.envrowid) for ikey, subkey in enumerate(hkey[:-1]): partialkey = previouskey, subkey rowid = self.shelve.getrowid(partialkey, *process_key(partialkey)) #read lock? previouskey = Partial(rowid) #leaf iteration ikey = len(hkey)-1 leafkey = previouskey, hkey[-1] value = self.shelve[leafkey] #read lock? 
if isinstance(value, Deferred): if value.expired(self.deferred_timeout): raise Exception() sleep(0.01) else: if self.validate: #check if recomputed value is identical under deterministic serialization newvalue = self.operation(*args, **kwargs) try: #note; new may differ from old in case aliasing in an ndarray was erased #by original serialization. is this an error? #id say so; depending on wether we have a cache hit, downstream code may react diffently #perhaps its best to use custom serializating for values too assert(as_deterministic(value)==as_deterministic(newvalue)) except: print 'Cache returned invalid value!' print 'arguments:' print args print kwargs print 'cached value' print value print 'recomputed value' print newvalue quit() #yes! hitting this return is what we are doing this all for! return value except: #lock for the writing branch. multiprocess does not benefit here, but so be it. #worst case we make multiple insertions into db, but this should do no harm for behavior if self.lock_file.is_locked(): #if lock not available, better to go back to waiting for a deferred to appear sleep(0.001) else: with self.lock_file: #hierarchical key insertion for subkey in hkey[ikey:-1]: partialkey = previouskey, subkey kstr, khash = process_key(partialkey) self.shelve.setitem(partialkey, None, kstr, khash) #wite lock rowid = self.shelve.getrowid(partialkey, kstr, khash) #read lock previouskey = Partial(rowid) #insert leaf node leafkey = previouskey, hkey[-1] kstr, khash = process_key(leafkey) self.shelve.setitem(leafkey, Deferred(), kstr, khash) #write lock #dont need lock while doing expensive things value = self.operation(*args, **kwargs) with self.lock_file: self.shelve.setitem(leafkey, value , kstr, khash) #write lock return value def operation(self, input): """ implements the cached operation; to be invoked upon a cache miss input is a picklable python object the returned output should be a pickalble python object as well """ raise NotImplementedError() def 
environment(self): """ returns a pickeable object describing the environment of the cached operation, or a description of the state of your computer which may influence the relation between the input and output of Cache.cached """ raise NotImplementedError()