def __init__(self, languages=None, noload=False,
             storage_location=STORAGE_DIRECTORY_PATH,
             hit_handler=hitmanager.new_hit,
             anagram_test=anagramfunctions.test_anagram):
    """
    Set up caches, storage paths and worker-coordination primitives.

    language selection is not currently implemented

    :param languages: list of language codes used to name the db/cache
        files; defaults to ['en']. (Default is None rather than a list
        to avoid the shared-mutable-default pitfall.)
    :param noload: if True, start with an empty in-memory cache and no
        persistent datastore.
    :param storage_location: directory prefix for the db and cache files.
    :param hit_handler: callable invoked with (input, match) on a hit.
    :param anagram_test: predicate deciding whether two texts count as
        a genuine anagram pair.
    """
    # avoid mutable default argument: a fresh list per call
    self.languages = ['en'] if languages is None else languages
    self._should_trim_cache = False
    self._write_process = None
    self._lock = multiprocessing.Lock()
    self._is_writing = multiprocessing.Event()
    # file names encode the language set, e.g. ..._en.db / ..._en.p
    self.dbpath = (storage_location +
                   DATA_PATH_COMPONENT +
                   '_'.join(self.languages) + '.db')
    self.cachepath = (storage_location +
                      CACHE_PATH_COMPONENT +
                      '_'.join(self.languages) + '.p')
    self.hit_handler = hit_handler
    self.anagram_test = anagram_test
    if noload:
        self.cache = AnagramSimpleStore()
        self.datastore = None
    else:
        self.cache = AnagramSimpleStore(self.cachepath, ANAGRAM_CACHE_SIZE)
        self.datastore = multidbm.MultiDBM(self.dbpath)
class AnagramFinder(object):
    """
    AnagramFinder handles the storage, retrieval and comparisons
    of anagram candidates. It caches newly returned or requested
    candidates to memory, and maintains & manages a persistent
    database of older candidates.
    """

    def __init__(self, languages=None, noload=False,
                 storage_location=STORAGE_DIRECTORY_PATH,
                 hit_handler=hitmanager.new_hit,
                 anagram_test=anagramfunctions.test_anagram):
        """
        Set up caches, storage paths and worker-coordination primitives.

        language selection is not currently implemented

        :param languages: list of language codes used to name the
            db/cache files; defaults to ['en']. (None sentinel avoids
            the shared-mutable-default pitfall.)
        :param noload: if True, start with an empty in-memory cache and
            no persistent datastore.
        :param storage_location: directory prefix for db/cache files.
        :param hit_handler: callable invoked with (input, match) on a hit.
        :param anagram_test: predicate deciding whether two texts count
            as a genuine anagram pair.
        """
        self.languages = ['en'] if languages is None else languages
        self._should_trim_cache = False
        self._write_process = None
        self._lock = multiprocessing.Lock()
        self._is_writing = multiprocessing.Event()
        # file names encode the language set, e.g. ..._en.db / ..._en.p
        self.dbpath = (storage_location +
                       DATA_PATH_COMPONENT +
                       '_'.join(self.languages) + '.db')
        self.cachepath = (storage_location +
                          CACHE_PATH_COMPONENT +
                          '_'.join(self.languages) + '.p')
        self.hit_handler = hit_handler
        self.anagram_test = anagram_test
        if noload:
            self.cache = AnagramSimpleStore()
            self.datastore = None
        else:
            self.cache = AnagramSimpleStore(self.cachepath,
                                            ANAGRAM_CACHE_SIZE)
            self.datastore = multidbm.MultiDBM(self.dbpath)

    def handle_input(self, inp, text_key="text"):
        """
        takes either a string or a dict, and compares it against
        all previous input. if an anagram is found, runs self.anagram_test
        and then self.hit_handler if test passes.

        :param inp: unicode string or dict holding the candidate text.
        :param text_key: dict key under which the text lives when inp
            (or a cached/stored match) is a dict.
        """
        text = self._text_from_input(inp, text_key)
        key = anagramfunctions.improved_hash(text)
        if key in self.cache:
            stats.cache_hit()
            match = self.cache[key]
            # BUGFIX: the stored match must be unpacked with the same
            # text_key as inp — previously the hash itself was passed
            # as the dict key, so lookup always missed.
            match_text = self._text_from_input(match, text_key)
            if self.anagram_test(text, match_text):
                del self.cache[key]
                self.hit_handler(inp, match)
            else:
                # anagram, but fails tests (too similar)
                self.cache[key] = inp
        else:
            # not in cache. in datastore? (datastore is None when
            # constructed with noload=True — guard against that)
            if self.datastore is not None and key in self.datastore:
                self._process_hit(inp, key, text_key)
            else:
                # not in datastore. add to cache
                self.cache[key] = inp
                stats.set_cache_size(len(self.cache))
                if len(self.cache) > ANAGRAM_CACHE_SIZE:
                    self._trim_cache()

    def _process_hit(self, inp, key, text_key):
        """
        Compare inp against the datastore entry for key; on a passing
        anagram_test, hand both to hit_handler, otherwise (or on a
        decode failure) keep inp in the cache under key.
        """
        try:
            hit = _tweet_from_dbm(self.datastore[key])
            hit_text = self._text_from_input(hit, text_key)
            text = self._text_from_input(inp, text_key)
        except (UnicodeDecodeError, ValueError) as err:
            # include the actual error — previously `err` was dropped
            print('error decoding hit for key %s: %s' % (key, err))
            self.cache[key] = inp
            return
        stats.possible_hit()
        if self.anagram_test(text, hit_text):
            self.hit_handler(inp, hit)
        else:
            self.cache[key] = inp

    def _text_from_input(self, inp, key=None):
        """
        Extract the candidate text from inp.

        Strings are returned unchanged; dicts are read via `key`,
        falling back to the legacy 'tweet_text' field.

        :raises TypeError: if no text can be extracted.
        """
        LEGACY_KEY = 'tweet_text'
        # NOTE(review): `unicode` implies this module targets Python 2;
        # under Python 3 this would need `str`.
        if isinstance(inp, unicode):
            return inp
        else:
            text = inp.get(key) or inp.get(LEGACY_KEY)
            if not text:
                raise TypeError('expected string or dict')
            return text

    def _trim_cache(self, to_trim=None):
        """
        takes least frequently hit tweets from cache
        and writes to datastore

        :param to_trim: number of entries to evict; defaults to
            min(10000, ANAGRAM_CACHE_SIZE / 10).
        :raises NeedsMaintenance: when the input buffer has grown past
            ANAGRAM_STREAM_BUFFER_SIZE, signalling the caller to run
            perform_maintenance().
        """
        self._should_trim_cache = False
        if not to_trim:
            to_trim = min(10000, (ANAGRAM_CACHE_SIZE / 10))
        to_store = self.cache.least_used(to_trim)
        # write those caches to disk, delete from cache, add to hashes
        for x in to_store:
            self.datastore[x] = _dbm_from_tweet(self.cache[x])
            del self.cache[x]
        buffer_size = stats.buffer_size()
        if buffer_size > ANAGRAM_STREAM_BUFFER_SIZE:
            print('raised needs maintenance')
            raise NeedsMaintenance

    def perform_maintenance(self):
        """
        called when we're not keeping up with input.
        moves current database elsewhere and starts again with new db
        """
        print("perform maintenance called")
        # save our current cache to be restored after we run _setup (hacky)
        moveddb = self.datastore.archive()
        print('moved mdbm chunk: %s' % moveddb)
        print('mdbm contains %s chunks' % self.datastore.section_count())

    def close(self):
        """
        Flush the cache to disk and close the datastore, waiting for any
        in-flight background write to finish first.
        """
        if self._write_process and self._write_process.is_alive():
            print('write process active. waiting.')
            self._write_process.join()
        self.cache.save()
        # datastore is None when constructed with noload=True
        if self.datastore is not None:
            self.datastore.close()