Beispiel #1
0
    def __init__(self, languages=None,
                 noload=False,
                 storage_location=STORAGE_DIRECTORY_PATH,
                 hit_handler=hitmanager.new_hit,
                 anagram_test=anagramfunctions.test_anagram):
        """
        builds the storage paths and creates the cache and datastore.

        language selection is not currently implemented; ``languages`` is
        only used here to build the database and cache file names.

        languages: list of language codes. defaults to ``['en']``.
        noload: if true, the cache is constructed without a backing path
            and ``self.datastore`` is left as None.
        storage_location: prefix prepended to the database and cache paths.
        hit_handler: callable stored as ``self.hit_handler``.
        anagram_test: callable stored as ``self.anagram_test``.
        """
        # Build the default per call: a list literal in the signature would be
        # one shared object, so mutating `self.languages` on one instance would
        # silently change the default seen by every later instance.
        self.languages = ['en'] if languages is None else languages
        self._should_trim_cache = False
        self._write_process = None
        self._lock = multiprocessing.Lock()
        self._is_writing = multiprocessing.Event()

        # Both file names share the same "<lang>_<lang>..." stem.
        language_tag = '_'.join(self.languages)
        self.dbpath = (storage_location +
                       DATA_PATH_COMPONENT +
                       language_tag + '.db')
        self.cachepath = (storage_location +
                          CACHE_PATH_COMPONENT +
                          language_tag + '.p')

        self.hit_handler = hit_handler
        self.anagram_test = anagram_test

        if noload:
            # No persistent state is opened in this mode.
            self.cache = AnagramSimpleStore()
            self.datastore = None
        else:
            self.cache = AnagramSimpleStore(self.cachepath, ANAGRAM_CACHE_SIZE)
            self.datastore = multidbm.MultiDBM(self.dbpath)
Beispiel #2
0
class AnagramFinder(object):

    """
    AnagramFinder handles the storage, retrieval and comparisons
    of anagram candidates.
    It caches newly returned or requested candidates to memory,
    and maintains & manages a persistent database of older candidates.
    """

    def __init__(self, languages=None,
                 noload=False,
                 storage_location=STORAGE_DIRECTORY_PATH,
                 hit_handler=hitmanager.new_hit,
                 anagram_test=anagramfunctions.test_anagram):
        """
        builds the storage paths and creates the cache and datastore.

        language selection is not currently implemented; ``languages`` is
        only used here to build the database and cache file names.

        languages: list of language codes. defaults to ``['en']``.
        noload: if true, the cache is constructed without a backing path
            and ``self.datastore`` is left as None.
        storage_location: prefix prepended to the database and cache paths.
        hit_handler: callable invoked as ``hit_handler(inp, match)`` when
            an anagram passes ``anagram_test``.
        anagram_test: callable invoked as ``anagram_test(text, other_text)``;
            a truthy result counts as a hit.
        """
        # Build the default per call: a list literal in the signature would be
        # one shared object, so mutating `self.languages` on one instance would
        # silently change the default seen by every later instance.
        self.languages = ['en'] if languages is None else languages
        self._should_trim_cache = False
        self._write_process = None
        self._lock = multiprocessing.Lock()
        self._is_writing = multiprocessing.Event()

        # Both file names share the same "<lang>_<lang>..." stem.
        language_tag = '_'.join(self.languages)
        self.dbpath = (storage_location +
                       DATA_PATH_COMPONENT +
                       language_tag + '.db')
        self.cachepath = (storage_location +
                          CACHE_PATH_COMPONENT +
                          language_tag + '.p')

        self.hit_handler = hit_handler
        self.anagram_test = anagram_test

        if noload:
            # No persistent state is opened in this mode.
            self.cache = AnagramSimpleStore()
            self.datastore = None
        else:
            self.cache = AnagramSimpleStore(self.cachepath, ANAGRAM_CACHE_SIZE)
            self.datastore = multidbm.MultiDBM(self.dbpath)

    def handle_input(self, inp, text_key="text"):
        """
        takes either a string or a dict, and compares it against
        all previous input. if an anagram is found, runs self.anagram_test
        and then self.hit_handler if test passes.

        inp: the candidate; a string, or a dict holding the text under
            ``text_key`` (or the legacy ``'tweet_text'`` key).
        text_key: dict key under which the text is stored.

        may raise NeedsMaintenance (via _trim_cache) when the cache is
        trimmed while the input buffer is over its limit.
        """
        text = self._text_from_input(inp, text_key)
        key = anagramfunctions.improved_hash(text)
        if key in self.cache:
            stats.cache_hit()
            match = self.cache[key]
            # The cached entry has the same shape as `inp`, so its text lives
            # under `text_key` -- not under the hash used as the cache key.
            match_text = self._text_from_input(match, text_key)
            if self.anagram_test(text, match_text):
                del self.cache[key]
                self.hit_handler(inp, match)
            else:
                # anagram, but fails tests (too similar)
                self.cache[key] = inp
        elif self.datastore is not None and key in self.datastore:
            # not in cache, but in the datastore (which is None when the
            # finder was created with noload=True).
            self._process_hit(inp, key, text_key)
        else:
            # not in datastore. add to cache
            self.cache[key] = inp
            stats.set_cache_size(len(self.cache))

            if len(self.cache) > ANAGRAM_CACHE_SIZE:
                self._trim_cache()

    def _process_hit(self, inp, key, text_key):
        """
        compares inp against the datastore entry stored under key.

        on a passing anagram_test the hit handler is called; otherwise
        (or if the stored entry cannot be decoded) inp is put in the cache
        under key.
        """
        try:
            hit = _tweet_from_dbm(self.datastore[key])
            hit_text = self._text_from_input(hit, text_key)
            text = self._text_from_input(inp, text_key)
        except (UnicodeDecodeError, ValueError):
            # An undecodable stored entry is treated as a miss: keep the new
            # input so it can still be matched later.
            print('error decoding hit for key %s' % key)
            self.cache[key] = inp
            return
        stats.possible_hit()
        if self.anagram_test(text, hit_text):
            self.hit_handler(inp, hit)
        else:
            self.cache[key] = inp

    def _text_from_input(self, inp, key=None):
        """
        returns the text of inp.

        a text string is returned unchanged; anything else is treated as a
        dict and read from ``key``, falling back to the legacy
        ``'tweet_text'`` key.

        raises TypeError if neither key yields a non-empty value.
        """
        LEGACY_KEY = 'tweet_text'
        try:
            text_type = unicode  # Python 2 text type
        except NameError:
            text_type = str  # Python 3: `unicode` is gone, str is text
        if isinstance(inp, text_type):
            return inp

        text = inp.get(key) or inp.get(LEGACY_KEY)
        if not text:
            raise TypeError('expected string or dict')
        return text

    def _trim_cache(self, to_trim=None):
        """
        takes least frequently hit tweets from cache and writes to datastore

        to_trim: number of entries to evict; defaults to a tenth of
            ANAGRAM_CACHE_SIZE, capped at 10000.

        raises NeedsMaintenance if, after trimming, stats.buffer_size()
        exceeds ANAGRAM_STREAM_BUFFER_SIZE.
        """
        self._should_trim_cache = False

        if not to_trim:
            # Floor division keeps the count an int on Python 3 as well.
            to_trim = min(10000, (ANAGRAM_CACHE_SIZE // 10))

        to_store = self.cache.least_used(to_trim)
        # With noload=True there is no datastore; entries are then evicted
        # without being persisted instead of failing on None.
        persist = self.datastore is not None
        # write those caches to disk, delete from cache, add to hashes
        for x in to_store:
            if persist:
                self.datastore[x] = _dbm_from_tweet(self.cache[x])
            del self.cache[x]

        buffer_size = stats.buffer_size()
        if buffer_size > ANAGRAM_STREAM_BUFFER_SIZE:
            print('raised needs maintenance')
            raise NeedsMaintenance

    def perform_maintenance(self):
        """
        called when we're not keeping up with input.
        moves current database elsewhere and starts again with new db
        """
        print("perform maintenance called")
        # The archiving itself is delegated to the datastore; this method only
        # reports what archive() and section_count() return.
        moveddb = self.datastore.archive()
        print('moved mdbm chunk: %s' % moveddb)
        print('mdbm contains %s chunks' % self.datastore.section_count())

    def close(self):
        """
        waits for any active write process, then saves the cache and
        closes the datastore (if one was opened).
        """
        if self._write_process and self._write_process.is_alive():
            print('write process active. waiting.')
            self._write_process.join()

        self.cache.save()
        # datastore is None when the finder was created with noload=True.
        if self.datastore is not None:
            self.datastore.close()