Example #1
0
    def test_clear_caches_persistance(self):
        """Check that cleared cache entries do not reappear in a new instance.

        A first Hunspell instance backed by a disk cache directory gets
        fake 'made-up' entries injected into its suggest and stem caches.
        After saving, clearing and deregistering the caches, a second
        instance pointed at the same directory must start with empty
        caches and must not return the injected values.
        """
        def build_checker(cache_dir):
            # Both instances must be configured identically so they share
            # the same on-disk cache location and cache manager name.
            return Hunspell('test',
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=cache_dir,
                cache_manager='disk_hun')

        scratch_dir = tempfile.mkdtemp()
        try:
            first = build_checker(scratch_dir)
            expected_suggest = first.suggest('testing')
            expected_stem = first.stem('testing')

            # Inject results under a key that is not a real word and confirm
            # the lookups are answered with the injected values.
            first._suggest_cache['made-up'] = expected_suggest
            self.assertEqual(first.suggest('made-up'), expected_suggest)
            first._stem_cache['made-up'] = expected_stem
            self.assertEqual(first.stem('made-up'), expected_stem)

            first.save_cache()
            first.clear_cache()
            del first

            # Drop every cache registered with the manager before creating
            # the second instance.
            manager = get_cache_manager('disk_hun')
            manager.deregister_all_caches()
            self.assertEqual(len(manager.cache_by_name), 0)

            second = build_checker(scratch_dir)

            self.assertEqual(len(second._suggest_cache), 0)
            self.assertEqual(len(second._stem_cache), 0)
            self.assertNotEqual(second.suggest('made-up'), expected_suggest)
            self.assertNotEqual(second.stem('made-up'), expected_stem)
        finally:
            # Remove the temporary cache directory even when the test fails.
            shutil.rmtree(scratch_dir)
class SpellChecker:
    """Replace misspelled tokens with the first Hunspell suggestion.

    The dictionary files for the requested language are downloaded into
    ``hunspell_data_dir`` when they are not already present. Substitutions
    that were already computed are remembered in ``self.substitutes`` so
    each distinct token is looked up only once per instance.
    """

    # URL pattern of the raw dictionary files: (directory, language, extension).
    _DICT_URL = ('https://raw.githubusercontent.com/LibreOffice/dictionaries'
                 '/master/%s/%s.%s')

    def __init__(self,
                 language='en_US',
                 hunspell_data_dir='./hunspell',
                 n_jobs=1):
        """Ensure the dictionaries exist and create the Hunspell instance.

        Args:
            language: Dictionary name, e.g. ``'en_US'``.
            hunspell_data_dir: Directory holding the ``.aff``/``.dic`` files;
                a ``cache`` sub-directory is used as the disk cache dir.
            n_jobs: Value passed to ``Hunspell.set_concurrency``.
        """
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(language,
                                 hunspell_data_dir=hunspell_data_dir,
                                 disk_cache_dir=os.path.join(
                                     hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        # Maps every token seen so far to the token that replaces it.
        self.substitutes = {}

    def spell_check(self, tokenized_corpus_2d):
        """Return a copy of the corpus with misspelled tokens replaced.

        Args:
            tokenized_corpus_2d: Iterable of iterables of tokens. It is
                traversed twice, so it must be re-iterable (not a one-shot
                generator).

        Returns:
            A list of lists with the same layout as the input, in which
            every token is replaced by its entry in ``self.substitutes``:
            the token itself when it is accepted by ``hunspell.spell`` or
            has no suggestions, otherwise the first suggestion.
        """
        tokens = {t for iterable in tokenized_corpus_2d for t in iterable}
        # Only tokens without a known substitution need to be looked up.
        new_tokens = tokens - self.substitutes.keys()
        correct_tokens = {t for t in new_tokens if self.hunspell.spell(t)}
        self.substitutes.update((t, t) for t in correct_tokens)
        tokens_to_check = new_tokens - correct_tokens
        suggestions = self.hunspell.bulk_suggest(tokens_to_check)
        # Keep the token unchanged when there is nothing to suggest.
        self.substitutes.update(
            (token, candidates[0] if candidates else token)
            for token, candidates in suggestions.items())
        return [[self.substitutes[token] for token in iterable]
                for iterable in tokenized_corpus_2d]

    @staticmethod
    def get_dict(language, data_dir):
        """Download the ``.aff`` and ``.dic`` files of a language if missing.

        The file is first requested from the directory named after the full
        language code; on a 404 the directory named after the part before
        the underscore is tried (e.g. ``en`` for ``en_US``).

        Args:
            language: Dictionary name, e.g. ``'en_US'``.
            data_dir: Target directory; created when it does not exist.

        Raises:
            Whatever ``raise_for_status()`` of the HTTP response raises when
            the download ultimately fails. Nothing is written in that case.
        """
        os.makedirs(data_dir, exist_ok=True)
        # partition() returns the whole string when there is no underscore,
        # unlike slicing with find(), which would drop the last character.
        base_language = language.partition('_')[0]
        for ext in ['aff', 'dic']:
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if os.path.exists(path):
                continue
            r = get(SpellChecker._DICT_URL % (language, language, ext))
            if r.status_code == 404 and base_language != language:
                r = get(SpellChecker._DICT_URL
                        % (base_language, language, ext))
            # Check every outcome, not only the fallback, so an error page
            # is never stored as a dictionary file.
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)

    def __del__(self):
        """Persist the Hunspell cache for future program executions."""
        # __init__ may have failed before self.hunspell was assigned.
        hunspell = getattr(self, 'hunspell', None)
        if hunspell is not None:
            hunspell.save_cache()
Example #3
0
def test_clear_caches_persistance(hunspell):
    """Check that cleared cache entries do not reappear in a new instance.

    A first Hunspell instance backed by a disk cache directory gets fake
    'made-up' entries injected into its suggest, suffix and stem caches.
    After saving, clearing and deregistering the caches, a second instance
    pointed at the same directory must start with empty caches and must not
    return the injected values.

    The ``hunspell`` argument is not used in the body; presumably it is a
    pytest fixture requested for its setup -- confirm against conftest.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        h1 = Hunspell('test',
                      hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir,
                      cache_manager='disk_hun')
        test_suggest = h1.suggest('testing')
        test_suffix = h1.suffix_suggest('testing')
        test_stem = h1.stem('testing')

        # Inject results under a key that is not a real word and confirm the
        # lookups are answered with the injected values.
        h1._suggest_cache['made-up'] = test_suggest
        assert h1.suggest('made-up') == test_suggest
        h1._suffix_cache['made-up'] = test_suffix
        assert h1.suffix_suggest('made-up') == test_suffix
        h1._stem_cache['made-up'] = test_stem
        assert h1.stem('made-up') == test_stem

        h1.save_cache()
        h1.clear_cache()
        del h1

        # Drop every cache registered with the manager before creating the
        # second instance.
        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        assert len(cacheman.cache_by_name) == 0

        h2 = Hunspell('test',
                      hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir,
                      cache_manager='disk_hun')

        # All three caches were populated above, so all three are checked.
        assert len(h2._suggest_cache) == 0
        assert len(h2._suffix_cache) == 0
        assert len(h2._stem_cache) == 0
        assert h2.suggest('made-up') != test_suggest
        assert h2.suffix_suggest('made-up') != test_suffix
        assert h2.stem('made-up') != test_stem
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content