def test_clear_caches_persistance(self):
    """Check that caches cleared on one instance are empty for the next one.

    A first ``Hunspell`` instance gets real results planted under the key
    ``'made-up'``; its caches are saved, cleared and deregistered. A second
    instance built on the same disk cache directory must then start with
    empty caches and must not return the planted results.
    """
    cache_dir = tempfile.mkdtemp()
    try:
        # Both instances are built with exactly the same arguments.
        options = dict(hunspell_data_dir=DICT_DIR,
                       disk_cache_dir=cache_dir,
                       cache_manager='disk_hun')

        first = Hunspell('test', **options)
        real_suggest = first.suggest('testing')
        real_stem = first.stem('testing')

        # Plant the real results under a different key and read them back
        # through the public API.
        first._suggest_cache['made-up'] = real_suggest
        self.assertEqual(first.suggest('made-up'), real_suggest)
        first._stem_cache['made-up'] = real_stem
        self.assertEqual(first.stem('made-up'), real_stem)

        first.save_cache()
        first.clear_cache()
        del first

        # Drop every registered cache so the next instance cannot reuse one.
        manager = get_cache_manager('disk_hun')
        manager.deregister_all_caches()
        self.assertEqual(len(manager.cache_by_name), 0)

        second = Hunspell('test', **options)
        self.assertEqual(len(second._suggest_cache), 0)
        self.assertEqual(len(second._stem_cache), 0)
        self.assertNotEqual(second.suggest('made-up'), real_suggest)
        self.assertNotEqual(second.stem('made-up'), real_stem)
    finally:
        shutil.rmtree(cache_dir)  # Nuke temp content
class SpellChecker:
    """Replace misspelled tokens with the first Hunspell suggestion.

    On construction the ``.aff``/``.dic`` files for ``language`` are fetched
    into ``hunspell_data_dir`` if they are not already there. Every decision
    (token -> replacement) is remembered in ``self.substitutes`` so a token is
    only looked up once per instance.
    """

    # Raw-file URL template: (repository directory, language, extension).
    _DICT_URL = ('https://raw.githubusercontent.com/LibreOffice/dictionaries/'
                 'master/%s/%s.%s')

    def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
        """Fetch the dictionary if needed and set up the Hunspell backend.

        Args:
            language: Dictionary name, e.g. ``'en_US'``.
            hunspell_data_dir: Directory holding the dictionary files; the
                Hunspell disk cache goes into its ``cache`` subdirectory.
            n_jobs: Value passed to ``Hunspell.set_concurrency``.
        """
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(
            language,
            hunspell_data_dir=hunspell_data_dir,
            disk_cache_dir=os.path.join(hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        self.substitutes = dict()

    def spell_check(self, tokenized_corpus_2d):
        """Return the corpus with every token mapped to its substitute.

        Tokens accepted by ``hunspell.spell`` map to themselves. Other tokens
        map to their first suggestion, or to themselves when there is none.

        Args:
            tokenized_corpus_2d: Iterable of iterables of tokens. One-shot
                iterators (generators) are accepted.

        Returns:
            A list of lists of tokens with the same shape as the input.
        """
        # The corpus is walked twice (collect tokens, then rewrite), so it is
        # materialised first; otherwise a generator input would be exhausted
        # by the first pass and the result would be empty.
        corpus = [list(tokens) for tokens in tokenized_corpus_2d]

        seen = {token for tokens in corpus for token in tokens}
        unseen = seen.difference(self.substitutes)
        correct = {token for token in unseen if self.hunspell.spell(token)}
        self.substitutes.update((token, token) for token in correct)

        suggestions = self.hunspell.bulk_suggest(unseen - correct)
        self.substitutes.update(
            (token, candidates[0] if candidates else token)
            for token, candidates in suggestions.items())

        return [[self.substitutes[token] for token in tokens]
                for tokens in corpus]

    @staticmethod
    def get_dict(language, data_dir):
        """Download the ``.aff`` and ``.dic`` files for ``language`` if missing.

        The file is first requested from the ``<language>/`` directory of the
        LibreOffice dictionaries repository. On a 404 it is requested again
        from the directory named by the part of ``language`` before the first
        underscore (``en_US`` -> ``en/``).

        Args:
            language: Dictionary name, e.g. ``'en_US'``.
            data_dir: Target directory; created if it does not exist.

        Raises:
            Whatever ``raise_for_status()`` raises for a failed download
            (presumably ``requests.HTTPError`` -- ``get`` is imported
            elsewhere in the file; confirm against the import).
        """
        os.makedirs(data_dir, exist_ok=True)
        # partition() returns the whole string when there is no underscore;
        # slicing with find() == -1 would drop the last character instead.
        prefix = language.partition('_')[0]
        for ext in ('aff', 'dic'):
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if os.path.exists(path):
                continue
            response = get(SpellChecker._DICT_URL % (language, language, ext))
            # Retrying is pointless when the fallback directory is the same.
            if response.status_code == 404 and prefix != language:
                response = get(SpellChecker._DICT_URL % (prefix, language, ext))
            response.raise_for_status()
            with open(path, 'wb') as dict_file:
                dict_file.write(response.content)

    def __del__(self):
        # __init__ may have raised before self.hunspell was assigned (e.g. a
        # failed download); the finalizer still runs on that partial object.
        hunspell = getattr(self, 'hunspell', None)
        if hunspell is not None:
            hunspell.save_cache()  # For future program executions.
def test_clear_caches_persistance(hunspell):
    """Check that caches cleared on one instance are empty for the next one.

    A first ``Hunspell`` instance gets real results planted under the key
    ``'made-up'`` in its suggest, suffix and stem caches; the caches are then
    saved, cleared and deregistered. A second instance built on the same disk
    cache directory must start with all three caches empty and must not
    return the planted results.

    Args:
        hunspell: Test fixture; not referenced in the body.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # Both instances are built with exactly the same arguments.
        options = dict(hunspell_data_dir=DICT_DIR,
                       disk_cache_dir=temp_dir,
                       cache_manager='disk_hun')

        h1 = Hunspell('test', **options)
        test_suggest = h1.suggest('testing')
        test_suffix = h1.suffix_suggest('testing')
        test_stem = h1.stem('testing')

        # Plant the real results under a different key and read them back
        # through the public API.
        h1._suggest_cache['made-up'] = test_suggest
        assert h1.suggest('made-up') == test_suggest
        h1._suffix_cache['made-up'] = test_suffix
        assert h1.suffix_suggest('made-up') == test_suffix
        h1._stem_cache['made-up'] = test_stem
        assert h1.stem('made-up') == test_stem

        h1.save_cache()
        h1.clear_cache()
        del h1

        # Drop every registered cache so the next instance cannot reuse one.
        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        assert len(cacheman.cache_by_name) == 0

        h2 = Hunspell('test', **options)
        assert len(h2._suggest_cache) == 0
        # The suffix cache is planted above too, so its emptiness is checked
        # alongside the other two caches.
        assert len(h2._suffix_cache) == 0
        assert len(h2._stem_cache) == 0
        assert h2.suggest('made-up') != test_suggest
        assert h2.suffix_suggest('made-up') != test_suffix
        assert h2.stem('made-up') != test_stem
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content