def new_hunspell_nl() -> Hunspell:
    """Build an ``nl-nl`` Hunspell checker extended with extra word lists.

    The dictionary directory is ``"../dict/"`` as resolved by
    ``__resolve_path``. Every word from ``get_plural_nouns()`` and
    ``get_basic_words()`` that the checker does not already accept is added
    through ``Hunspell.add``.

    Returns:
        The configured ``Hunspell`` instance.
    """
    dictionary_path = __resolve_path("../dict/")
    hnspl = Hunspell("nl-nl", hunspell_data_dir=str(dictionary_path))

    # add words that are not present in current dictionary
    # (loop variable deliberately not called ``list`` so the builtin stays usable)
    for word_list in (get_plural_nouns(), get_basic_words()):
        for word in word_list:
            if not hnspl.spell(word):
                hnspl.add(word)
    return hnspl
quoting=csv.QUOTE_NONE, names=['Id', 'EssaySet', 'essay_score1', 'essay_score2', 'EssayText'], dtype={ 'Id': str, 'EssaySet': str, 'essay_score1': np.int32, 'essay_score2': np.int32, 'EssayText': str }) gold_df = pd.concat([gold_df_train, gold_df_test]) words = " ".join(gold_df.EssayText).split() unique_words = list(dict.fromkeys(words)) for w in unique_words: if w in ['\uff1f', '\u2018', '\u2019']: continue spell.add(w) # On adversarials for i in range(1, 11): print('---prompt ' + str(i) + '---') for r in range(14, 15): print('threshold:' + str(r)) print('---Adver---') adver_TPR = [] df = pd.read_csv( 'adversarial/shallow/adversarial_prompt_' + str(i) + '.txt', encoding='utf-8', sep='\t', header=0, quoting=csv.QUOTE_NONE, names=[
class HunspellTest(unittest.TestCase):
    """Exercise ``Hunspell`` against the ``'test'`` dictionary in ``DICT_DIR``."""

    # ------------------------------------------------------------------
    # Fixtures and shared helpers
    # ------------------------------------------------------------------

    def assertRegexpSearch(self, *args, **kwargs):
        """Forward to ``assertRegex`` when ``PY3`` is set, else ``assertRegexpMatches``."""
        regex_assert = self.assertRegex if PY3 else self.assertRegexpMatches
        regex_assert(*args, **kwargs)

    def setUp(self):
        """Create the checker stored on ``self.h`` before each test."""
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        """Delete ``self.h``; tolerate tests that already deleted it."""
        try:
            del self.h
        except AttributeError:
            # Deliberate: e.g. test_create_destroy removes the attribute itself.
            pass

    def assertAllIn(self, checked, expected):
        """Assert that every item of ``checked`` is contained in ``expected``."""
        everything_present = all(item in expected for item in checked)
        failure_text = u"{} not all found in {}".format(checked, expected)
        self.assertTrue(everything_present, failure_text)

    def _inject_made_up(self, speller):
        """Store the 'testing' results under 'made-up' in both caches of ``speller``.

        Asserts that looking up 'made-up' afterwards returns the injected
        values, then returns them as a ``(suggest, stem)`` pair.
        """
        cached_suggest = speller.suggest('testing')
        cached_stem = speller.stem('testing')

        speller._suggest_cache['made-up'] = cached_suggest
        self.assertEqual(speller.suggest('made-up'), cached_suggest)
        speller._stem_cache['made-up'] = cached_stem
        self.assertEqual(speller.stem('made-up'), cached_stem)
        return cached_suggest, cached_stem

    def _open_disk_cached(self, cache_dir):
        """Return a 'test' checker using the 'disk_hun' cache manager in ``cache_dir``."""
        return Hunspell('test',
                        hunspell_data_dir=DICT_DIR,
                        disk_cache_dir=cache_dir,
                        cache_manager='disk_hun')

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_create_destroy(self):
        # The instance built in setUp can be deleted explicitly.
        del self.h

    def test_missing_dict(self):
        # A dictionary name with no files in DICT_DIR is rejected.
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    @patch('os.path.isfile', return_value=True)
    @patch('os.access', return_value=True)
    def test_bad_path_encoding(self, *mocks):
        if PY3:
            with self.assertRaises(HunspellFilePathError):
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
            return

        # Python 2 just makes an illegal string instead of raising, so the
        # failure is looked for in the captured C-level stderr output.
        with captured_c_stderr_file() as caperr:
            Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
            # NOTE(review): assumes the capture file has to be read before the
            # capture context exits - confirm against captured_c_stderr_file.
            with open(caperr, 'r') as err:
                captured = err.read()
            self.assertRegexpSearch(captured, r'error:[^\n]*bad/[^\n]*/decoding')

    @patch('hunspell.hunspell.WIN32_LONG_PATH_PREFIX', '/not/valid')
    def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
        with captured_c_stderr_file() as caperr:
            with patch("os.name", 'nt'):
                # If the Python file existence checks used the prefix, this
                # would raise a HunspellFilePathError
                Hunspell('test', system_encoding='UTF-8')
            # NOTE(review): assumes the capture file has to be read before the
            # capture context exits - confirm against captured_c_stderr_file.
            with open(caperr, 'r') as err:
                captured = err.read()
            # But the Hunspell library lookup had the prefix applied
            self.assertRegexpSearch(captured, r'error:[^\n]*/not/valid[^\n]*')

    # ------------------------------------------------------------------
    # spell / suggest / stem / add
    # ------------------------------------------------------------------

    def test_spell(self):
        checker = self.h
        self.assertFalse(checker.spell('dpg'))
        self.assertTrue(checker.spell('dog'))

    def test_spell_utf8(self):
        checker = self.h
        self.assertTrue(checker.spell(u'café'))
        self.assertFalse(checker.spell(u'uncafé'))

    def test_spell_empty(self):
        # The empty string is reported as correctly spelled.
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        suggestions = self.h.suggest('dpg')
        must_contain = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggestions, tuple)
        self.assertAllIn(must_contain, suggestions)

    def test_suggest_utf8(self):
        must_contain = (u'café', u'Cerf')
        # Same misspelling, once as a plain literal and once as a u'' literal.
        for misspelled in ('cefé', u'cefé'):
            suggestions = self.h.suggest(misspelled)
            self.assertIsInstance(suggestions, tuple)
            self.assertAllIn(must_contain, suggestions)

    def test_suggest_empty(self):
        # No suggestions are offered for the empty string.
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        checker = self.h
        self.assertEqual(checker.stem('dog'), ('dog',))
        self.assertEqual(checker.stem('permanently'), ('permanent',))

    def test_add(self):
        new_word = 'outofvocabularyword'
        self.assertEqual(self.h.spell(new_word), False)
        self.h.add(new_word)
        self.assertEqual(self.h.spell(new_word), True)

        # Once added, the word is also offered for a near miss.
        near_miss = new_word + 'd'
        self.assertAllIn([new_word], self.h.suggest(near_miss))

    # ------------------------------------------------------------------
    # Bulk operations
    # ------------------------------------------------------------------

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)

        by_word = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(by_word.keys()), ['dog', 'dpg'])
        self.assertIsInstance(by_word['dog'], tuple)
        self.assertAllIn(('dog',), by_word['dog'])

        must_contain = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(by_word['dpg'], tuple)
        self.assertAllIn(must_contain, by_word['dpg'])

        # A larger batch: every requested word comes back as a key.
        batch = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        by_word = self.h.bulk_suggest(batch)
        self.assertEqual(sorted(by_word.keys()), batch)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)

        pair_stems = self.h.bulk_stem(['dog', 'permanently'])
        self.assertDictEqual(pair_stems, {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })

        quad_stems = self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded'])
        self.assertDictEqual(quad_stems, {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })

    # ------------------------------------------------------------------
    # Cache behaviour
    # ------------------------------------------------------------------

    def test_non_overlapping_caches(self):
        test_suggest, test_stem = self._inject_made_up(self.h)

        # A checker for a different dictionary must not see the injected entries.
        other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(other.suggest('made-up'), test_suggest)
        self.assertNotEqual(other.stem('made-up'), test_stem)

    def test_overlapping_caches(self):
        test_suggest, test_stem = self._inject_made_up(self.h)

        # A new checker for the same dictionary still returns the injected entries.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.assertEqual(self.h.stem('made-up'), test_stem)

    def test_save_caches_persistance(self):
        temp_dir = tempfile.mkdtemp()
        try:
            writer = self._open_disk_cached(temp_dir)
            test_suggest, test_stem = self._inject_made_up(writer)
            writer.save_cache()
            del writer

            # Drop every registered cache so the next instance cannot reuse one.
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            reader = self._open_disk_cached(temp_dir)
            self.assertNotEqual(len(reader._suggest_cache), 0)
            self.assertNotEqual(len(reader._stem_cache), 0)
            self.assertEqual(reader.suggest('made-up'), test_suggest)
            self.assertEqual(reader.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_persistance(self):
        temp_dir = tempfile.mkdtemp()
        try:
            writer = self._open_disk_cached(temp_dir)
            test_suggest, test_stem = self._inject_made_up(writer)
            writer.save_cache()
            writer.clear_cache()
            del writer

            # Drop every registered cache so the next instance cannot reuse one.
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            # After clear_cache() the new instance starts with empty caches.
            reader = self._open_disk_cached(temp_dir)
            self.assertEqual(len(reader._suggest_cache), 0)
            self.assertEqual(len(reader._stem_cache), 0)
            self.assertNotEqual(reader.suggest('made-up'), test_suggest)
            self.assertNotEqual(reader.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_non_peristance(self):
        test_suggest, test_stem = self._inject_made_up(self.h)

        # Clearing before re-creating the checker removes the injected entries.
        self.h.clear_cache()
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
        self.assertNotEqual(self.h.stem('made-up'), test_stem)