def spell_corrector(df, lang1, lang2):
    # Create an object of the Hunspell class
    h = Hunspell()
    print('I am spell_checker')
    # An empty dict to hold the corrected sentences, later converted into a dataframe
    corr_sent_list = {'L1': [], 'L2': []}
    # For each sentence in the dataframe
    for sent in df['L1']:
        # Empty string to which the corrected words are appended
        corr_sent = ''
        # For every token in the sentence, split on word boundaries
        for w in re.split(r'\b', sent):
            # If the token is not a word (punctuation, spaces) or is spelled correctly, append it as-is
            if not w.isalpha() or h.spell(w):
                corr_sent += w
            # If the token is a word and is misspelled
            else:
                # Suggest possible correct candidates for the misspelled word
                suggest = h.suggest(w)
                # If more than one word is suggested, more processing is required to select a word
                if len(suggest) > 1:
                    # TODO: parse the list and use n-gram probabilities to find the best candidate.
                    # For now it just appends the first suggestion.
                    corr_sent += suggest[0]
                # If exactly one word is suggested, append it to corr_sent
                elif len(suggest) == 1:
                    corr_sent += suggest[0]
                # If there are no suggestions, keep the original word
                else:
                    corr_sent += w
        # When all the words in the sentence have been traversed, append the corrected sentence
        corr_sent_list['L1'].append(corr_sent)
    # Convert the corrected sentences into a pandas dataframe to return
    if lang2 is not None:
        corr_sent_list['L2'].extend(list(df['L2']))
        return pd.DataFrame.from_dict(corr_sent_list)
    else:
        return pd.DataFrame(corr_sent_list['L1'], columns=['L1'])
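# A minimal usage sketch for spell_corrector above, assuming `pandas as pd`,
# `re`, and cyhunspell's `Hunspell` are imported at module level and a default
# en_US dictionary is available; the sample sentence is illustrative only.
sample_df = pd.DataFrame({'L1': ['This sentnce has a typo.']})
corrected_df = spell_corrector(sample_df, 'en', None)
print(corrected_df['L1'].tolist())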
class HunspellChecker(object):
    def __init__(self):
        self.checker = Hunspell()
        self.stopwords = set(SW.words("english")) | set(string.punctuation)

    def correct_word(self, word):
        """Borrowed from:
        https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/
        """
        ok = self.checker.spell(word)  # check spelling
        if not ok:
            suggestions = self.checker.suggest(word)
            if len(suggestions) > 0:  # there are suggestions
                return suggestions[0]
            else:
                return word
        else:
            return word

    def correct_string(self, text, ensure_length=False):
        """Break into words and correct each word."""
        tokens = text.split()
        corrected = []
        for token in tokens:
            if token in self.stopwords:
                corrected.append(token)
            else:
                correction = self.correct_word(token)
                if ensure_length:
                    corrected.append(correction.split()[0])
                else:
                    corrected.append(correction)
        return " ".join(corrected)
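# A minimal usage sketch for HunspellChecker above, assuming `SW` is
# nltk.corpus.stopwords (with the stopwords corpus downloaded), the `string`
# module is imported, and a default en_US dictionary is available; the input
# text is illustrative only.
checker = HunspellChecker()
print(checker.correct_string("this sentnce has a typo"))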
def make_checker():
    '''
    Creates a checker depending on the system running.
    :return: Hunspell object h
    '''
    if platform.system() == 'Windows':
        h = Hunspell('de_DE_frami',
                     hunspell_data_dir="C:\\Users\\Lena_Langholf\\Dropbox\\Spell_Checking\\dictionaries")
    else:
        h = Hunspell('de_DE_frami',
                     hunspell_data_dir="/home/lena/Desktop/million_post_corpus/dictionaries")
    return h
def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
    SpellChecker.get_dict(language, hunspell_data_dir)
    self.hunspell = Hunspell(language,
                             hunspell_data_dir=hunspell_data_dir,
                             disk_cache_dir=os.path.join(hunspell_data_dir, 'cache'))
    self.hunspell.set_concurrency(n_jobs)
    self.substitutes = dict()
def __init__(self, dialect, script):
    self.dialect = dialect
    self.script = script
    self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"}
    if self.dialect == "Sorani" and self.script == "Arabic":
        self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
    else:
        if not (self.dialect == "Kurmanji" and self.script == "Latin"):
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. "
                            "Stay tuned for other dialects and scripts!")
def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
    """Run the linter and return a generator of errors."""
    with open(self.filename, 'r') as file:
        comments = get_comments(file.read())
    # for comment in comments
    z = list(comments)
    spell = Hunspell()
    x = spell.spell(z[1][2][0])
    print(x)
    yield (0, 0, f'KOL001 Bad language found: ', TypoChecker)
def new_hunspell_nl() -> Hunspell:
    dictionary_path = __resolve_path("../dict/")
    hnspl = Hunspell("nl-nl", hunspell_data_dir=str(dictionary_path))
    # Add words that are not present in the current dictionary
    for word_list in [get_plural_nouns(), get_basic_words()]:
        for word in word_list:
            if not hnspl.spell(word):
                hnspl.add(word)
    return hnspl
def test_non_overlapping_caches(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(h2.suggest('made-up'), test_suggest)
    self.assertNotEqual(h2.stem('made-up'), test_stem)
def test_bad_path_encoding(self, *mocks):
    if PY3:
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
    else:
        # Python 2 just makes an illegal string instead of raising
        with captured_c_stderr_file() as caperr:
            Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
        with open(caperr, 'r') as err:
            self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')
def __init__(
        self,
        lang='en',
        max_dist=2,
        cpu=os.cpu_count(),
        # cache_manager="hunspell", disk_cache_dir=None,
        # hunspell_data_dir=None, system_encoding=None
        spell_kwargs={}):
    self.lang = self.langs.get(lang, lang)
    self.spell_dict = Hunspell(self.lang, **spell_kwargs)
    self.max_dist = max_dist
    self.spell_dict.set_concurrency(cpu)
def test_overlapping_caches(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    del self.h
    self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.assertEqual(self.h.stem('made-up'), test_stem)
def test_hunspell_bulk_stem(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertDictEqual(d.bulk_action("stem", ['dog', 'permanently']), {
        'permanently': ['permanent'],
        'dog': ['dog']
    })
    self.assertDictEqual(d.bulk_action("stem", ['dog', 'twigs', 'permanently', 'unrecorded']), {
        'unrecorded': ['recorded'],
        'permanently': ['permanent'],
        'twigs': ['twig'],
        'dog': ['dog']
    })
    del d
class HunspellChecker(object):
    def __init__(self):
        self.checker = Hunspell()

    def correct(self, word):
        if self.checker.spell(word):
            return word
        else:
            res = self.checker.suggest(word)
            if res:
                return res[0]
            else:
                return word
def __init__(self):
    """Constructor."""
    super().__init__()

    self.__treebank_tokenizer = TreebankWordTokenizer()

    hunspell_dict_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'hindi-hunspell',
        'dict-hi_IN',
    )
    if not os.path.isdir(hunspell_dict_dir):
        raise McLanguageException(
            "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir)
    if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
        raise McLanguageException(
            "Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
    if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
        raise McLanguageException(
            "Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

    try:
        self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
    except Exception as ex:
        raise McLanguageException(
            "Unable to initialize Hunspell with data directory '%s': %s" % (
                hunspell_dict_dir,
                str(ex),
            ))

    # Quick self-test to make sure that Hunspell is installed and dictionary is available
    hunspell_exc_message = """
        Hunspell self-test failed; make sure that Hunspell is installed and
        dictionaries are accessible, e.g. you might need to fetch Git submodules by running:

            git submodule update --init --recursive
    """
    try:
        test_stems = self.stem_words(['गुरुओं'])
    except Exception as _:
        raise McLanguageException(hunspell_exc_message)
    else:
        if len(test_stems) == 0 or test_stems[0] != 'गुरु':
            raise McLanguageException(hunspell_exc_message)
def test_clear_caches_non_peristance(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    self.h.clear_cache()

    del self.h
    self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
    self.assertNotEqual(self.h.stem('made-up'), test_stem)
def test_non_overlapping_caches(hunspell):
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')

    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem

    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert h2.suggest('made-up') != test_suggest
    assert h2.stem('made-up') != test_stem
class HunSpellCheckerClass:
    """
    Check the spelling of a word.
    """

    def __init__(self):
        """
        Set up for checking the spelling of a word.
        """
        debug('Initializing Hunspell')
        self.word_check = Hunspell()
        # config_list = self.word_check.ConfigKeys()
        # # print(config_list:'encoding')
        # for config_item in config_list:
        #     print('\n', config_item, config_list[config_item])

    def check_word(self, test_word: str) -> bool:
        """
        Check a word to see if it is spelled correctly.

        Note: It appears that a lot of abbreviations are in the aspell dictionary,
        such as 'ac' and 'cf'. I will just have to manually weed them out with the
        ole Mark One eyeball. :)

        :param test_word: word to check
        :return: True if spelled ok or False if not a valid word
        """
        debug(f'check_word received {test_word}')
        result = self.word_check.spell(test_word.lower())
        debug(f'check_word result {result}')
        return result
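# A minimal usage sketch for HunSpellCheckerClass above, assuming the `debug`
# logging helper is defined elsewhere in the module and a default en_US
# dictionary is available; the sample words are illustrative only.
checker = HunSpellCheckerClass()
print(checker.check_word('Dog'))   # expected True for a correctly spelled word
print(checker.check_word('dpg'))   # expected False for a misspelling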
def loadResources(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    # Language model built by KenLM: https://github.com/kpu/kenlm
    lm = kenlm.Model(args.model)
    # Load spaCy
    nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    det = {"", "the", "a", "an"}
    # List of common prepositions
    prep = {"", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"}
    # Save the above in a dictionary:
    res_dict = {
        "lm": lm,
        "nlp": nlp,
        "gb": gb,
        "gb_infl": gb_infl,
        "det": det,
        "prep": prep
    }
    return res_dict
def create_fallback_options(singular: str, speller: Hunspell, options: [str],
                            stems: [str]) -> AdvancedSingularizationResult:
    options.append(singular)
    if speller.spell(singular):
        return AdvancedSingularizationResult(options, singular, stems, True, True)
    else:
        return AdvancedSingularizationResult(options, None, stems, False, True)
def search_by_dictionary_plus_s(speller: Hunspell, singular: str) -> Union[None, SearchResult]:
    plural = singular + 's'
    if speller.spell(plural):
        return SearchResult(plural, None, "s")
    return None
def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
    with captured_c_stderr_file() as caperr:
        with patch("os.name", 'nt'):
            # If python file existence checks used the prefix, this would raise a HunspellFilePathError
            Hunspell('test', system_encoding='UTF-8')
    with open(caperr, 'r') as err:
        # But the Hunspell library lookup had the prefix applied
        self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')
def search_by_dictionary(speller: Hunspell, plural: str) -> Union[None, SearchResult]:
    for e in __ending_pairs:
        for key in e.keys():
            if plural.endswith(key):
                # Swap the matched ending for its paired ending and check the result
                suggestion = plural[:-len(key)] + e[key]
                if speller.spell(suggestion):
                    return SearchResult(suggestion, key, e[key])
    return None
def test_clear_caches_non_peristance(hunspell):
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')

    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem

    hunspell.clear_cache()

    del hunspell
    hunspell = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert hunspell.suggest('made-up') != test_suggest
    assert hunspell.suffix_suggest('made-up') != test_suffix
    assert hunspell.stem('made-up') != test_stem
def test_hunspell_bulk_suggest(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertDictEqual(d.bulk_action("suggest", ['dog', 'dpg']), {
        'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
        'dog': ['dog']
    })
    self.assertDictEqual(d.bulk_action("suggest", ['dog', 'dpg', 'pgg', 'opg', 'dyg', 'frg', 'twg', 'bjn', 'foo', 'qre']), {
        'pgg': ['pg', 'peg', 'egg', 'pig', 'pug', 'pkg', 'pg g', 'PG'],
        'foo': ['few', 'goo', 'fop', 'foot', 'fool', 'food', 'foe', 'for', 'fro', 'too', 'fol', 'coo', 'fog', 'moo', 'fob'],
        'frg': ['fr', 'frig', 'frog', 'erg', 'fig', 'f*g', 'fro', 'fog', 'fry', 'fr g'],
        'twg': ['twig', 'tag', 'two', 'tog', 'tug', 'twp'],
        'bjn': ['bin', 'ban', 'bun', 'Bjorn'],
        'dog': ['dog'],
        'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
        'opg': ['op', 'pg', 'ope', 'ops', 'opt', 'mpg', 'opp', 'o pg', 'op g', 'GPO'],
        'dyg': ['dug', 'dye', 'deg', 'dig', 'dog', 'dying'],
        'qre': ['qr', 're', 'ere', 'ire', 'are', 'ore', 'Ore', 'Dre', 'q re', 'qr e']
    })
    del d
def __stem(speller: Hunspell, plural: str) -> [str]:
    stems = list()
    for stem in speller.stem(plural):
        # Normalize the Dutch "ĳ" ligature to the plain "ij" digraph
        stem = stem.replace("ĳ", "ij")
        if len(plural) - len(stem) <= 3:
            ps = pluralize(stem)
            if ps == plural:
                stems.append(stem)
    return stems
def __init__(self, threshold=0.96):
    basename = os.path.dirname(os.path.realpath(__file__))
    self.lm = LanguageModel()
    # Load spaCy
    self.nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    self.gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    self.gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    self.determiners = {"", "the", "a", "an"}
    # List of common prepositions
    self.prepositions = {"", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"}
    self.threshold = threshold
def leetScan(string, valDict, language="EN"):
    leetcandidates = []
    count = 0
    h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
    tokens = nltk.word_tokenize(string)
    # Calculate total words in string
    total_words = len(tokens)
    for token in tokens:
        # Check for misspelling
        if not h.spell(token):
            # See if the word contains leet
            if leetCheck(token):
                # Add to possible candidate list
                leetcandidates.append(token)
    # Test candidate list for word validity using swapping
    for candidate in leetcandidates:
        if swapValid(candidate, valDict, h):
            count = count + 1
    fraction = Fraction(count, total_words)
    return fraction
def hunspell(self) -> Hunspell:
    """ Returns the (cached) Hunspell instance """
    if not self._hunspell:
        self._hunspell = Hunspell(
            self.lang.get_hunspell_dict_name(),
            hunspell_data_dir=self.hunspell_data_dir,
        )
    return self._hunspell
def pluralize_advanced(
        singular: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedPluralizationResult:
    if not speller:
        speller = ensure_hunspell_nl()

    plural = __pluralize(singular, ending_overrides)

    # Empty plural - just stop
    if not plural:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    # Correctly spelled plural
    if speller.spell(plural):
        return AdvancedPluralizationResult(plural, plural, (), None, None, True)

    # If no correctly spelled word can be found directly, use suggestions,
    # replacement of the endings, and the Hunspell dictionary to see if
    # we can find something that is spelled correctly.
    suggestions = speller.suggest(plural)
    search_result: SearchResult = \
        search_by_suggestions(plural, suggestions) or \
        search_by_dictionary(speller, plural) or \
        search_by_dictionary_plus_s(speller, singular)

    if search_result:
        return AdvancedPluralizationResult(plural,
                                           search_result.plural,
                                           suggestions,
                                           search_result.switched_ending_from,
                                           search_result.switched_ending_to,
                                           True)

    return AdvancedPluralizationResult(plural, None, (), None, None, False)
class SpellChecker:
    def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(language,
                                 hunspell_data_dir=hunspell_data_dir,
                                 disk_cache_dir=os.path.join(hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        self.substitutes = dict()

    def spell_check(self, tokenized_corpus_2d):
        tokens = {t for iterable in tokenized_corpus_2d for t in iterable}
        new_tokens = tokens - self.substitutes.keys()
        correct_tokens = {t for t in new_tokens if self.hunspell.spell(t)}
        self.substitutes.update(map(lambda t: (t, t), correct_tokens))

        tokens_to_check = new_tokens - correct_tokens
        suggestions = self.hunspell.bulk_suggest(tokens_to_check)
        self.substitutes.update(
            map(lambda kv: (kv[0], kv[0]) if not kv[1] else (kv[0], kv[1][0]),
                suggestions.items()))

        new_corpus = [[self.substitutes[token] for token in iterable]
                      for iterable in tokenized_corpus_2d]
        return new_corpus

    @staticmethod
    def get_dict(language, data_dir):
        os.makedirs(data_dir, exist_ok=True)
        for ext in ['aff', 'dic']:
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if not os.path.exists(path):
                r = get('https://raw.githubusercontent.com/LibreOffice/dictionaries/master/%s/%s.%s'
                        % (language, language, ext))
                if r.status_code == 404:
                    l = language[0:language.find('_')]
                    r = get('https://raw.githubusercontent.com/LibreOffice/dictionaries/master/%s/%s.%s'
                            % (l, language, ext))
                r.raise_for_status()
                with open(path, 'wb') as f:
                    f.write(r.content)

    def __del__(self):
        self.hunspell.save_cache()  # For future program executions.
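# A minimal usage sketch for SpellChecker above, assuming `os`, `requests.get`
# (imported as `get`), and cyhunspell's `Hunspell` are importable and that
# network access is available for the dictionary download; the corpus below is
# illustrative only.
sc = SpellChecker(language='en_US', hunspell_data_dir='./hunspell', n_jobs=2)
corpus = [['thiss', 'is', 'a', 'tesst'], ['speling', 'matters']]
print(sc.spell_check(corpus))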
def test_hunspell_suggest(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertListEqual(d.suggest('dpg'),
                         ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'])
    del d
def setUp(self):
    self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
def test_hunspell_spell(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertFalse(d.spell('dpg'))
    self.assertTrue(d.spell('dog'))
    del d
class HunspellTest(unittest.TestCase):
    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })
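# Test-runner sketch for the suite above, assuming DICT_DIR points at a
# directory containing the 'test' .dic/.aff dictionary pair it relies on.
if __name__ == '__main__':
    unittest.main()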
def test_hunspell_stem(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertListEqual(d.stem('dog'), ['dog'])
    self.assertListEqual(d.stem('permanently'), ['permanent'])
    del d