import string

from hunspell import Hunspell
from nltk.corpus import stopwords as SW


class HunspellChecker(object):
    def __init__(self):
        self.checker = Hunspell()
        self.stopwords = set(SW.words("english")) | set(string.punctuation)

    def correct_word(self, word):
        """Borrowed from:
        https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/
        """
        if self.checker.spell(word):  # already spelled correctly
            return word
        suggestions = self.checker.suggest(word)
        return suggestions[0] if suggestions else word

    def correct_string(self, text, ensure_length=False):
        """Break the text into words and correct each word."""
        corrected = []
        for token in text.split():
            if token in self.stopwords:
                corrected.append(token)
                continue
            correction = self.correct_word(token)
            # Some suggestions are multi-word; with ensure_length, keep only the
            # first word so the output token count matches the input.
            corrected.append(correction.split()[0] if ensure_length else correction)
        return " ".join(corrected)
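# Hypothetical usage sketch for the class above (assumes the default en_US
# dictionary and the NLTK stopwords corpus are installed):
checker = HunspellChecker()
print(checker.correct_string("thiss is a tst sentense"))
# ensure_length=True keeps one output token per input token even when a
# suggestion is multi-word (e.g. "d pg" for "dpg"):
print(checker.correct_string("dpg", ensure_length=True))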
from logging import debug  # assumption: the module-level `debug` is logging.debug

from hunspell import Hunspell


class HunSpellCheckerClass:
    """Check the spelling of a word."""

    def __init__(self):
        """Set up for checking the spelling of a word."""
        debug('Initializing Hunspell')
        self.word_check = Hunspell()
        # config_list = self.word_check.ConfigKeys()
        # for config_item in config_list:
        #     print('\n', config_item, config_list[config_item])

    def check_word(self, test_word: str) -> bool:
        """Check a word to see if it is spelled correctly.

        Note: It appears that a lot of abbreviations, such as 'ac' and 'cf',
        are in the dictionary. I will just have to weed them out manually
        with the ole Mark One eyeball. :)

        :param test_word: word to check
        :return: True if spelled correctly, False otherwise
        """
        debug(f'check_word received {test_word}')
        result = self.word_check.spell(test_word.lower())
        debug(f'check_word result {result}')
        return result
import re

import pandas as pd
from hunspell import Hunspell


def spell_corrector(df, lang1, lang2):
    # Create a Hunspell object
    h = Hunspell()
    print('I am spell_checker')
    # Dict of corrected sentences, later turned into a dataframe
    corr_sent_list = {'L1': [], 'L2': []}
    for sent in df['L1']:
        # String to which the corrected words are appended
        corr_sent = ''
        # Split the sentence on word boundaries
        for w in re.split(r'\b', sent):
            # Keep non-words (punctuation, spaces) and correctly spelled words as-is
            if not w.isalpha() or h.spell(w):
                corr_sent += w
            else:
                # Suggest correct candidates for the misspelled word.
                # TODO: when several candidates are suggested, rank them by
                # n-gram probability. For now the first suggestion is used;
                # if there are no suggestions, keep the original word.
                suggest = h.suggest(w)
                corr_sent += suggest[0] if suggest else w
        # Once all words in the sentence are traversed, store the corrected sentence
        corr_sent_list['L1'].append(corr_sent)
    # Convert the corrected sentences into a dataframe to return
    if lang2 is not None:
        corr_sent_list['L2'].extend(list(df['L2']))
        return pd.DataFrame.from_dict(corr_sent_list)
    return pd.DataFrame(corr_sent_list['L1'], columns=['L1'])
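# Hypothetical usage sketch: correct the L1 column of a tiny parallel
# dataframe (the language codes are illustrative placeholders):
df = pd.DataFrame({'L1': ["Ths is a tst"], 'L2': ["Dies ist ein Test"]})
print(spell_corrector(df, 'en', 'de'))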
from typing import List

from hunspell import Hunspell


def create_fallback_options(singular: str, speller: Hunspell, options: List[str],
                            stems: List[str]) -> AdvancedSingularizationResult:
    options.append(singular)
    if speller.spell(singular):
        return AdvancedSingularizationResult(options, singular, stems, True, True)
    return AdvancedSingularizationResult(options, None, stems, False, True)
def search_by_dictionary_plus_s(speller: Hunspell, singular: str) -> Optional[SearchResult]:
    plural = singular + 's'
    if speller.spell(plural):
        return SearchResult(plural, None, "s")
    return None
def search_by_dictionary(speller: Hunspell, plural: str) -> Optional[SearchResult]:
    for e in __ending_pairs:
        for key in e.keys():
            if plural.endswith(key):
                # Swap the matched ending for its replacement and check the result
                suggestion = plural[:-len(key)] + e[key]
                if speller.spell(suggestion):
                    return SearchResult(suggestion, key, e[key])
    return None
def new_hunspell_nl() -> Hunspell:
    dictionary_path = __resolve_path("../dict/")
    hnspl = Hunspell("nl-nl", hunspell_data_dir=str(dictionary_path))
    # Add words that are not present in the current dictionary
    for word_list in [get_plural_nouns(), get_basic_words()]:
        for word in word_list:
            if not hnspl.spell(word):
                hnspl.add(word)
    return hnspl
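# Hypothetical usage sketch: build the Dutch speller defined above and check
# a word (assumes the nl-nl .aff/.dic files are found via __resolve_path):
h = new_hunspell_nl()
print(h.spell("fietsen"))  # True once the word is in the dictionary or was added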
from typing import Generator, Tuple

from hunspell import Hunspell

# get_comments and TypoChecker are defined elsewhere in the original module.


def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
    """Run the linter and return a generator of errors."""
    with open(self.filename, 'r') as file:
        comments = get_comments(file.read())
    z = list(comments)
    spell = Hunspell()
    # Spot-check a single comment token; the commented-out loop over all
    # comments was left unimplemented in the original.
    x = spell.spell(z[1][2][0])
    print(x)
    yield (0, 0, 'KOL001 Bad language found: ', TypoChecker)
from hunspell import Hunspell


class HunspellChecker(object):
    def __init__(self):
        self.checker = Hunspell()

    def correct(self, word):
        # Return the word unchanged if it is spelled correctly,
        # otherwise the top suggestion (or the word itself if there is none)
        if self.checker.spell(word):
            return word
        res = self.checker.suggest(word)
        return res[0] if res else word
import os

from hunspell import Hunspell
from requests import get  # assumed source of get(); the snippet downloads over HTTP


class SpellChecker:
    def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(language, hunspell_data_dir=hunspell_data_dir,
                                 disk_cache_dir=os.path.join(hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        self.substitutes = dict()

    def spell_check(self, tokenized_corpus_2d):
        tokens = {t for iterable in tokenized_corpus_2d for t in iterable}
        # Only look at tokens we have not resolved before
        new_tokens = tokens - self.substitutes.keys()
        correct_tokens = {t for t in new_tokens if self.hunspell.spell(t)}
        self.substitutes.update(map(lambda t: (t, t), correct_tokens))
        # Bulk-suggest replacements for the misspelled remainder; tokens with
        # no suggestions map to themselves
        tokens_to_check = new_tokens - correct_tokens
        suggestions = self.hunspell.bulk_suggest(tokens_to_check)
        self.substitutes.update(
            map(lambda kv: (kv[0], kv[0]) if not kv[1] else (kv[0], kv[1][0]),
                suggestions.items()))
        new_corpus = [[self.substitutes[token] for token in iterable]
                      for iterable in tokenized_corpus_2d]
        return new_corpus

    @staticmethod
    def get_dict(language, data_dir):
        os.makedirs(data_dir, exist_ok=True)
        for ext in ['aff', 'dic']:
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if os.path.exists(path):
                continue
            r = get('https://raw.githubusercontent.com/LibreOffice/dictionaries/master/%s/%s.%s'
                    % (language, language, ext))
            if r.status_code == 404:
                # Some dictionaries live under the bare language code, e.g. 'en'
                lang = language[0:language.find('_')]
                r = get('https://raw.githubusercontent.com/LibreOffice/dictionaries/master/%s/%s.%s'
                        % (lang, language, ext))
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)

    def __del__(self):
        self.hunspell.save_cache()  # For future program executions.
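# Hypothetical usage sketch: download the en_US dictionary into ./hunspell on
# first use, then correct a small tokenized corpus:
sc = SpellChecker(language='en_US', hunspell_data_dir='./hunspell', n_jobs=2)
corpus = [['thiss', 'is', 'a', 'tst'], ['helo', 'world']]
print(sc.spell_check(corpus))  # e.g. [['this', 'is', 'a', 'test'], ['hello', 'world']]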
from fractions import Fraction

import nltk
from hunspell import Hunspell

# leetCheck and swapValid are defined elsewhere in the original module.


def leetScan(string, valDict, language="EN"):
    leetcandidates = []
    count = 0
    h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
    tokens = nltk.word_tokenize(string)
    # Total number of words in the string
    total_words = len(tokens)
    for token in tokens:
        # Misspelled tokens that contain leet are correction candidates
        if not h.spell(token) and leetCheck(token):
            leetcandidates.append(token)
    # Test each candidate for word validity by swapping leet characters
    for candidate in leetcandidates:
        if swapValid(candidate, valDict, h):
            count += 1
    return Fraction(count, total_words)
def pluralize_advanced(
        singular: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedPluralizationResult:
    if not speller:
        speller = ensure_hunspell_nl()
    plural = __pluralize(singular, ending_overrides)

    # Empty plural - just stop
    if not plural:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    # Correctly spelled plural
    if speller.spell(plural):
        return AdvancedPluralizationResult(plural, plural, (), None, None, True)

    # No correctly spelled plural was found directly, so fall back to the
    # suggestions, ending replacement, and the Hunspell dictionary, in case
    # something correctly spelled can still be found.
    suggestions = speller.suggest(plural)
    search_result: SearchResult = \
        search_by_suggestions(plural, suggestions) or \
        search_by_dictionary(speller, plural) or \
        search_by_dictionary_plus_s(speller, singular)

    if search_result:
        return AdvancedPluralizationResult(plural, search_result.plural, suggestions,
                                           search_result.switched_ending_from,
                                           search_result.switched_ending_to, True)
    return AdvancedPluralizationResult(plural, None, (), None, None, False)
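# Hypothetical usage sketch (assumes the Dutch dictionary used by
# ensure_hunspell_nl is available):
result = pluralize_advanced("fiets")
print(result)  # an AdvancedPluralizationResult carrying the checked plural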
def singularize_advanced(
        plural: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedSingularizationResult:
    if not could_be_plural(plural, ending_overrides):
        return AdvancedSingularizationResult(None, None, (), False, False)

    options = __process_methods(
        plural,
        lambda l: singularize_by_hard_map(l, ending_overrides),  # should always be first!
        singularize_oren,
        singularize_eren,
        singularize_by_latin,
        singularize_with_s,
        singularize_with_trema_en,
        singularize_with_en_single_vowel,
        singularize_with_en_double_vowel,
        singularize_with_en_double_consonant)

    if not speller:
        speller = ensure_hunspell_nl()
    stems = __stem(speller, plural)

    # Return the first option that is spelled correctly
    for option in options:
        if speller.spell(option):
            return AdvancedSingularizationResult(options, option, (), True, True)

    if stems:
        return AdvancedSingularizationResult(options, stems[0], stems, True, True)

    for ending in ["'s", "s"]:
        if plural.endswith(ending):
            singular = plural[0:0 - len(ending)]
            return create_fallback_options(singular, speller, options, stems)

    if plural.endswith("en"):
        singular = plural[0:-2]
        # Undo the Dutch f/v and s/z alternation in the stem
        if singular.endswith("v"):
            singular = singular[0:-1] + "f"
        elif singular.endswith("z"):
            singular = singular[0:-1] + "s"
        return create_fallback_options(singular, speller, options, stems)

    return AdvancedSingularizationResult(options, None, stems, False, True)
import unittest

from hunspell import Hunspell

# DICT_DIR points at the test dictionaries and is defined elsewhere in the
# original test module.


class HunspellTest(unittest.TestCase):
    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(
            self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
                'unrecorded': ('recorded',),
                'permanently': ('permanent',),
                'twigs': ('twig',),
                'dog': ('dog',)
            })
class SpellChecker: """ Class for managing spell checking using Hunspell. Implemented as a class, as multiple instances of a SpellChecker might be used to maintain different dictionaries simultaneously (for example adding custom words). """ def __init__(self, allowed_punctuation_marks, dictionary_directory): """ Constructor method. Declares and creates a new Hunspell object. """ self.allowed_punctuation_marks = allowed_punctuation_marks self.dictionary_directory = dictionary_directory self.hunspell = None self.refresh_dict() def refresh_dict(self): """ Create a new Hunspell object from the specified dictionary file. """ self.hunspell = Hunspell('index', hunspell_data_dir=self.dictionary_directory) def is_punctuation_mark(self, word): """ Checks if the given word corresponds to one of the allowed punctuation marks. :param word: a string with a single word :type: string :return: boolean indicating if the given word is an allowed punctuation mark :type: boolean """ return bool(re.match(r'[%s]' % self.allowed_punctuation_marks, word)) def is_correctly_spelled(self, word): """ Checks if the given word is correctly spelled. :param word: a string with a single word :type: string :return: boolean indicating if the spelling of the word is correct :type: boolean """ return self.hunspell.spell(word) def suggest(self, word): """ Suggest similar and correctly spelled alternatives for the given string. Orders Hunspell suggestions by edit distance. :param word: a string with a single word :type: string :return: a list of suggestions :type: list<string> """ suggestions = self.hunspell.suggest(word) return sorted(suggestions, key=lambda suggestion: edit_distance(word, suggestion)) def fix(self, word): """ Fixes the spelling of the given word. :param word: a string with a single word :type: string :return: the same word if correctly spelled or a punctuation mark, otherwise the top Hunspell suggestion. """ return word if self.is_punctuation_mark( word) or self.is_correctly_spelled(word) else self.suggest(word)[0] def fix_text(self, text): """ Fixes the spelling of a multi-worded phrase. :param text: the phrase string :type: string :return: the same phrase, with the spelling of each word fixed. """ fixed_text = ' '.join([self.fix(word) for word in word_tokenize(text)]) return re.sub(r' ([%s])' % self.allowed_punctuation_marks, r'\1', fixed_text) # remove spaces preceding punctuation
import itertools

from hunspell import Hunspell

# MAX_BOARD_SIZE and verbose are module-level settings defined elsewhere
# in the original module.


class WordScapeSolver:
    def __init__(self):
        self.h = Hunspell("en_US", "en_US")

    def solve_wordscape_helper(self, valid_words, letters, length):
        # Collect every permutation of the letters that spells a real word
        p = set(itertools.permutations(letters, length))
        for raw_string in p:
            word = "".join(raw_string)
            if self.h.spell(word):
                valid_words.append(word)

    def try_fit(self, board, i, j, horizontal, word, length, valid_words):
        if horizontal:
            for k in range(length):
                if board.content[i][j + k].c == ".":
                    board.content[i][j + k].save(True)
                    board.content[i][j + k].c = word[k]
                elif board.content[i][j + k].c == word[k]:
                    board.content[i][j + k].save(False)
                else:
                    # Conflict: undo the cells written so far
                    for l in range(k):
                        board.content[i][j + l].restore()
                    return False
            new_word_list = valid_words.copy()
            new_word_list.remove(word)
            if verbose:
                print(board)
            success = self.try_solve(board, i, j, new_word_list)
            if not success:
                for l in range(length):
                    board.content[i][j + l].restore()
                return False
            return True
        else:
            for k in range(length):
                if board.content[i + k][j].c == ".":
                    board.content[i + k][j].save(True)
                    board.content[i + k][j].c = word[k]
                elif board.content[i + k][j].c == word[k]:
                    board.content[i + k][j].save(False)
                else:
                    for l in range(k):
                        board.content[i + l][j].restore()
                    return False
            new_word_list = valid_words.copy()
            new_word_list.remove(word)
            if verbose:
                print(board)
            success = self.try_solve(board, i, j, new_word_list)
            if not success:
                for l in range(length):
                    board.content[i + l][j].restore()
                return False
            return True

    def find_unfilled_word(self, board, i, j, horizontal):
        empty_spot = False
        if horizontal:
            length = 0
            for k in range(j, MAX_BOARD_SIZE):
                if not empty_spot and board.content[i][k].c == ".":
                    empty_spot = True
                if board.content[i][k].c == "":
                    exist = empty_spot and length > 1
                    return exist, length
                length += 1
        else:
            length = 0
            for k in range(i, MAX_BOARD_SIZE):
                if not empty_spot and board.content[k][j].c == ".":
                    empty_spot = True
                if board.content[k][j].c == "":
                    exist = empty_spot and length > 1
                    return exist, length
                length += 1
        return False, 0

    def find_next_word(self, board, i, j):
        for p in range(MAX_BOARD_SIZE):
            if p < i:
                continue
            for q in range(MAX_BOARD_SIZE):
                if p == i and q < j:
                    continue
                exist, length = self.find_unfilled_word(board, p, q, True)
                if exist:
                    return False, p, q, True, length
                exist, length = self.find_unfilled_word(board, p, q, False)
                if exist:
                    return False, p, q, False, length
        return True, 0, 0, True, 0

    def try_solve(self, board, i, j, valid_words):
        finished, i, j, horizontal, length = self.find_next_word(board, i, j)
        if finished:
            return True
        candidates = [word for word in valid_words if len(word) == length]
        while len(candidates) > 0:
            if self.try_fit(board, i, j, horizontal, candidates.pop(), length, valid_words):
                return True
        return False

    def solve_board(self, valid_words, board):
        return self.try_solve(board, 0, 0, valid_words)

    def solve(self, letters, length, board=None):
        letters = letters.lower()
        valid_words = []
        for i in range(length, len(letters) + 1):
            self.solve_wordscape_helper(valid_words, letters, i)
        for word in valid_words:
            print(word)
        if board is not None:
            if self.solve_board(valid_words, board):
                print(board)
            else:
                print("No Solutions!")
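# Hypothetical usage sketch: print every word of length >= 3 that can be
# formed from the letters 'a', 't', 'c' (assumes en_US dictionaries are
# discoverable by Hunspell; no board, so only the word search runs):
solver = WordScapeSolver()
solver.solve("atc", 3)  # prints e.g. "act" and "cat"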
# Note: the snippet opens mid-call; the enclosing read call (e.g. pd.read_csv)
# and its file path are truncated in the original. `spell` is a Hunspell
# instance created elsewhere.
    dtype={
        'Id': str,
        'EssaySet': str,
        'essay_score1': np.int32,
        'essay_score2': np.int32,
        'EssayText': str
    })

df['error rate'] = np.nan
for index, row in df.iterrows():
    # Missing essays are parsed as NaN (a float); count them as 100% error
    if type(row['EssayText']) is float:
        df.at[index, 'error rate'] = 100
        continue
    token_list = row['EssayText'].split()
    count_error = 0
    for t in token_list:
        if not spell.spell(t):
            count_error += 1
    if count_error != 0:
        df.at[index, 'error rate'] = count_error / len(token_list) * 100
    else:
        df.at[index, 'error rate'] = 0

# Slice the scored dataframe into its constituent datasets
random_char = df.iloc[:1000]
random_word = df.iloc[1000:2000]
brown_char_ngram = df.iloc[2000:7000]
brown_word_ngram = df.iloc[7000:12000]
asap_char_ngram = df.iloc[12000:17000]
asap_word_ngram = df.iloc[17000:22000]
content_burst = df.iloc[22000:23000]
shuffle = df.iloc[23000:24000]
gpt_2 = df.iloc[24000:25001]
def test_hunspell_spell(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertFalse(d.spell('dpg'))
    self.assertTrue(d.spell('dog'))
    del d
import os
from typing import List, Tuple

import spacy
from hunspell import Hunspell

# LanguageModel and loadWordFormDict are defined elsewhere in the original module.


class UnsupervisedGrammarCorrector:
    def __init__(self, threshold=0.96):
        basename = os.path.dirname(os.path.realpath(__file__))
        self.lm = LanguageModel()
        # Load spaCy
        self.nlp = spacy.load("en")
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        self.gb = Hunspell("en_GB-large",
                           hunspell_data_dir=basename + '/resources/spelling/')
        # Inflection forms: http://wordlist.aspell.net/other/
        self.gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
        # List of common determiners
        self.determiners = {"", "the", "a", "an"}
        # List of common prepositions
        self.prepositions = {"", "about", "at", "by", "for", "from",
                             "in", "of", "on", "to", "with"}
        self.threshold = threshold

    def correct(self, sentence):
        # If the line is empty, preserve the newline in the output and continue
        if not sentence:
            return ""
        best = sentence
        score = self.lm.score(best)
        # Keep applying single-token edits while they improve the LM score
        while True:
            new_best, new_score = self.process(best)
            if new_best and new_score > score:
                best = new_best
                score = new_score
            else:
                break
        return best

    def process(self, sentence: str) -> Tuple[str, float]:
        # Process the sentence with spaCy
        proc_sent = self.nlp.tokenizer(sentence)
        self.nlp.tagger(proc_sent)
        # Average token probability of the sentence so far
        orig_prob = self.lm.score(proc_sent.text)
        # All candidate corrected sentences are collected here
        candidates = []
        # Process each token.
        for tok in proc_sent:
            candidate_tokens = set()
            lower_cased_token = tok.lower_
            # SPELLCHECKING: tok must be alphabetical and not a real word
            if lower_cased_token.isalpha() and not self.gb.spell(lower_cased_token):
                candidate_tokens |= set(self.gb.suggest(lower_cased_token))
            # MORPHOLOGY
            if tok.lemma_ in self.gb_infl:
                candidate_tokens |= self.gb_infl[tok.lemma_]
            # DETERMINERS
            if lower_cased_token in self.determiners:
                candidate_tokens |= self.determiners
            # PREPOSITIONS
            if lower_cased_token in self.prepositions:
                candidate_tokens |= self.prepositions
            candidate_tokens = [c for c in candidate_tokens if self.gb.spell(c)]
            if candidate_tokens:
                # Preserve the casing of the original token
                if tok.is_title:
                    candidate_tokens = [c.title() for c in candidate_tokens]
                elif tok.is_upper:
                    candidate_tokens = [c.upper() for c in candidate_tokens]
                candidates.extend(
                    self._generate_candidates(tok.i, candidate_tokens, proc_sent))

        best_prob = orig_prob
        best = sentence
        for candidate in candidates:
            # Score the candidate sentence
            cand_prob = self.lm.score(candidate.text)
            print(candidate.text, cand_prob)
            # Keep the candidate if it beats the best probability so far
            if cand_prob > best_prob:
                best_prob = cand_prob
                best = candidate.text
        # Return the best sentence and its score, so the caller can decide
        # whether to keep searching for more errors
        return best, best_prob

    def _generate_candidates(self, tok_id, candidate_tokens,
                             tokenized_sentence) -> List[str]:
        # Candidate sentences are collected here
        candidates = []
        prefix = tokenized_sentence[:tok_id]
        suffix = tokenized_sentence[tok_id + 1:]
        # Loop through the alternative candidate tokens
        for token in candidate_tokens:
            candidate = prefix.text_with_ws
            if token:
                candidate += token + " "
            candidate += suffix.text_with_ws
            candidate = self.nlp.tokenizer(candidate)
            candidates.append(candidate)
        return candidates
import shutil
import tempfile
import unittest

# DICT_DIR, PY3, patch, captured_c_stderr_file, get_cache_manager and the
# hunspell imports (Hunspell, HunspellFilePathError) come from the original
# test module's header.


class HunspellTest(unittest.TestCase):
    def assertRegexpSearch(self, *args, **kwargs):
        if PY3:
            self.assertRegex(*args, **kwargs)
        else:
            self.assertRegexpMatches(*args, **kwargs)

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    @patch('os.path.isfile', return_value=True)
    @patch('os.access', return_value=True)
    def test_bad_path_encoding(self, *mocks):
        if PY3:
            with self.assertRaises(HunspellFilePathError):
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
        else:
            # Python 2 just makes an illegal string instead of raising
            with captured_c_stderr_file() as caperr:
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
            with open(caperr, 'r') as err:
                self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')

    @patch('hunspell.hunspell.WIN32_LONG_PATH_PREFIX', '/not/valid')
    def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
        with captured_c_stderr_file() as caperr:
            with patch("os.name", 'nt'):
                # If Python file existence checks used the prefix, this would
                # raise a HunspellFilePathError
                Hunspell('test', system_encoding='UTF-8')
        with open(caperr, 'r') as err:
            # But the Hunspell library lookup had the prefix applied
            self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_add(self):
        word = 'outofvocabularyword'
        self.assertEqual(self.h.spell(word), False)
        self.h.add(word)
        self.assertEqual(self.h.spell(word), True)
        typo = word + 'd'
        self.assertAllIn([word], self.h.suggest(typo))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })

    def test_non_overlapping_caches(self):
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)

    def test_overlapping_caches(self):
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.assertEqual(self.h.stem('made-up'), test_stem)

    def test_save_caches_persistance(self):
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            h1.save_cache()
            del h1

            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertNotEqual(len(h2._suggest_cache), 0)
            self.assertNotEqual(len(h2._stem_cache), 0)
            self.assertEqual(h2.suggest('made-up'), test_suggest)
            self.assertEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_persistance(self):
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            h1.save_cache()
            h1.clear_cache()
            del h1

            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertEqual(len(h2._suggest_cache), 0)
            self.assertEqual(len(h2._stem_cache), 0)
            self.assertNotEqual(h2.suggest('made-up'), test_suggest)
            self.assertNotEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_non_peristance(self):
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        self.h.clear_cache()

        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
        self.assertNotEqual(self.h.stem('made-up'), test_stem)
class Stem(): """The Stem class deals with various tasks as follows: - spell error detection and correction - morphological analysis - stemming These tasks are carried out in the `Kurdish Hunspell project <https://github.com/sinaahmadi/KurdishHunspell>`_. """ def __init__(self, dialect, script): self.hunspell_flags = { "po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation" } if dialect == "Sorani" and script == "Arabic": self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/")) else: raise Exception( "Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!" ) # def stem(self, word): # """A function for stemming a single word""" # pass # def lemmatize(self, word): # """A function for lemmatization of a single word""" # pass def check_spelling(self, word): """Check spelling of a word Args: word (str): input word to be spell-checked Raises: TypeError: only string as input Returns: bool: True if the spelling is correct, False if the spelling is incorrect """ if not isinstance(word, str): raise TypeError("Only a word (str) is allowed.") else: return self.huns.spell(word) def correct_spelling(self, word): """Correct spelling errors if the input word is incorrect Args: word (str): input word to be spell-checked Raises: TypeError: only string as input Returns: tuple (boolean, list): a tuple where the first element indicates the correctness of the word (True if correct, False if incorrect). If the input word is incorrect, suggestions are provided in a list as the second element of the tuple, as (False, []). If no suggestion is available, the list is returned empty as (True, []). """ if not isinstance(word, str): raise TypeError("Only a word (str) is allowed.") else: if self.check_spelling(word): return (True, []) return (False, list(self.huns.suggest(word))) def analyze(self, word_form): """Morphological analysis of a given word More details regarding Kurdish morphological analysis can be found at https://github.com/sinaahmadi/KurdishHunspell Args: word_form (str): a single word-form Raises: TypeError: only string as input Returns: (list(dict)): a list of all possible morphological analyses according to the defined morphological rules The morphological analysis is returned as a dictionary as follows: - "pos": the part-of-speech of the word-form according to `the Universal Dependency tag set <https://universaldependencies.org/u/pos/index.html>`_ - "description": is flag - "terminal_suffix": anything except ts flag - "formation": if ds flag is set, its value is assigned to description and the value of formation is set to derivational. Although the majority of our morphological rules cover inflectional forms, it is not accurate to say all of them are inflectional. Therefore, we only set this value to derivational wherever we are sure. - "base": `ts` flag. The definition of terminal suffix is a bit tricky in Hunspell. According to `the Hunspell documentation <http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html>`_, "Terminal suffix fields are inflectional suffix fields "removed" by additional (not terminal) suffixes". In other words, the ts flag in Hunspell represents whatever is left after stripping all affixes. Therefore, it is the morphological base. If the input cannot be analyzed morphologically, an empty list is returned. 
""" if not isinstance(word_form, str): raise TypeError("Only a word (str) is allowed.") else: # Given the morphological analysis of a word-form with Hunspell flags, extract relevant information and return a dictionary word_analysis = list() for analysis in list(self.huns.analyze(word_form)): analysis_dict = dict() for item in analysis.split(): if ":" not in item: continue if item.split(":")[1] == "ts": # ts flag exceptionally appears after the value as value:key in the Hunspell output analysis_dict["base"] = item.split(":")[0] # anything except the terminal_suffix is considered to be the base analysis_dict[self.hunspell_flags[item.split( ":")[1]]] = word_form.replace( item.split(":")[0], "") elif item.split(":")[0] in self.hunspell_flags.keys(): # assign the key:value pairs from the Hunspell string output to the dictionary output of the current function # for ds flag, add derivation as the formation type, otherwise inflection if item.split(":")[0] == "ds": analysis_dict[self.hunspell_flags[item.split( ":")[0]]] = "derivational" analysis_dict[ self.hunspell_flags["is"]] = item.split(":")[1] else: analysis_dict[self.hunspell_flags[item.split( ":")[0]]] = item.split(":")[1] # if there is no value assigned to the ts flag, the terminal suffix is a zero-morpheme 0 if self.hunspell_flags[ "ts"] not in analysis_dict or analysis_dict[ self.hunspell_flags["ts"]] == "": analysis_dict[self.hunspell_flags["ts"]] = "0" word_analysis.append(analysis_dict) return word_analysis
import re

# `process` and `hspell` are defined elsewhere in the original module.


def clean_message(message, replacement):
    # Replace phone numbers (optional country code and extension included)
    m = message
    m = re.sub(
        r'\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*',
        replacement, m)
    return m


# Removes highway numbers
# Removes hyphens in times
# Removes references to 7/11 (screws up a lot of current time & date detection)
# Removes spaces between a number and am/pm (helps with time detection)
def process_extra(tokens):
    t = tokens
    t = [token for token in t if not re.match(r'(400|401|403|404|407)', token)]
    t = [re.sub(r'(\d+)-[\d:]+', r'\1', token) for token in t]
    m = ' '.join(t)
    m = re.sub(r' (7 11|7-11)', r"", m)
    m = re.sub(r"([0-9]) (am|pm)", r"\1\2", m)
    return m.split(' ')


if __name__ == "__main__":
    print(process(
        "Offering: Jun.3 Sunday 8pm Waterloo (Burger King) -> Mississauga Square One "
        "$10/Pearson Airport $40, text 5197211776"))
    print(hspell.spell('rhill'))
from hunspell import Hunspell

h = Hunspell("ko", hunspell_data_dir='ko')

if __name__ == "__main__":
    answer = h.spell("안녕하세요으")
    print(answer)
    answer2 = h.spell("안녕하세")
    print(answer2)
    answer3 = h.suggest("안녕하세요으")
    print(answer3)
class Stem: """ The Stem module deals with various tasks, mainly through the following functions: - `check_spelling`: spell error detection - `correct_spelling`: spell error correction - `analyze`: morphological analysis Please note that only Sorani is supported in this version in this module. The module is based on the [Kurdish Hunspell project](https://github.com/sinaahmadi/KurdishHunspell). Example: ```python >>> from klpt.stem import Stem >>> stemmer = Stem("Sorani", "Arabic") >>> stemmer.check_spelling("سوتاندبووت") False >>> stemmer.correct_spelling("سوتاندبووت") (False, ['ستاندبووت', 'سووتاندبووت', 'سووڕاندبووت', 'ڕووتاندبووت', 'فەوتاندبووت', 'بووژاندبووت']) >>> stemmer.analyze("دیتبامن") [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}] ``` """ def __init__(self, dialect, script): self.dialect = dialect self.script = script self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"} if self.dialect == "Sorani" and self.script == "Arabic": self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/")) else: if not (self.dialect == "Kurmanji" and self.script == "Latin"): raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!") # def stem(self, word): # """A function for stemming a single word""" # pass # def lemmatize(self, word): # """A function for lemmatization of a single word""" # pass def check_spelling(self, word): """Check spelling of a word Args: word (str): input word to be spell-checked Raises: TypeError: only string as input Returns: bool: True if the spelling is correct, False if the spelling is incorrect """ if not isinstance(word, str) or not (self.dialect == "Sorani" and self.script == "Arabic"): raise TypeError("Not supported yet.") else: return self.huns.spell(word) def correct_spelling(self, word): """ Correct spelling errors if the input word is incorrect. It returns a tuple where the first element indicates the correctness of the word (True if correct, False if incorrect). If the input word is incorrect, suggestions are provided in a list as the second element of the tuple, as (False, []). If no suggestion is available, the list is returned empty as (True, []). Args: word (str): input word to be spell-checked Raises: TypeError: only string as input Returns: tuple (boolean, list) """ if not isinstance(word, str) or not (self.dialect == "Sorani" and self.script == "Arabic"): raise TypeError("Not supported yet.") else: if self.check_spelling(word): return (True, []) return (False, list(self.huns.suggest(word))) def analyze(self, word_form): """ Morphological analysis of a given word. It returns morphological analyses. The morphological analysis is returned as a dictionary as follows: - "pos": the part-of-speech of the word-form according to [the Universal Dependency tag set](https://universaldependencies.org/u/pos/index.html). - "description": is flag - "terminal_suffix": anything except ts flag - "formation": if ds flag is set, its value is assigned to description and the value of formation is set to derivational. Although the majority of our morphological rules cover inflectional forms, it is not accurate to say all of them are inflectional. Therefore, we only set this value to derivational wherever we are sure. - "base": `ts` flag. The definition of terminal suffix is a bit tricky in Hunspell. 
According to [the Hunspell documentation](http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html), "Terminal suffix fields are inflectional suffix fields "removed" by additional (not terminal) suffixes". In other words, the ts flag in Hunspell represents whatever is left after stripping all affixes. Therefore, it is the morphological base. As in [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}] If the input cannot be analyzed morphologically, an empty list is returned. Sorani: More details regarding Sorani Kurdish morphological analysis can be found at [https://github.com/sinaahmadi/KurdishHunspell](https://github.com/sinaahmadi/KurdishHunspell). Kurmanji: Regarding Kurmanji, we use the morphological analyzer provided by the [Kurmanji part](https://github.com/apertium/apertium-kmr) Please note that there are delicate difference between who the analyzers work in Hunspell and Apertium. For instane, the `base` in the Kurmanji analysis refers to the lemma while in Sorani (from Hunspell), it refers to the morphological base. Args: word_form (str): a single word-form Raises: TypeError: only string as input Returns: (list(dict)): a list of all possible morphological analyses according to the defined morphological rules """ if not isinstance(word_form, str): raise TypeError("Only a word (str) is allowed.") else: word_analysis = list() if self.dialect == "Sorani" and self.script == "Arabic": # Given the morphological analysis of a word-form with Hunspell flags, extract relevant information and return a dictionary for analysis in list(self.huns.analyze(word_form)): analysis_dict = dict() for item in analysis.split(): if ":" not in item: continue if item.split(":")[1] == "ts": # ts flag exceptionally appears after the value as value:key in the Hunspell output analysis_dict["base"] = item.split(":")[0] # anything except the terminal_suffix is considered to be the base analysis_dict[self.hunspell_flags[item.split(":")[1]]] = word_form.replace(item.split(":")[0], "") elif item.split(":")[0] in self.hunspell_flags.keys(): # assign the key:value pairs from the Hunspell string output to the dictionary output of the current function # for ds flag, add derivation as the formation type, otherwise inflection if item.split(":")[0] == "ds": analysis_dict[self.hunspell_flags[item.split(":")[0]]] = "derivational" analysis_dict[self.hunspell_flags["is"]] = item.split(":")[1] else: analysis_dict[self.hunspell_flags[item.split(":")[0]]] = item.split(":")[1] # if there is no value assigned to the ts flag, the terminal suffix is a zero-morpheme 0 if self.hunspell_flags["ts"] not in analysis_dict or analysis_dict[self.hunspell_flags["ts"]] == "": analysis_dict[self.hunspell_flags["ts"]] = "0" word_analysis.append(analysis_dict) elif self.dialect == "Kurmanji" and self.script == "Latin": att_analysis = Analysis("Kurmanji", "Latin").analyze(word_form) # check if the word-form is analyzed or no if not len(att_analysis[1]): # the word-form could not be analyzed return [] for form_analysis in list(att_analysis[-1]): for analysis in form_analysis: analysis_dict = dict() structure = analysis[0].rsplit('@', 1)[1].split("<", 1) analysis_dict["base"], analysis_dict["description"] = structure[0], structure[1].replace("><", "_").replace(">", "").strip() analysis_dict["pos"] = "" analysis_dict["terminal_suffix"] = "" analysis_dict["formation"] = "" # TODO: the description needs further information extraction in such a way that some values should be assigned to the 
"pos" key # analysis_dict["terminal_suffix"] = word_form.replace(analysis_dict["base"], "") word_analysis.append(analysis_dict) return word_analysis
import os

from hunspell import Hunspell
from nltk.metrics import edit_distance  # assumed source of edit_distance


class CyHunspell():
    '''Speller based on the Cython version of hunspell.

    >>> word_en = 'cookbok'
    >>> word_ru = 'поваринная'
    >>> speller_en = CyHunspell(lang="en")
    >>> speller_en.spell(word_en)
    False
    >>> speller_en.suggest(word_en)
    ('cookbook', 'copybook', 'codebook', 'Cook', 'cook')
    >>> speller_en.replace(word_en)
    'cookbook'
    >>> speller_ru = CyHunspell(lang="ru")
    >>> speller_ru.spell(word_ru)
    False
    >>> speller_ru.suggest(word_ru)
    ('поваренная',)
    >>> speller_ru.replace(word_ru)
    'поваренная'
    '''

    langs = {'ru': 'ru_RU', 'en': 'en_US'}

    def __init__(self,
                 lang='en',
                 max_dist=2,
                 cpu=os.cpu_count(),
                 # cache_manager="hunspell", disk_cache_dir=None,
                 # hunspell_data_dir=None, system_encoding=None
                 spell_kwargs=None):
        self.lang = self.langs.get(lang, lang)
        self.spell_dict = Hunspell(self.lang, **(spell_kwargs or {}))
        self.max_dist = max_dist
        self.spell_dict.set_concurrency(cpu)

    def spell(self, word):
        try:
            result = self.spell_dict.spell(word)
        except UnicodeEncodeError:
            result = None
        return result

    def suggest(self, word):
        try:
            result = self.spell_dict.suggest(word)
        except UnicodeEncodeError:
            result = tuple()
        return result

    def replace(self, word, max_dist=None):
        max_dist = max_dist if max_dist is not None else self.max_dist
        if self.spell(word):
            return word
        suggestions = self.suggest(word)
        # Accept the top suggestion only if it is close enough to the input
        if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
            return suggestions[0]
        return word