class HunSpellTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_US.dic",
                                 "/usr/share/hunspell/en_US.aff")

    def tearDown(self):
        try:
            del self.hunspell
        except AttributeError:
            pass

    def test_hunspell_spell(self):
        self.assertFalse(self.hunspell.spell('dpg'))
        self.assertTrue(self.hunspell.spell('dog'))
        self.assertFalse(self.hunspell.spell('spookie'))
        self.assertTrue(self.hunspell.spell('spooky'))

    def test_hunspell_suggest(self):
        self.assertEqual(self.hunspell.suggest('dpg'),
                         [b'dog', b'pg', b'deg', b'dig', b'dpt', b'dug',
                          b'mpg', b'd pg', b'GDP', b'DP', b'PG', b'DTP',
                          b'dip'])
        self.assertEqual(self.hunspell.suggest('spookie'),
                         [b'spookier', b'spookiness', b'spook', b'cookie',
                          b'bookie', b'Spokane', b'spoken'])

    def test_hunspell_stem(self):
        self.assertEqual(self.hunspell.stem('dog'), [b'dog'])
        self.assertEqual(self.hunspell.stem('permanently'), [b'permanent'])
        self.assertEqual(self.hunspell.stem('linked'), [b'linked', b'link'])

    def test_analyze(self):
        self.assertEqual(self.hunspell.analyze('linked'),
                         [b' st:linked', b' st:link fl:D'])
def stem(word):
    hunspell_object = HunSpell(DIC_FILE, AFF_FILE)
    stemmed_list = hunspell_object.stem(word)
    if len(stemmed_list) > 0:
        return stemmed_list[0]
    else:
        sys.exit(1)
class HunSpellGenerateTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_GB.dic",
                                 "/usr/share/hunspell/en_GB.aff")

    def test_generate(self):
        self.assertEqual(self.hunspell.generate('boy', 'girls'), [b'boys'])

    def test_generate2(self):
        self.assertEqual(self.hunspell.generate2('boy', 'is:Ns'), [b'boys'])
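The two generation APIs exercised above differ only in how the target morphology is specified. A minimal sketch, assuming the same en_GB dictionaries as the test fixture:

from hunspell import HunSpell

h = HunSpell("/usr/share/hunspell/en_GB.dic", "/usr/share/hunspell/en_GB.aff")
# generate() copies the morphology of an example word ('girls' is plural)...
print(h.generate('boy', 'girls'))   # [b'boys']
# ...while generate2() takes an explicit morphological description string.
print(h.generate2('boy', 'is:Ns'))  # [b'boys']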
def get_hunspell(prefix):
    try:
        from hunspell import HunSpell
        dic_fn = "{0}.dic".format(prefix)
        aff_fn = "{0}.aff".format(prefix)  # was "{0}.dic"; the affix file needs the .aff extension
        logging.info('loading hunspell dictionaries: {0} and {1}'.format(
            dic_fn, aff_fn))
        return HunSpell(dic_fn, aff_fn)
    except ImportError:
        logging.warning('hunspell is not present, using cache file only')
        return None
class Hunspell(object):
    name = 'hunspell'

    def __init__(self, nlp: Language, path: str, lang: str = 'en_US'):
        path = Path.cwd() / path
        # Was `if not any([nlp, isinstance(nlp, Language)])`, which passed
        # for any truthy nlp; check the type directly instead.
        if not isinstance(nlp, Language):
            raise ValueError('nlp must be a spaCy Language.') from None
        if not path.exists():
            raise NotADirectoryError('{} does not exist.'.format(path)) from None
        dic_path, aff_path = (
            path / '{}.dic'.format(lang),
            path / '{}.aff'.format(lang),
        )
        # HunSpell expects plain string paths, not Path objects
        self.hobj = HunSpell(str(dic_path), str(aff_path))
        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)

    def __call__(self, doc):
        for token in doc:
            try:
                token._.hunspell_spell = self.hobj.spell(token.text)
            except UnicodeEncodeError:
                pass
        return doc

    def get_suggestion(self, token):
        # TODO: include a lower option?
        # TODO: include suggestion numbers?
        # TODO: include stemmer?
        try:
            suggestions = self.hobj.suggest(token.text)
        except UnicodeEncodeError:
            suggestions = []
        return suggestions
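A hedged wiring sketch for the component above, assuming spaCy v2's add_pipe(component) call and en_US dictionaries in a local dicts/ directory (both assumptions, not part of the original):

import spacy

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(Hunspell(nlp, 'dicts'))  # hypothetical: dicts/en_US.dic + dicts/en_US.aff

doc = nlp("I hvae a dog.")
for token in doc:
    if token._.hunspell_spell is False:  # None means the check never ran
        print(token.text, token._.hunspell_suggest)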
def stem_query(query):
    # returns list of stemmed words
    hunspell_object = HunSpell(DIC_FILE, AFF_FILE)
    stemmed_list = []
    tokens = tokenize(query)
    for word in tokens:
        if word not in dummy_words:
            stemmed_list.append(stem(hunspell_object, word))
    return stemmed_list
class spaCyHunSpell(object):
    name = 'hunspell'

    def __init__(self, nlp, path=HUNSPELL_PROFILE):
        if path in DEFAULT_DICTIONARY_PATHS:
            default_path = DEFAULT_DICTIONARY_PATHS[path]
            dic_path, aff_path = (
                os.path.join(default_path, 'en_US.dic'),
                os.path.join(default_path, 'en_US.aff'),
            )
        else:
            assert len(path) == 2, 'Include two paths: dic_path and aff_path'
            dic_path, aff_path = path
        self.hobj = HunSpell(dic_path, aff_path)
        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)

    def __call__(self, doc):
        for token in doc:
            try:
                token._.hunspell_spell = self.hobj.spell(token.text)
            except UnicodeEncodeError:
                pass
        return doc

    def get_suggestion(self, token):
        # TODO: include a lower option?
        # TODO: include suggestion numbers?
        # TODO: include stemmer?
        try:
            suggestions = self.hobj.suggest(token.text)
        except UnicodeEncodeError:
            suggestions = []
        return suggestions
class HunSpellTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_US.dic",
                                 "/usr/share/hunspell/en_US.aff")

    def tearDown(self):
        try:
            del self.hunspell
        except AttributeError:
            pass

    def test_hunspell_spell(self):
        self.assertFalse(self.hunspell.spell('dpg'))
        self.assertTrue(self.hunspell.spell('dog'))
        self.assertFalse(self.hunspell.spell('spookie'))
        self.assertTrue(self.hunspell.spell('spooky'))

    def test_hunspell_suggest(self):
        self.assertEqual(self.hunspell.suggest('dpg'),
                         ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg',
                          'd pg', 'GDP', 'DP', 'PG', 'DTP', 'dip'])
        self.assertEqual(self.hunspell.suggest('spookie'),
                         ['spookier', 'spookiness', 'spook', 'cookie',
                          'bookie', 'Spokane', 'spoken'])
        self.assertEqual(self.hunspell.suggest('Eelysa'),
                         ['Elyssa', 'Elysees', 'Elysha', 'Elysia', 'Elissa',
                          'Elysée'])

    def test_hunspell_stem(self):
        self.assertEqual(self.hunspell.stem('dog'), [b'dog'])
        self.assertEqual(self.hunspell.stem('permanently'), [b'permanent'])
        self.assertEqual(self.hunspell.stem('linked'), [b'linked', b'link'])

    def test_analyze(self):
        self.assertEqual(self.hunspell.analyze('linked'),
                         [b' st:linked', b' st:link fl:D'])

    def test_add_remove(self):
        self.assertFalse(self.hunspell.spell('pipo'))
        self.hunspell.add('pipo')
        self.assertTrue(self.hunspell.spell('pipo'))
        self.hunspell.remove('pipo')
        self.assertFalse(self.hunspell.spell('pipo'))

    def test_add_dic(self):
        self.assertFalse(self.hunspell.spell("dictionnaire"))
        try:
            self.hunspell.add_dic("/usr/share/hunspell/fr.dic")
        except HunSpellError:
            raise ValueError("/usr/share/hunspell/fr.dic is not installed. "
                             "Please install hunspell-fr to validate this test.")
        self.assertTrue(self.hunspell.spell("dictionnaire"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 22 16:28:51 2018

@author: Samuele Garda
"""

import logging
import argparse
from hunspell import HunSpell

DISCARD = ['\n', 'Twitter / Account gesperr\n']
DICS = ['./en_US.dic', './en_US.aff']
SPELLER = HunSpell(*DICS)
NO_SPELL = ['^', 'Z', 'L', 'M', '!', 'Y', '#', '@', '~', 'U', 'E', ',', 'G', 'S']

logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s: %(message)s',
                    level='INFO')


def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Utility for POS tagging tweets via TweetNLP ark-tweet-nlp '
                    'without using java. ONLY LINUX SUPPORTED. Strings discarded '
                    'when loading tweets: `\\n`, `Twitter / Account gesperr\\n`')
    parser.add_argument('-r', '--raw-tweets',
                        help='Path to the file where raw tweets are stored')
    parser.add_argument('-p', '--parsed-tweets',
                        help='Path to the file where parsed tweets are stored')
    parser.add_argument('-o', '--output', default='tweet_data_for_dp.txt',
                        help='Output path to write to')
    return parser.parse_args()
# Tail of a component's load() classmethod; the start of the signature is
# truncated in the source and left as-is.
    model_dir: Optional[Text] = None,
    model_metadata: Optional["Metadata"] = None,
    cached_component: Optional["Component"] = None,
    **kwargs: Any
) -> "Component":
    """Load this component from file."""
    if cached_component:
        return cached_component
    else:
        return cls(meta)


try:
    from hunspell import HunSpell
    # Load the Hunspell dictionary and affix files
    spell_checker = HunSpell('./pyhunspell/dictionaries/en-GB/index.dic',
                             './pyhunspell/dictionaries/en-GB/index.aff')
    encoding = spell_checker.get_dic_encoding()  # gets the dictionary encoding
    imported = True
except ModuleNotFoundError:
    print("Cannot import HunSpell")
    imported = False


def add_correct_Words(words=None):
    """
    Adds grammatically correct words to the Hunspell dictionary.

    Arguments:
        words: list of words
    Returns:
        None
    """
    if words is None:  # avoid a mutable default argument
        words = []
    if imported:
        # Body reconstructed from the docstring: HunSpell.add() registers
        # a word with the run-time dictionary.
        for word in words:
            spell_checker.add(word)
class HunSpelling:
    """
    Use the hunspell tool to detect isolated non-word spelling errors
    and to suggest candidate corrections.
    """

    def __init__(self, dic_file, aff_file, extra_dic=None):
        """
        Load the dictionary and affix files for spell checking.
        Allow adding an extra dictionary.
        """
        io_utils.check_file_readable(dic_file)
        io_utils.check_file_readable(aff_file)
        self.hunspell = HunSpell(dic_file, aff_file)
        if extra_dic:
            io_utils.check_file_readable(extra_dic)
            self.hunspell.add_dic(extra_dic)

    def is_misspelled(self, word):
        """Check if given word is misspelled"""
        return not self.hunspell.spell(word)

    def add_word(self, word):
        """Add new word into hunspell's dictionary"""
        if word:
            self.hunspell.add(word)

    def add_words(self, words):
        """Add new words into hunspell's dictionary"""
        if not isinstance(words, list):
            return
        for word in words:
            self.add_word(word)

    def add_extra_dictionary(self, dic_file):
        """Add an extra dictionary to the current instance"""
        io_utils.check_file_readable(dic_file)
        self.hunspell.add_dic(dic_file)

    def remove_word(self, word):
        """Remove word from hunspell's dictionary"""
        self.hunspell.remove(word)

    def remove_words(self, words):
        """Remove words from hunspell's dictionary"""
        if not isinstance(words, list):
            return
        for word in words:
            self.remove_word(word)

    def get_suggestions(self, word):
        """Return correction suggestions"""
        suggestions = []
        for sgt in self.hunspell.suggest(word):
            sgt = sgt.replace('-', ' ')
            if sgt not in suggestions:
                suggestions.append(sgt)
        return suggestions

    def correct(self, query, ignore=None, topn=None):
        """
        Return top candidate corrections for given query.
        The ignore flag can allow ignoring certain words (e.g. named entities)
        """
        if not isinstance(query, list):
            query = query.split()
        if ignore is None:
            ignore = [0] * len(query)
        solutions = []
        for i, token in enumerate(query):
            if token.isalpha() \
                    and not self.hunspell.spell(token) \
                    and not ignore[i]:
                suggestions = self.get_suggestions(token)
                if suggestions:
                    solutions.append(suggestions)
                else:
                    solutions.append([token])
            else:
                solutions.append([token])
        # merge solutions
        candidates = [' '.join(sol) for sol in product(*solutions)]
        return candidates[:topn]
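A quick usage sketch for HunSpelling. The dictionary paths are assumptions, and the class itself depends on an io_utils.check_file_readable helper and on itertools.product being importable in its module:

checker = HunSpelling('/usr/share/hunspell/en_US.dic',
                      '/usr/share/hunspell/en_US.aff')
print(checker.is_misspelled('spookie'))            # True
print(checker.get_suggestions('spookie')[:3])      # e.g. ['spookier', 'spookiness', 'spook']
print(checker.correct('the spookie dog', topn=2))  # e.g. ['the spookier dog', 'the spookiness dog']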
import math
import pickle
import os
from collections import Counter

from config import DIC_FILE, AFF_FILE
from hunspell import HunSpell

symbols = [',', '.', ';', '\'', '"', '{', '}', '[', ']', '(', ')',
           '?', ':', '*', '^', '-', '%', '\\', '/']
hunspell_object = HunSpell(DIC_FILE, AFF_FILE)


class GFG:
    def __init__(self):
        print("Done")  # was Python 2's `print "Done"`

    def stem(self, hunspell_object, word):
        stemmed_list = hunspell_object.stem(word)
        if len(stemmed_list) > 0:
            return str(stemmed_list[0])

    def index(self, words_file):
        open_file = open(words_file, 'r')
        words_list = []
        contents = open_file.readlines()
        for i in range(len(contents)):
            for s in symbols:
                # Snippet truncated here in the source; stripping the listed
                # symbols from each line is the evident intent.
                contents[i] = contents[i].replace(s, ' ')
#!/usr/bin/env python3

from lazydata import track
from hunspell import HunSpell

h = HunSpell(
    track(
        './data/en.dic',
        'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.dic'
    ),
    track(
        './data/en.aff',
        'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.aff'
    ))

print([x.encode(h.get_dic_encoding()) for x in h.suggest('hollo')])
class Deperson():
    def __init__(self, autocorrect=False, check_compound=False):
        self.autocorrect = autocorrect
        self.check_compound = check_compound
        self.masking_regexes = [
            (r'(?:van|aan|in)\s(?:der|den)\s\b\w+\b', 3),
            (re.escape(PLACEHOLDER) + r'\s(?:van|aan|in)\s(?:de|het|t)\s\b\w+\b', 4),
        ]
        self.email_regex = r'[^@\s]+@[^@\s]+\.[^@\s]+'
        self.email_placeholder = 'maskedemail'
        self.url_regex = r'(?:http:\/\/|https:\/\/)?(?:www\.)?[a-z]+\.[a-z]{2,5}[^\s]*'

        # Import blacklists
        blacklists_path = script_path + '/blacklists/'
        blacklists = []
        # Loop over available blacklists
        for filename in os.listdir(blacklists_path):
            if filename.endswith(".txt"):
                # Open assuming we may be dealing with unicode
                with codecs.open(blacklists_path + filename, encoding='utf-8') as f:
                    # Decode unicode into ASCII
                    words = {unidecode(x.strip()) for x in f.readlines()}
                # Store current blacklist
                blacklists.append(words)
            else:
                continue
        # Combine all blacklists into one
        self.blacklist = set.union(*blacklists)

        # Regex-based whitelists
        self.whitelist_regexes = [
            r'\d+gb$', r'\d+ghz$', r'\d+mb$', r'\d+mbit$', r'\d+gbit$',
            r'\d+(?:\.|,)?\d{0,2}euro?', r'v\d+$',
            r'0(?:8|9)00(?:[\s\-]{1}\d{4})?',
            u'\u20AC\d+(?:\.|,)?\d{0,2}', r'^\d$',
            r'^[a-zA-Z]{1,2}\d{1,3}$', r'\d{1,2}:\d{2}(?::\d{2})?',
            r'\d{1,2}-\d{1,2}(?:-\d{2,4})?'
        ] + [self.url_regex]

        self.protected_punctuation_regexes = [
            r'0(?:8|9)00[\s\-]{1}\d{4}',
            u'\u20AC\d+(?:\.|,)?\d{0,2}',
        ] + [self.url_regex, self.email_regex] + self.whitelist_regexes
        self.protected_punctuation_regex = '(' + \
            '|'.join(self.protected_punctuation_regexes) + ')'

        # Load in HunSpell files
        self.d = HunSpell(script_path + '/dict/Dutch.dic',
                          script_path + '/dict/Dutch.aff')
        self.clean_d = HunSpell(script_path + '/dict/Dutch_clean.dic',
                                script_path + '/dict/Dutch.aff')

        # Load in curated autocorrection list
        with open(script_path + '/spellcheck/autocorrect.csv') as f:
            reader = csv.reader(f, skipinitialspace=True)
            self.autocorrecter = dict(reader)

        # Import whitelists
        whitelists_path = script_path + '/whitelists/'
        whitelists = []
        # Loop over available whitelists
        for filename in os.listdir(whitelists_path):
            if filename.endswith(".txt"):
                # Open assuming we may be dealing with unicode
                with codecs.open(whitelists_path + filename, encoding='utf-8') as f:
                    # Decode unicode into ASCII
                    words = {unidecode(x.strip()) for x in f.readlines()}
                # Store current whitelist
                whitelists.append(words)
            else:
                continue
        # Combine all whitelists into one
        self.whitelist = set.union(*whitelists)
        self.whitelist = self.whitelist.union(set(self.autocorrecter.keys()))

        # Specific domain words whitelist
        with codecs.open(whitelists_path + 'domainwords.txt', encoding='utf-8') as f:
            self.domain_whitelist = {unidecode(x.strip()) for x in f.readlines()}

    def curated_autocorrect(self, word):
        """
        Autocorrect a word based on the curated list of autocorrection sets.
        """
        if word in self.autocorrecter:
            corr = self.autocorrecter[word]
        else:
            return word
        return corr

    def smart_suggest(self, word):
        """
        Autocorrect a word. (This is computationally expensive, use sparingly.)

        Keyword arguments:
        word -- a (potentially misspelled) Dutch word

        Return value:
        Corrected word or, if no suggestion could be found, the original word.
        """
        if word.isdigit():
            return word
        # Set of all suggestions provided by HunSpell for this word;
        # insensitive to the capitalization of the first letter
        suggestions = set(self.d.suggest(word)).union(
            set(self.d.suggest(word.capitalize())))
        # If we have no suggestions, just return the word
        if len(suggestions) == 0:
            return word
        # Otherwise, we want to return the closest match, where 'closest' is
        # defined by the Ratcliff/Obershelp pattern matching algorithm.
        worset, max_score = {}, 0  # renamed from `max`, which shadowed the builtin
        for sugg in suggestions:
            # Create a temporary SequenceMatcher instance (provides the
            # matching algorithm) that operates case-insensitively
            tmp = difflib.SequenceMatcher(None, word.lower(), sugg.lower(),
                                          autojunk=False).ratio()
            # Store the suggestion with its score
            worset[tmp] = sugg
            # Keep track of the suggestion with max score
            if tmp > max_score:
                max_score = tmp
        # Return best match (case insensitive)
        return worset[max_score].lower()

    def remove_punctuation(self, s):
        """Drop punctuation from a string"""
        # Punctuation but not the '|' character
        punct = string.punctuation.replace("|", "").replace("[", "").replace("]", "")
        # Translator that maps punctuation to spaces
        translator = str.maketrans(punct, ' ' * len(punct))
        # Split out protected patterns to preserve them
        split = re.split(self.protected_punctuation_regex, s)
        # Remove punctuation from even-index parts of the split
        cleaned_split = [
            part.translate(translator) if i % 2 == 0 else part
            for i, part in enumerate(split)
        ]
        # Put string back together
        t = ''.join(cleaned_split)
        # Mask out e-mail addresses
        t = re.sub(self.email_regex, self.email_placeholder, t)
        return t

    def apply_whitelist(self, text):
        """Filter text using whitelist"""
        # Split text into words
        words = self.remove_punctuation(text.lower()).split()
        # Correct text based on curated autocorrecter
        if self.autocorrect:
            for word in words:
                if word in self.autocorrecter:
                    idx = words.index(word)
                    # Replacement may consist of more than one word, so insert
                    # as a list
                    words[idx:idx + 1] = self.curated_autocorrect(word).split()
        # Calculate filter based on whitelist (TODO: parallelize?)
        filter = list(map(lambda word: (word, word in self.whitelist), words))
        # Update filter based on whitelist regexes
        filter = [
            (word, f) if f else
            (word, any(re.match(regex, word) for regex in self.whitelist_regexes))
            for (word, f) in filter
        ]
        # Update filter based on compound words
        if self.check_compound:
            for i in range(len(filter)):
                (word, f) = filter[i]
                if f or word.isdigit():
                    continue
                elif self.clean_d.spell(word):
                    filter[i] = (word, True)
        # Updating the filter via smart_suggest() autocorrection is currently
        # too slow, so it is deliberately left out here.
        # Put text back together without filtered words
        return ' '.join([
            word if f else PLACEHOLDER
            for (word, f) in filter
        ])

    def domain_guarded_replace(self, regex, text, repl):
        """Replace based on regex, but guarding domain-specific words."""
        # Loop over matches for regex
        for match in re.finditer(regex, text):
            # Pull out the matched substring
            g = match.group()
            # Replace it with placeholder only if none of the words in the
            # substring are in the domain whitelist
            if not any(word in self.domain_whitelist for word in g.split()):
                text = text.replace(g, repl)
        return text

    def apply_blacklist(self, text):
        """Apply all defined regex masks and the blacklist"""
        # Copy text
        output = text[:]
        # Mask out blacklisted strings
        for bad_word in self.blacklist:
            num_ph = len(bad_word.split())
            repl = ' '.join((PLACEHOLDER,) * num_ph)
            output = output.replace(bad_word, repl)
        # Mask out based on regexes
        for (regex, num_ph) in self.masking_regexes:
            repl = ' '.join((PLACEHOLDER,) * num_ph)
            output = self.domain_guarded_replace(regex, output, repl)
        return output

    def get_illegal_words(self, text):
        """Return words masked by apply_whitelist() from the given text."""
        # Get masked words
        cleaned = self.apply_blacklist(self.apply_whitelist(text)).split()
        # Original text
        original = self.remove_punctuation(text).split()
        # Pull out original words that were filtered out
        filtered = [
            original[i] for i in range(len(original))
            if cleaned[i] == PLACEHOLDER
        ]
        # Return as string
        return ' '.join(filtered)