def test_clear_caches_persistence(self):
    temp_dir = tempfile.mkdtemp()
    try:
        h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir, cache_manager='disk_hun')
        test_suggest = h1.suggest('testing')
        test_stem = h1.stem('testing')

        h1._suggest_cache['made-up'] = test_suggest
        self.assertEqual(h1.suggest('made-up'), test_suggest)
        h1._stem_cache['made-up'] = test_stem
        self.assertEqual(h1.stem('made-up'), test_stem)

        h1.save_cache()
        h1.clear_cache()
        del h1

        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        self.assertEqual(len(cacheman.cache_by_name), 0)

        h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir, cache_manager='disk_hun')
        self.assertEqual(len(h2._suggest_cache), 0)
        self.assertEqual(len(h2._stem_cache), 0)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content
import platform

from hunspell import Hunspell


def make_checker():
    """Create a Hunspell checker depending on the system the code is running on.

    :return: Hunspell object h
    """
    if platform.system() == 'Windows':
        h = Hunspell('de_DE_frami',
                     hunspell_data_dir="C:\\Users\\Lena_Langholf\\Dropbox\\Spell_Checking\\dictionaries")
    else:
        h = Hunspell('de_DE_frami',
                     hunspell_data_dir="/home/lena/Desktop/million_post_corpus/dictionaries")
    return h
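# A minimal usage sketch for make_checker(); it assumes the de_DE_frami
# .dic/.aff files actually exist at the hard-coded path for this platform.
if __name__ == '__main__':
    checker = make_checker()
    print(checker.spell('Haus'))     # True if the word is in the dictionary
    print(checker.suggest('Hauss'))  # tuple of correction candidates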
def test_bad_path_encoding(self, *mocks):
    if PY3:
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
    else:
        # Python 2 just makes an illegal string instead of raising
        with captured_c_stderr_file() as caperr:
            Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
        with open(caperr, 'r') as err:
            self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')
import re

import pandas as pd
from hunspell import Hunspell


def spell_corrector(df, lang1, lang2):
    # Create an object of the Hunspell class
    h = Hunspell()
    print('I am spell_checker')
    # A dict of lists to hold the corrected sentences, later made into a dataframe
    corr_sent_list = {'L1': [], 'L2': []}
    # For each sentence in the dataframe
    for sent in df['L1']:
        # Empty string to which the corrected words are appended
        corr_sent = ''
        # For every token in the sentence, split on word boundaries
        for w in re.split(r'\b', sent):
            # If the split part is not a word (punctuation marks, spaces),
            # or if it is spelled correctly, append it unchanged
            if not w.isalpha() or h.spell(w):
                corr_sent += w
            # The split part is a word and is misspelled
            else:
                # Suggest possible corrections for the misspelled word
                suggest = h.suggest(w)
                if suggest:
                    # TODO: parse the list and use n-gram probabilities to find
                    # the best candidate. For now the first suggestion is used.
                    corr_sent += suggest[0]
                else:
                    # No suggestions available; keep the original word
                    corr_sent += w
        # When all the words in the sentence are traversed, append the
        # corrected sentence to corr_sent_list
        corr_sent_list['L1'].append(corr_sent)
    # Convert the corrected sentences into a pandas dataframe to return
    if lang2 is not None:
        corr_sent_list['L2'].extend(list(df['L2']))
        return pd.DataFrame.from_dict(corr_sent_list)
    else:
        return pd.DataFrame(corr_sent_list['L1'], columns=['L1'])
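# A hedged usage sketch for spell_corrector(); the sample rows are invented,
# and the bare Hunspell() call above assumes an en_US dictionary is
# discoverable in the default search path.
if __name__ == '__main__':
    sample = pd.DataFrame({'L1': ['Thiss is a tesst .'],
                           'L2': ['Dies ist ein Test .']})
    print(spell_corrector(sample, 'en', 'de'))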
def loadResources(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    # Language model built by KenLM: https://github.com/kpu/kenlm
    lm = kenlm.Model(args.model)
    # Load spaCy
    nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    det = {"", "the", "a", "an"}
    # List of common prepositions
    prep = {"", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"}
    # Save the above in a dictionary:
    res_dict = {
        "lm": lm,
        "nlp": nlp,
        "gb": gb,
        "gb_infl": gb_infl,
        "det": det,
        "prep": prep,
    }
    return res_dict
def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
    with captured_c_stderr_file() as caperr:
        with patch("os.name", 'nt'):
            # If python file existence checks used the prefix, this would
            # raise a HunspellFilePathError
            Hunspell('test', system_encoding='UTF-8')
    with open(caperr, 'r') as err:
        # But the Hunspell library lookup had the prefix applied
        self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')
def __init__(self, dialect, script):
    self.dialect = dialect
    self.script = script
    self.hunspell_flags = {
        "po": "pos",
        "is": "description",
        "ts": "terminal_suffix",
        "ds": "formation",
    }
    if self.dialect == "Sorani" and self.script == "Arabic":
        self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
    else:
        if not (self.dialect == "Kurmanji" and self.script == "Latin"):
            raise Exception(
                "Sorry, only Sorani dialect in the Arabic script is supported now. "
                "Stay tuned for other dialects and scripts!"
            )
def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
    SpellChecker.get_dict(language, hunspell_data_dir)
    self.hunspell = Hunspell(language,
                             hunspell_data_dir=hunspell_data_dir,
                             disk_cache_dir=os.path.join(hunspell_data_dir, 'cache'))
    self.hunspell.set_concurrency(n_jobs)
    self.substitutes = dict()
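# A hedged usage sketch for the constructor above; `SpellChecker` is only
# partially shown here, and get_dict() is assumed to fetch the en_US
# dictionary files into hunspell_data_dir:
#
#     checker = SpellChecker('en_US', hunspell_data_dir='./hunspell', n_jobs=2)
#     checker.hunspell.spell('color')    # -> True once the dictionary is present
#     checker.hunspell.suggest('collor') # suggestions served via the disk cache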
def test_non_overlapping_caches(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(h2.suggest('made-up'), test_suggest)
    self.assertNotEqual(h2.stem('made-up'), test_stem)
def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
    """Run the linter and return a generator of errors."""
    with open(self.filename, 'r') as file:
        comments = list(get_comments(file.read()))
    spell = Hunspell()
    # NOTE: this only spell-checks a single token of a single comment; a full
    # implementation would iterate over every word of every comment.
    result = spell.spell(comments[1][2][0])
    print(result)
    yield (0, 0, 'KOL001 Bad language found: ', TypoChecker)
def new_hunspell_nl() -> Hunspell:
    dictionary_path = __resolve_path("../dict/")
    hnspl = Hunspell("nl-nl", hunspell_data_dir=str(dictionary_path))
    # Add words that are not present in the current dictionary
    # (word_list avoids shadowing the built-in `list`)
    for word_list in [get_plural_nouns(), get_basic_words()]:
        for word in word_list:
            if not hnspl.spell(word):
                hnspl.add(word)
    return hnspl
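# A minimal usage sketch for new_hunspell_nl(); it assumes the ../dict/
# directory holds the nl-nl .dic/.aff files and that get_plural_nouns() and
# get_basic_words() are importable alongside this module.
if __name__ == '__main__':
    nl = new_hunspell_nl()
    print(nl.spell('fiets'))    # True for a correctly spelled Dutch word
    print(nl.suggest('fietz'))  # tuple of correction candidates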
def hunspell(self) -> Hunspell:
    """Returns the (cached) Hunspell instance."""
    if not self._hunspell:
        self._hunspell = Hunspell(
            self.lang.get_hunspell_dict_name(),
            hunspell_data_dir=self.hunspell_data_dir,
        )
    return self._hunspell
def test_overlapping_caches(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    del self.h
    self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.assertEqual(self.h.stem('made-up'), test_stem)
def __init__(self,
             lang='en',
             max_dist=2,
             cpu=os.cpu_count(),
             # cache_manager="hunspell", disk_cache_dir=None,
             # hunspell_data_dir=None, system_encoding=None
             spell_kwargs=None):
    self.lang = self.langs.get(lang, lang)
    # None instead of a mutable {} default for the Hunspell keyword arguments
    self.spell_dict = Hunspell(self.lang, **(spell_kwargs or {}))
    self.max_dist = max_dist
    self.spell_dict.set_concurrency(cpu)
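# A hedged usage sketch; the enclosing class (and its `langs` mapping) is not
# shown in the fragment above, so the name `Corrector` is hypothetical:
#
#     corr = Corrector(lang='en', spell_kwargs={'hunspell_data_dir': '/usr/share/hunspell'})
#     corr.spell_dict.suggest('speling')  # correction candidates from Hunspell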
def test_clear_caches_persistence(hunspell):
    temp_dir = tempfile.mkdtemp()
    try:
        h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir, cache_manager='disk_hun')
        test_suggest = h1.suggest('testing')
        test_suffix = h1.suffix_suggest('testing')
        test_stem = h1.stem('testing')

        h1._suggest_cache['made-up'] = test_suggest
        assert h1.suggest('made-up') == test_suggest
        h1._suffix_cache['made-up'] = test_suffix
        assert h1.suffix_suggest('made-up') == test_suffix
        h1._stem_cache['made-up'] = test_stem
        assert h1.stem('made-up') == test_stem

        h1.save_cache()
        h1.clear_cache()
        del h1

        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        assert len(cacheman.cache_by_name) == 0

        h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                      disk_cache_dir=temp_dir, cache_manager='disk_hun')
        assert len(h2._suggest_cache) == 0
        assert len(h2._stem_cache) == 0
        assert h2.suggest('made-up') != test_suggest
        assert h2.suffix_suggest('made-up') != test_suffix
        assert h2.stem('made-up') != test_stem
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content
def __init__(self):
    """Constructor."""
    super().__init__()

    self.__treebank_tokenizer = TreebankWordTokenizer()

    hunspell_dict_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'hindi-hunspell',
        'dict-hi_IN',
    )
    if not os.path.isdir(hunspell_dict_dir):
        raise McLanguageException(
            "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir)
    if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
        raise McLanguageException(
            "Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
    if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
        raise McLanguageException(
            "Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

    try:
        self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
    except Exception as ex:
        raise McLanguageException(
            "Unable to initialize Hunspell with data directory '%s': %s" % (
                hunspell_dict_dir,
                str(ex),
            ))

    # Quick self-test to make sure that Hunspell is installed and dictionary is available
    hunspell_exc_message = """
        Hunspell self-test failed; make sure that Hunspell is installed and
        dictionaries are accessible, e.g. you might need to fetch Git
        submodules by running:

            git submodule update --init --recursive
    """
    try:
        test_stems = self.stem_words(['गुरुओं'])
    except Exception as _:
        raise McLanguageException(hunspell_exc_message)
    else:
        if len(test_stems) == 0 or test_stems[0] != 'गुरु':
            raise McLanguageException(hunspell_exc_message)
def test_clear_caches_non_persistence(self):
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')

    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)

    self.h.clear_cache()

    del self.h
    self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
    self.assertNotEqual(self.h.stem('made-up'), test_stem)
def test_non_overlapping_caches(hunspell):
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')

    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem

    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert h2.suggest('made-up') != test_suggest
    assert h2.stem('made-up') != test_stem
def test_hunspell_bulk_stem(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertDictEqual(d.bulk_action("stem", ['dog', 'permanently']), {
        'permanently': ['permanent'],
        'dog': ['dog'],
    })
    self.assertDictEqual(
        d.bulk_action("stem", ['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ['recorded'],
            'permanently': ['permanent'],
            'twigs': ['twig'],
            'dog': ['dog'],
        })
    del d
def test_clear_caches_non_persistence(hunspell):
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')

    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem

    hunspell.clear_cache()

    del hunspell
    hunspell = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert hunspell.suggest('made-up') != test_suggest
    assert hunspell.suffix_suggest('made-up') != test_suffix
    assert hunspell.stem('made-up') != test_stem
def test_hunspell_bulk_suggest(self):
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertDictEqual(
        d.bulk_action("suggest", ['dog', 'dpg']), {
            'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
            'dog': ['dog'],
        })
    self.assertDictEqual(
        d.bulk_action("suggest", [
            'dog', 'dpg', 'pgg', 'opg', 'dyg', 'frg', 'twg', 'bjn', 'foo', 'qre'
        ]), {
            'pgg': ['pg', 'peg', 'egg', 'pig', 'pug', 'pkg', 'pg g', 'PG'],
            'foo': ['few', 'goo', 'fop', 'foot', 'fool', 'food', 'foe', 'for',
                    'fro', 'too', 'fol', 'coo', 'fog', 'moo', 'fob'],
            'frg': ['fr', 'frig', 'frog', 'erg', 'fig', 'f*g', 'fro', 'fog',
                    'fry', 'fr g'],
            'twg': ['twig', 'tag', 'two', 'tog', 'tug', 'twp'],
            'bjn': ['bin', 'ban', 'bun', 'Bjorn'],
            'dog': ['dog'],
            'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
            'opg': ['op', 'pg', 'ope', 'ops', 'opt', 'mpg', 'opp', 'o pg',
                    'op g', 'GPO'],
            'dyg': ['dug', 'dye', 'deg', 'dig', 'dog', 'dying'],
            'qre': ['qr', 're', 'ere', 'ire', 'are', 'ore', 'Ore', 'Dre',
                    'q re', 'qr e'],
        })
    del d
def leetScan(string, valDict, language="EN"):
    # leetCheck() and swapValid() are helpers defined elsewhere in this module
    leetcandidates = []
    count = 0
    h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
    tokens = nltk.word_tokenize(string)
    # Calculate total words in the string
    total_words = len(tokens)
    for token in tokens:
        # Check for misspelling
        if not h.spell(token):
            # See if the word contains leet
            if leetCheck(token):
                # Add to possible candidate list
                leetcandidates.append(token)
    # Test candidate list for word validity using swapping
    for candidate in leetcandidates:
        if swapValid(candidate, valDict, h):
            count += 1
    fraction = Fraction(count, total_words)
    return fraction
def __init__(self, threshold=0.96):
    basename = os.path.dirname(os.path.realpath(__file__))
    self.lm = LanguageModel()
    # Load spaCy
    self.nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    self.gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    self.gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    self.determiners = {"", "the", "a", "an"}
    # List of common prepositions
    self.prepositions = {"", "about", "at", "by", "for", "from", "in", "of",
                         "on", "to", "with"}
    self.threshold = threshold
import re
import string
import zipfile
from functools import reduce

from hunspell import Hunspell
from unidecode import unidecode
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.metrics.distance import edit_distance
from langdetect import detect

hunsp = Hunspell('Spanish')
table = str.maketrans('', '', string.punctuation)


def is_not_spanish_line(line):
    try:
        return detect(line) != 'es'
    except Exception:
        return True


def correct_hyphen(lines):
    for line_index, line in enumerate(lines):
        if line != []:
            last_word = line[-1].strip()
            last_char = last_word[-1]
            if last_char == '-' and line_index + 1 < len(lines):
                next_line = lines[line_index + 1]
                if len(next_line) > 0:
                    first_word_next_line = next_line[0]
                    first_word_clean_next_line = first_word_next_line.translate(table)
                    # if hunsp.spell(last_word[:-1] + first_word_clean_next_line):
                    if True:
def __init__(self):
    self.hs = Hunspell('en_US')
    self.vs = cv2.VideoCapture(0)
    self.current_image = None
    self.current_image2 = None

    # Raw strings keep the Windows-style backslashes in these paths from
    # being treated as escape sequences
    self.json_file = open(r"Models\model_new.json", "r")
    self.model_json = self.json_file.read()
    self.json_file.close()
    self.loaded_model = model_from_json(self.model_json)
    self.loaded_model.load_weights(r"Models\model_new.h5")

    self.json_file_dru = open(r"Models\model-bw_dru.json", "r")
    self.model_json_dru = self.json_file_dru.read()
    self.json_file_dru.close()
    self.loaded_model_dru = model_from_json(self.model_json_dru)
    self.loaded_model_dru.load_weights(r"Models\model-bw_dru.h5")

    self.json_file_tkdi = open(r"Models\model-bw_tkdi.json", "r")
    self.model_json_tkdi = self.json_file_tkdi.read()
    self.json_file_tkdi.close()
    self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
    self.loaded_model_tkdi.load_weights(r"Models\model-bw_tkdi.h5")

    self.json_file_smn = open(r"Models\model-bw_smn.json", "r")
    self.model_json_smn = self.json_file_smn.read()
    self.json_file_smn.close()
    self.loaded_model_smn = model_from_json(self.model_json_smn)
    self.loaded_model_smn.load_weights(r"Models\model-bw_smn.h5")

    self.ct = {}
    self.ct['blank'] = 0
    self.blank_flag = 0
    for i in ascii_uppercase:
        self.ct[i] = 0
    print("Loaded model from disk")

    self.root = tk.Tk()
    self.root.title("Sign Language To Text Conversion")
    self.root.protocol('WM_DELETE_WINDOW', self.destructor)
    self.root.geometry("900x900")

    self.panel = tk.Label(self.root)
    self.panel.place(x=100, y=10, width=580, height=580)

    self.panel2 = tk.Label(self.root)  # initialize image panel
    self.panel2.place(x=400, y=65, width=275, height=275)

    self.T = tk.Label(self.root)
    self.T.place(x=60, y=5)
    self.T.config(text="Sign Language To Text Conversion", font=("Courier", 30, "bold"))

    self.panel3 = tk.Label(self.root)  # Current symbol
    self.panel3.place(x=500, y=540)
    self.T1 = tk.Label(self.root)
    self.T1.place(x=10, y=540)
    self.T1.config(text="Character :", font=("Courier", 30, "bold"))

    self.panel4 = tk.Label(self.root)  # Word
    self.panel4.place(x=220, y=595)
    self.T2 = tk.Label(self.root)
    self.T2.place(x=10, y=595)
    self.T2.config(text="Word :", font=("Courier", 30, "bold"))

    self.panel5 = tk.Label(self.root)  # Sentence
    self.panel5.place(x=350, y=645)
    self.T3 = tk.Label(self.root)
    self.T3.place(x=10, y=645)
    self.T3.config(text="Sentence :", font=("Courier", 30, "bold"))

    self.T4 = tk.Label(self.root)
    self.T4.place(x=250, y=690)
    self.T4.config(text="Suggestions :", fg="red", font=("Courier", 30, "bold"))

    self.bt1 = tk.Button(self.root, command=self.action1, height=0, width=0)
    self.bt1.place(x=26, y=745)
    self.bt2 = tk.Button(self.root, command=self.action2, height=0, width=0)
    self.bt2.place(x=325, y=745)
    self.bt3 = tk.Button(self.root, command=self.action3, height=0, width=0)
    self.bt3.place(x=625, y=745)

    self.str = ""
    self.word = " "
    self.current_symbol = "Empty"
    self.photo = "Empty"

    self.video_loop()
import spacy
from hunspell import Hunspell
import numpy as np
from scipy import spatial
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from PIL import ImageFont, ImageDraw
import networkx as nx
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt
from word import Word

nlp = spacy.load('en_core_web_md', parsed=False)
h = Hunspell()


class PreProcessing():
    # words are user input per line (e.g. keyword1)
    def __init__(self, stop_words_file='stop_words.txt',
                 negation_stop_words_file='negation_stop_words.txt',
                 max_font_size=120, min_font_size=30, min_characters=3,
                 min_words_cluster=300, font="Verdana"):
        self.corpus = []  # raw data that needs to be treated for individual submissions
        self.font = font
        self.words = []  # words of each iteration
        self.stop_words = [w.strip('\n').lower() for w in open(stop_words_file).readlines()]
        self.neg_stop_words = [w.strip('\n').lower() for w in open(negation_stop_words_file).readlines()]
        self.max_font_size = max_font_size
        self.min_font_size = min_font_size
        self.min_characters = min_characters
        self.entites_freq = {}  # somewhat temp?
from hunspell import Hunspell h = Hunspell("ko", hunspell_data_dir='ko') if __name__ == "__main__": answer = h.spell("안녕하세요으") print(answer) answer2 = h.spell("안녕하세") print(answer2) answer3 = h.suggest("안녕하세요으") print(answer3)
# coding=utf-8
import locale
from datetime import datetime

import pytz
from rasa_core_sdk import Action, Tracker
from rasa_core_sdk.events import SlotSet, UserUtteranceReverted
from pymongo import MongoClient
from rasa_core_sdk.forms import FormAction, EntityFormField, FormField
from Database import Database
from hunspell import Hunspell

hsp = Hunspell('es_ANY')
# collection = Database('kb', 'Calendarios').collection


# Actions Eventos
class ActionLogIn(FormAction):
    RANDOMIZE = False

    @staticmethod
    def required_fields():
        return [
            EntityFormField("matricula", "matricula"),
            EntityFormField("password", "password"),
        ]

    def name(self):
        return 'action_login_form'

    def submit(self, dispatcher, tracker, domain):
def __init__(self):
    self.checker = Hunspell()
def refresh_dict(self):
    """Create a new Hunspell object from the specified dictionary file."""
    self.hunspell = Hunspell('index', hunspell_data_dir=self.dictionary_directory)