def initSpellchecker(self):
    """Initialise the enchant-based spellchecker.

    Reads the dictionary directory and language from settings, stores the
    chosen dictionary in self.dict and registers the personal word list.
    Returns None (leaving the spellchecker unconfigured) on any failure.
    """
    # TODO: disable spellchecker icon in case of not working enchant
    try:
        import enchant
        spellDictDir = settings.get('spellchecker:directory')
        if spellDictDir:
            # Tuple comparison fixes the old `major >= 1 and minor >= 6`
            # check, which wrongly rejected releases such as 2.0
            # (minor 0 < 6) even though they are newer than 1.6.
            if (enchant.__ver_major__, enchant.__ver_minor__) >= (1, 6):
                enchant.set_param("enchant.myspell.dictionary.path", spellDictDir)
            else:
                print("Your pyenchant version is to old. Please "
                      "upgrade to version 1.6.0 or higher, if you want "
                      "to use spellchecker.")
                return None
        spellLang = settings.get('spellchecker:lang')
        if spellLang in enchant.list_languages():
            # enchant.dict_exists(spellLang) does not work for me on linux...
            self.dict = enchant.Dict(spellLang)
        else:
            # try dictionary based on the current locale
            try:
                self.dict = enchant.Dict()
                settings.set('spellchecker:lang', self.dict.tag)
            except Exception:
                # we do not have a working dictionary...
                return None
        if self.dict:
            self.usePWL(self.dict)
    except Exception:
        print("can not start spellchecker!!!")
        import traceback
        traceback.print_exc()
    return None
def aspell(string):
    """Spell-check *string* against the Russian aspell dictionary.

    Returns ``(False, [])`` when the word is accepted, otherwise
    ``(True, suggestions)`` with the checker's suggestion list.
    """
    enchant.set_param("enchant.aspell.dictionary.path", "./aspell6-ru-0.99f7-1")
    # Install the bundled ru_RU dictionary files when none is available.
    if not enchant.dict_exists('ru_RU'):
        base = os.path.dirname(enchant.__file__).replace('\\', '/')
        copyfile(
            './resources/aspell/ru_RU.dic',
            base + '/share/enchant/myspell/ru_RU.dic')
        copyfile(
            './resources/aspell/ru_RU.aff',
            base + '/share/enchant/myspell/ru_RU.aff')
    checker = enchant.Dict("ru_RU")
    if checker.check(string):
        # Word is spelled correctly — no suggestions needed.
        return False, []
    # Misspelled: report the suggestion list.
    return True, checker.suggest(string)
def initSpellchecker(self): # TODO: disable spellchecker icon in case of not working enchant try: import enchant spellDictDir = settings.get('spellchecker:directory') if spellDictDir: if enchant.__ver_major__ >= 1 and enchant.__ver_minor__ >= 6: enchant.set_param("enchant.myspell.dictionary.path", spellDictDir) else: print "Your pyenchant version is to old. Please " \ "upgrade to version 1.6.0 or higher, if you want " \ "to use spellchecker." return None spellLang = settings.get('spellchecker:lang') if enchant.dict_exists(spellLang): self.dict = enchant.Dict(spellLang) else: # try dictionary based on the current locale try: self.dict = enchant.Dict() settings.set('spellchecker:lang', self.dict.tag) except: # we don not have working dictionary... return None if self.dict: self.usePWL(self.dict) except: print "can not start spellchecker!!!" import traceback traceback.print_exc() return None
def get_spellchecker_languages(directory = None):
    """ Check if spellchecker is installed and provide list of languages """
    # Returns a sorted list of available dictionary language tags,
    # or None when pyenchant is missing or fails to initialise.
    try:
        import enchant
        # Optionally register an extra myspell dictionary search path.
        if (directory):
            enchant.set_param("enchant.myspell.dictionary.path", directory)
        langs = enchant.list_languages()
        return sorted(langs)
    except:
        # NOTE(review): bare except swallows every error type; the
        # traceback below is the only diagnostic the caller gets.
        print "can not start spellchecker!!!"
        import traceback
        traceback.print_exc()
        return None
def get_spellchecker_languages(directory=None): """ Check if spellchecker is installed and provide list of languages """ try: import enchant if (directory): enchant.set_param("enchant.myspell.dictionary.path", directory) langs = enchant.list_languages() return sorted(langs) except: print "can not start spellchecker!!!" import traceback traceback.print_exc() return None
#!/usr/bin/env python # Purpose: find words in the word puzzle game "Alpha Omega" # Syntax: ./findword.py from itertools import permutations import enchant enchant.set_param("enchant.myspell.dictionary.path", "/opt/local/var/macports/software/aspell") #print enchant.list_dicts() d = enchant.Dict("en_US") # type in a series of letters print "Provide the input letters, please!" try: letters = raw_input() print "The input letters are: ", letters except ValueError: print "Not a series of letters" # find all combinations of letters and check if they are a word perms = [''.join(p) for p in permutations( letters )] newperms = set( perms ) i = 0 for elem in newperms: if d.check( elem ) == True: print "A possible word is: ", elem i = 1 if i == 0: print "Can't find a word with the letters."
def set_dictionary_path(cls, path):
    """Register *path* as an additional myspell dictionary search path
    and rebuild the class-level language tables from enchant."""
    enchant.set_param('enchant.myspell.dictionary.path', path)
    languages = []
    for code in enchant.list_languages():
        languages.append((code, locales.code_to_name(code)))
    SpellChecker.languages = languages
    SpellChecker._language_map = dict(SpellChecker.languages)
from rules import rules_back, context_rules
import numpy as np
from collections import Counter
from nltk.metrics import edit_distance
from shutil import copyfile
# import matplotlib.pyplot as plt


def disable_print(*args):
    # No-op stand-in used to silence print-style debugging output.
    pass


def disable_pprint(*args):
    # No-op stand-in used to silence pprint-style debugging output.
    pass


mystem = Mystem()
enchant.set_param("enchant.aspell.dictionary.path", "./aspell6-ru-0.99f7-1")
# Install the bundled ru_RU dictionary files into pyenchant's share
# directory when no ru_RU dictionary is available.
if not enchant.dict_exists('ru_RU'):
    copyfile('./resources/aspell/ru_RU.dic',
             os.path.dirname(enchant.__file__).replace('\\', '/') +
             '/share/enchant/myspell/ru_RU.dic')
    copyfile('./resources/aspell/ru_RU.aff',
             os.path.dirname(enchant.__file__).replace('\\', '/') +
             '/share/enchant/myspell/ru_RU.aff')
# res = list(''.join(o) for o in product(*d))
# res.remove(low)


def old_process(test):
    ''' this version includes the frequency check, aspell, hunspell and mystem '''
    # load the frequency dictionary
    big_ru = {}
    # NOTE(review): function body continues beyond this chunk.
    with open('./Freq2011/freqrnc2011.csv') as rus:
from pymongo import Connection, DESCENDING
from random import choice
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer,sent_tokenize
import enchant
# Hard-coded myspell dictionary path inside a pyenchant 1.6.5 egg
# (Python 2.6 era); breaks if pyenchant is installed anywhere else.
enchant.set_param("enchant.myspell.dictionary.path","/usr/lib/python2.6/site-packages/pyenchant-1.6.5-py2.6.egg/enchant/share/enchant/myspell")


class MongoDBSequence:
    # Iterable wrapper over a MongoDB collection yielding raw documents.
    def __init__(self,host='localhost',port=27017,db='learner',collection='news'):
        self.conn=Connection(host,port)
        self.collection=self.conn[db][collection]
        # Generator over the collection; consumed once per instance.
        self.data=self._get_data()

    def __len__(self):
        # Number of documents in the underlying collection.
        return self.collection.count()

    def _get_data(self):
        # Lazily yield every document from a fresh cursor.
        res=self.collection.find()
        for item in res:
            yield item

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: delegate to the generator.
        #f=lambda d:d.get(field)$
        return self.data.next()


class RandomSentence:
# This is based on # https://github.com/wiki-ai/editquality/blob/master/ipython/reverted_detection_demo.ipynb # and only looks at single diffs import json import sys import mwapi import bz2 import enchant enchant.set_param("enchant.myspell.dictionary.path", r"/data/project/kokolores/dicts/usr/share/myspell/dicts/") d = enchant.Dict("de-DE") print("Load dataset...") with bz2.open("../datasets/datasets/dewiki_10000.json.bz2", "rt") as datafile: dataset = json.load(datafile) length = len(dataset) print("Loaded dataset of length %d" % length) training_set = dataset[:int(length / 1.33)] testing_set = dataset[int(length / 1.33):] from revscoring.features import wikitext, revision_oriented, temporal from revscoring.languages import german features = [ # Catches long key mashes like kkkkkkkkkkkk wikitext.revision.diff.longest_repeated_char_added, # Measures the size of the change in added words wikitext.revision.diff.words_added,
    # Restore the saved cursor position into the editor.
    print('[DEBUG] Cursor position loaded', data['curPos'])
    cursor.setPosition(data['curPos'])
    window.editor.setTextCursor(cursor)
else:
    newFile()
# Optional session values: word count and autosave interval.
if data.get('cnt'):
    count = data['cnt']
if data.get('ast'):
    autosaveTime = data['ast']
# Fall back to the default window geometry when none was saved.
if not data.get('w'):
    data['w'] = 700
    data['h'] = 750
window.resize(data['w'], data['h'])
# Editor fills the window minus 45px — presumably chrome/status bar;
# TODO confirm.
window.editor.setFixedHeight(window.height() - 45)
window.editor.setFixedWidth(window.width())
del data
with open('style.qss') as f:
    window.editor.setStyleSheet(f.read())
# Empty path: rely on hunspell's default dictionary locations.
en.set_param("enchant.hunspell.dictionary.path", '')
##############################################################################################################################################
loadDicts()
window.show()
autosave = AutoSave()
autosave.start()
sys.exit(app.exec_())
""" Author: Kyle Martin Email: [email protected] Features to be used on texts """ import enchant import spacy import numpy as np nlp = spacy.load('de') # enchant setup enchant.set_param('enchant.myspell.dictionary.path', 'dict_de') spell_check = enchant.Dict('de_DE_frami') def spell_checker(doc): """ uses an open office german dictionary and the `enchant` library prints the incorrect words to the console. """ words = [token.text for token in doc if token.is_punct is False] correct_words = 0 incorrect_words = [] for word in words: if spell_check.check(word) is True: correct_words += 1 else: incorrect_words.append(word) if len(words) > 1:
def main(args=None):
    """Run as a standalone script. You can simulate running the script
    internally by just passing a list with arguments in args; these will
    be treated as though they were the actual command line arguments."""
    #Determin name of current command
    if __name__ == "__main__":
        cmd_name = os.path.basename(sys.argv[0])
    elif args is not None and len(args) > 0:
        cmd_name = "welshtools %s" % args[0]
    elif len(sys.argv) > 1:
        cmd_name = "welshtools %s" % os.path.basename(sys.argv[1])
    else:
        cmd_name = __name__
    #Parse Command Line Arguments
    usage = "Usage: %s [options] SOURCE_FILE DEST_FILE" % cmd_name
    epilog = ("Reads SOURCE_FILE line by line and writes a reformatted and "
              "filtered list to DEST_FILE. This is intended to be run on the "
              "frequency lists from the Cronfa Electroneg o Gymraeg and not "
              "guaranteed to work with differently formatted input files. "
              "Please note that the SOURCE_FILE must be converted to utf-8 "
              "before running the script or the script will fail. The output "
              "file is always written in utf-8.")
    parser = OptionParser(usage=usage,
                          version="%s %s" % (cmd_name, shared.__version__),
                          epilog=epilog)
    parser.add_option(
        "-f", "--format", dest="format", metavar="STR",
        help=("Format to use for output file. {WORD} is replaced "
              "with the word and {FREQ} with the frequency. This "
              "can include the control characters \\\\, \\r, \\n,"
              " and \\t. Default: \"{WORD},{FREQ}\\n\""),
        default="{WORD},{FREQ}\\n")
    parser.add_option(
        "-s", "--strict", action="store_true", dest="strict",
        help=("If --strict is specified, the script will not only"
              " exclude words which are found in the English "
              "dictionary or contain non-Welsh orthographic "
              "characters, but will also strip acute accents and "
              "remove contractions (e.g. 'r), and words which are"
              " hyphenated or contain j or J."))
    parser.add_option(
        "-S", "--summary", action="store_true", dest="print_summary",
        help=("Print a summary of how many "
              "entries were read, written and excluded at the end "
              "of the script. Note that this ignores --quiet."))
    parser.add_option(
        "-q", "--quiet", action="store_true", dest="quiet",
        help="Supress all command line output except for errors.")
    #Parse arguments
    if args is None:
        (opts, args) = parser.parse_args()
    else:
        (opts, args) = parser.parse_args(args)
    if len(args) != 2:
        print("Error: This command requires two arguments. Try `%s --help'." %
              cmd_name)
        return errno.EINVAL
    if opts.quiet:
        opts.verbose = False
    else:
        opts.verbose = True
    if opts.verbose:
        print("Opening source and destination files... ", end="")
    try:
        fin = codecs.open(args[0], "r", "utf8")
        fin_size = os.path.getsize(args[0])
    except IOError as ex:
        print(
            "\nError: Could not open SOURCE_FILE (" + args[0] +
            ") for reading:", ex)
        return errno.EIO
    try:
        fout = codecs.open(args[1], "w+", "utf8")
    except IOError as ex:
        print(
            "\nError Could not open DEST_FILE (" + args[1] +
            ") for writing:", ex)
        return errno.EIO
    if opts.verbose:
        print("Done.")
    #Load dictionaries
    if opts.verbose:
        print("Loading Enchant dictionaries for en_US, en_GB and cy_GB... ",
              end="")
    try:
        enchant.set_param("enchant.myspell.dictionary.path",
                          "./geiriadur-cy/dictionaries")
        d_us = enchant.Dict("en_US")
        d_gb = enchant.Dict("en_GB")
        d_cy = enchant.Dict("cy_GB")
    except Exception as ex:  #pylint: disable=broad-except
        print(
            "\nError: Could not open Enchant dictionaries (en_US, en_GB, cy_GB):",
            ex)
        return errno.ENOPKG
    if opts.verbose:
        print("Done.")
    #Set string of allowed characters
    welsh_chrs_strict = set('ABCDEFGHILMNOPRSTUWYabcdefghilmnoprstuwy\\/+%')
    welsh_chrs_all = set(
        ('ABCDEFGHIJLMNOPRSTUWYabcdefghijlmnoprstuwy'
         'ÄËÏÖÜẄŸäëïöüẅÿÂÊÎÔÛŴŶâêîôûŵŷÁÉÍÓÚẂÝáéíóúẃýÀÈÌÒÙẀỲàèìòùẁỳ'
         '\'-'))
    #Set mappings from CEG transcription to UTF8
    # NOTE(review): in both tables 'Y%' maps to 'Ŷ' (circumflex) rather than
    # 'Ÿ' (diaeresis), and 'E+' maps to 'Ê ' with a trailing space — both
    # look like typos; confirm against the CEG transcription scheme before
    # changing, since output files may depend on current behaviour.
    if opts.strict:
        if opts.verbose:
            print("Mapping mode: strict.")
        #Strip /, map % onto ¨, map \ onto `, and map + onto ^
        mapping = {
            '/': '',
            'a%': 'ä', 'e%': 'ë', 'i%': 'ï', 'o%': 'ö', 'u%': 'ü',
            'y%': 'ÿ', 'w%': 'ẅ', 'A%': 'Ä', 'E%': 'Ë', 'I%': 'Ï',
            'O%': 'Ö', 'U%': 'Ü', 'Y%': 'Ŷ', 'W%': 'Ẅ',
            'a\\': 'à', 'e\\': 'è', 'i\\': 'ì', 'o\\': 'ò', 'u\\': 'ù',
            'y\\': 'ỳ', 'w\\': 'ẁ', 'A\\': 'À', 'E\\': 'È', 'I\\': 'Ì',
            'O\\': 'Ò', 'U\\': 'Ù', 'Y\\': 'Ỳ', 'W\\': 'Ẁ',
            'a+': 'â', 'e+': 'ê', 'i+': 'î', 'o+': 'ô', 'u+': 'û',
            'y+': 'ŷ', 'w+': 'ŵ', 'A+': 'Â', 'E+': 'Ê ', 'I+': 'Î',
            'O+': 'Ô', 'U+': 'Û', 'Y+': 'Ŷ', 'W+': 'Ŵ'
        }
    else:
        if opts.verbose:
            print("Mapping mode: relaxed.")
        #Map / onto ´, map % onto ¨, map \ onto `, and map + onto ^
        mapping = {
            'a/': 'á', 'e/': 'é', 'i/': 'í', 'o/': 'ó', 'u/': 'ú',
            'y/': 'ý', 'w/': 'ẃ', 'A/': 'Á', 'E/': 'É', 'I/': 'Í',
            'O/': 'Ó', 'U/': 'Ú', 'Y/': 'Ý', 'W/': 'Ẃ',
            'a%': 'ä', 'e%': 'ë', 'i%': 'ï', 'o%': 'ö', 'u%': 'ü',
            'y%': 'ÿ', 'w%': 'ẅ', 'A%': 'Ä', 'E%': 'Ë', 'I%': 'Ï',
            'O%': 'Ö', 'U%': 'Ü', 'Y%': 'Ŷ', 'W%': 'Ẅ',
            'a\\': 'à', 'e\\': 'è', 'i\\': 'ì', 'o\\': 'ò', 'u\\': 'ù',
            'y\\': 'ỳ', 'w\\': 'ẁ', 'A\\': 'À', 'E\\': 'È', 'I\\': 'Ì',
            'O\\': 'Ò', 'U\\': 'Ù', 'Y\\': 'Ỳ', 'W\\': 'Ẁ',
            'a+': 'â', 'e+': 'ê', 'i+': 'î', 'o+': 'ô', 'u+': 'û',
            'y+': 'ŷ', 'w+': 'ŵ', 'A+': 'Â', 'E+': 'Ê ', 'I+': 'Î',
            'O+': 'Ô', 'U+': 'Û', 'Y+': 'Ŷ', 'W+': 'Ŵ'
        }
    #Parse format string
    if opts.verbose:
        print("Format string:", '"' + opts.format + '".')
    format_mappings = {'\\\\': '\\', '\\r': "\r", '\\n': "\n", '\\t': "\t"}
    for k, v in format_mappings.items():
        opts.format = opts.format.replace(k, v)
    #Process files
    if opts.verbose:
        print("Processing word list...")
        shared.progress(0, fin_size)
    count_inlines = 0
    count_outlines = 0
    for line in fin:
        count_inlines += 1
        #Split line into freq and word
        (freq, word) = line.strip().split("\t")
        #IF STRICT: Skip words with hyphens and non-Welsh characters before mapping
        if opts.strict and not set(word).issubset(welsh_chrs_strict):
            continue
        #Map CEG transcriptions onto UTF8 characters.
        for k, v in mapping.items():
            word = word.replace(k, v)
        #Skip words which have non-Welsh characters after mapping
        if not set(word).issubset(welsh_chrs_all):
            continue
        #Skip words which are more than one chr and in the English dictionaries
        #unless they are in the Welsh dictionary
        if not d_cy.check(word):
            if len(word) > 1 and (d_us.check(word) or d_gb.check(word)):
                continue
        #Format word
        formatted = opts.format.format(WORD=word, FREQ=freq)
        #Write to output file
        count_outlines += 1
        fout.write(formatted)
        #Show progress
        if opts.verbose:
            shared.progress(fin.tell(), fin_size)
    if opts.verbose:
        print("\nDone.")
    if opts.print_summary:
        print("Summary:")
        print(" Entries in Source: %s" % count_inlines)
        print(" Entries in Output: %s" % count_outlines)
        print(" Excluded Entries: %s" % (count_inlines - count_outlines))
    #Close input and output files
    fin.close()
    fout.close()
    #Return clean exit code
    return 0
def main(args):
    """Tokenize a corpus (single file, TSV corpus, or directory of .txt
    files) using enchant GB dictionaries and a personal word list.

    The output location is derived from the input path plus a suffix that
    encodes the active flags.
    """
    with open(args.corpus_bigrams) as json_file:
        bigrams = json.load(json_file)
    # Define additional info to add to output path
    suffix = "-tok"
    sp_str = "-no_sp" if args.disable_spell_check else ""
    bigram_str = "-bi" if args.bigrams else ""
    lower_str = "-lower" if args.lower else ""
    lemma_str = "-lemma" if args.lemma else ""
    street_str = "-streets" if args.street_sub else ""
    stop_str = "-tng" if args.disable_stopwords else ""  # tng = topical ngrams
    suffix += sp_str + bigram_str + lower_str + lemma_str + street_str + stop_str
    if not args.output_dir_base:
        base = args.corpus_dir if not args.filepath else os.path.dirname(args.filepath)
        output_dir = base.rstrip("/") + suffix
    else:
        output_dir = args.output_dir_base.rstrip("/") + suffix
    # Create output directory
    if not args.tsv_corpus and not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if not args.filepath and not args.tsv_corpus:
        print(timestamp() + " Tokenizing data to", suffix, file=sys.stderr)
    enchant.set_param("enchant.myspell.dictionary.path", args.myspell_path)
    gb = enchant.DictWithPWL("en_GB")  #, args.pwl_path) # GB isn't working, doesn't recognize 'entrancei' as "entrance i"
    gb_and_pwl = enchant.DictWithPWL("en_GB", args.pwl_path)  # GB isn't working, doesn't recognize 'entrancei' as "entrance i"
    # If processing one file, don't loop!
    if args.filepath:
        if not os.path.splitext(args.filepath)[1] == ".txt":
            print(timestamp() + " Must input text file. Exiting...", file=sys.stderr)
            exit(0)
        output_file = os.path.join(output_dir, os.path.basename(args.filepath))
        # Respect existing output unless --overwrite was given.
        if not args.overwrite and os.path.exists(output_file):
            exit(0)
        # Tokenize single file
        output = tokenize_file(args, args.filepath, gb, gb_and_pwl, bigrams)
        # Merge words if flag is set to true
        if args.merge_words:
            # Create dictionary (personal word list) out of unigrams
            pwl = enchant.request_pwl_dict(args.pwl_path)
            for i,line in enumerate(output):
                output[i] = " ".join(merge_words(args, pwl, line.split(), bigrams))
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        # Write output to new file
        with open(output_file, "w") as f:
            f.write("\n".join(output))
        exit(0)
    else:
        if args.tsv_corpus:
            # TSV mode: one document per line, "id<TAB>year<TAB>text".
            output_file = args.tsv_corpus[:-4] + suffix + ".tsv"
            if not args.overwrite and os.path.exists(output_file):
                print("File", output_file, "exists. Exiting...")
                exit(0)
            with open(args.tsv_corpus, 'r') as f:
                docs = f.read().split("\n")
            # Preserve a header row when present.
            if docs[0].lower() == "id\tyear\ttext":
                idx = 1
                tsv_out = [docs[0]]
            else:
                idx = 0
                tsv_out = []
            for doc in docs[idx:]:
                try:
                    id, year, text = doc.split("\t")
                except ValueError:
                    # Malformed row: skip it silently.
                    continue
                tokenized = tokenize_line(args, text, gb, gb_and_pwl, bigrams)
                tsv_out.append(id + "\t" + year + "\t" + tokenized)
            with open(output_file, "w") as f:
                f.write('\n'.join(tsv_out))
        else:
            # Compile list of files to tokenize
            files = [os.path.join(args.corpus_dir, f)
                     for f in os.listdir(args.corpus_dir)
                     if (os.path.isfile(os.path.join(args.corpus_dir, f)) and f.endswith('.txt'))]
            for i in tqdm(range(len(files))):
                file = files[i]
                # Define path for new tokenized file
                output_file = os.path.join(output_dir, os.path.basename(file))
                if not args.overwrite and os.path.exists(output_file):
                    continue
                # Tokenize single file
                output = tokenize_file(args, file, gb, gb_and_pwl, bigrams)
                # Write output to new file
                with open(output_file, "w") as f:
                    f.write('\n'.join(output))
if data['curPos']:
    # Restore the saved cursor position into the editor.
    print('[DEBUG] Cursor position loaded', data['curPos'])
    cursor.setPosition(data['curPos'])
    window.editor.setTextCursor(cursor)
else:
    newFile()
# Optional session values: word count and autosave interval.
if data.get('cnt'):
    count = data['cnt']
if data.get('ast'):
    autosaveTime = data['ast']
# Fall back to the default window geometry when none was saved.
if not data.get('w'):
    data['w'] = 700; data['h'] = 750
window.resize(data['w'], data['h'])
# Editor fills the window minus 45px — presumably chrome/status bar;
# TODO confirm.
window.editor.setFixedHeight(window.height()-45)
window.editor.setFixedWidth(window.width())
del data
with open('style.qss') as f:
    window.editor.setStyleSheet(f.read())
# Empty path: rely on hunspell's default dictionary locations.
en.set_param("enchant.hunspell.dictionary.path", '')
##############################################################################################################################################
loadDicts()
window.show()
autosave = AutoSave()
autosave.start()
sys.exit(app.exec_())
    enchant_version = 'enchant {}'.format(enchant.__version__)
except ImportError:
    # pyenchant not installed: spell checking is disabled.
    enchant = None
    enchant_version = None

# NOTE(review): 'win32x' can never equal sys.platform — this reset looks
# deliberately disabled; confirm before re-enabling.
if sys.platform == 'win32x':
    # reset sys.platform
    sys.platform = 'win32'

# using PyGObject's copy of libenchant means it won't find the
# dictionaries installed with PyEnchant
if enchant:
    for name in site.getsitepackages():
        dict_path = os.path.join(name, 'enchant', 'share', 'enchant', 'myspell')
        if os.path.isdir(dict_path):
            enchant.set_param('enchant.myspell.dictionary.path', dict_path)
            break

from photini.pyqt import Qt, QtCore, QtGui, QtWidgets


class SpellCheck(QtCore.QObject):
    # Emitted whenever the active dictionary changes.
    new_dict = QtCore.pyqtSignal()

    def __init__(self, *arg, **kw):
        super(SpellCheck, self).__init__(*arg, **kw)
        self.config_store = QtWidgets.QApplication.instance().config_store
        # NOTE(review): eval() of a config value — safe only if the config
        # file is trusted; ast.literal_eval would be safer.
        self.enable(eval(self.config_store.get('spelling', 'enabled', 'True')))
        self.set_dict(self.config_store.get('spelling', 'language'))

    @staticmethod
def prepare_environment():
    """Point enchant's myspell backend at the locally installed dictionaries."""
    dictionary_dir = r"/data/project/kokolores/dicts/usr/share/myspell/dicts/"
    enchant.set_param("enchant.myspell.dictionary.path", dictionary_dir)
import string
import enchant
import nltk

import oce.logger

logger = oce.logger.getLogger(__name__)

# === Config ===
from oce.config import sge_words
from oce.config import sge_chinese_derived_words, sge_malay_derived_words
from oce.langid.constants import valid_pinyin

# === Spellcheckers ===
# Local dictionary directory bundled with the project.
enchant.set_param("enchant.myspell.dictionary.path", "./lib/dict")
# --- Languages and minor variants ---
spelling_languages = {
    "en": ["en_US-large", "en_GB-large"],
    "ms": ["ms_MY"],
    "sge": [],
    "zh": []
    # "sge" and "zh" handled with personal word lists below
}
# --- Corresponding dictionaries ---
# One enchant.Dict per declared variant, keyed language -> variant.
spelling_dictionaries = {}
for language in spelling_languages.keys():
    spelling_dictionaries[language] = {}
    for variant in spelling_languages[language]:
        spelling_dictionaries[language][variant] = enchant.Dict(variant)
# --- SgE word lists ---
    enchant_version = 'enchant {}'.format(enchant.__version__)
except ImportError:
    # pyenchant not installed: spell checking is disabled.
    enchant = None
    enchant_version = None

# NOTE(review): 'win32x' can never equal sys.platform — this reset looks
# deliberately disabled; confirm before re-enabling.
if sys.platform == 'win32x':
    # reset sys.platform
    sys.platform = 'win32'

# using PyGObject's copy of libenchant means it won't find the
# dictionaries installed with PyEnchant
if enchant:
    for name in site.getsitepackages():
        dict_path = os.path.join(
            name, 'enchant', 'share', 'enchant', 'myspell')
        if os.path.isdir(dict_path):
            enchant.set_param('enchant.myspell.dictionary.path', dict_path)
            break

from photini.pyqt import Qt, QtCore, QtGui, QtWidgets


class SpellCheck(QtCore.QObject):
    # Emitted whenever the active dictionary changes.
    new_dict = QtCore.pyqtSignal()

    def __init__(self, *arg, **kw):
        super(SpellCheck, self).__init__(*arg, **kw)
        self.config_store = QtWidgets.QApplication.instance().config_store
        # NOTE(review): eval() of a config value — safe only if the config
        # file is trusted; ast.literal_eval would be safer.
        self.enable(eval(self.config_store.get('spelling', 'enabled', 'True')))
        self.set_dict(self.config_store.get('spelling', 'language'))

    @staticmethod