def find_missing_dict(lang):
    """
    Report the spell-checking dictionary as missing if enchant cannot load it.

    Arguments:
        lang: mapping whose 'aspell' entry is the dictionary tag passed to
            enchant.request_dict().

    Returns:
        An empty list when the dictionary could be loaded. Otherwise a list
        holding a single tuple ('Dictionary', '(none)', packages) where
        packages maps a distribution name to the 'aspell-<tag>' package name.
    """
    missing = []
    try:
        enchant.request_dict(lang['aspell'])
    except Exception:
        # Any failure to load the dictionary counts as "missing", but unlike
        # a bare except this lets KeyboardInterrupt / SystemExit propagate.
        package = 'aspell-%s' % lang['aspell']
        distributions = ('debian', 'fedora', 'gentoo', 'linuxmint', 'ubuntu')
        missing.append(
            ('Dictionary', '(none)', dict.fromkeys(distributions, package))
        )
    return missing
def init(self, *args, **kwargs):
    """
    Initialise the Spell object: run the parent class' init(), load the
    default dictionary and register this object with Commands.

    All positional and keyword arguments are forwarded unchanged to the
    parent class' init().
    """
    super(Spell, self).init(*args, **kwargs)
    # Start with the default language; the dictionary is requested for
    # whatever self.language holds at this point.
    self.language = DEFAULT_LANGUAGE
    self.dictionary = request_dict(self.language)
    # Registration happens last, once language and dictionary are set.
    Commands().register(self)
def find_missing_dict(lang):
    """
    Report the spell-checking dictionary as missing if enchant cannot load it.

    Arguments:
        lang: mapping whose 'aspell' entry is the dictionary tag passed to
            enchant.request_dict().

    Returns:
        An empty list on Windows (no check is performed there) or when the
        dictionary could be loaded. Otherwise a list holding a single tuple
        ('Dictionary', '(none)', packages) where packages maps a
        distribution name to the 'aspell-<tag>' package name.
    """
    if os.name == "nt":
        return []

    # Imported lazily so the module is only needed when the check runs.
    import enchant

    missing = []
    try:
        enchant.request_dict(lang['aspell'])
    except Exception:
        # Any failure to load the dictionary counts as "missing", but unlike
        # a bare except this lets KeyboardInterrupt / SystemExit propagate.
        package = 'aspell-%s' % lang['aspell']
        distributions = ('debian', 'fedora', 'gentoo', 'linuxmint', 'ubuntu')
        missing.append(
            ('Dictionary', '(none)', dict.fromkeys(distributions, package))
        )
    return missing
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, fix near-misses, and compute a score.

    Words shorter than _MIN_WORD_LEN are ignored. For every other word:
      * found in the dictionary: score += 100
      * no suggestion available: score -= 10
      * first suggestion within _MAX_LEVENSHTEIN_DISTANCE edits: the word
        is replaced by that suggestion in the text and score += 5
      * otherwise: the word is left alone and the score is unchanged

    Arguments:
        spelling_lang: language tag given to enchant for both the
            dictionary and the tokenizer.
        txt: the text to check.

    Returns:
        A tuple : (fixed text, score)
    """
    # The whole check runs while holding _ENCHANT_LOCK; the context manager
    # releases it on every exit path.
    with _ENCHANT_LOCK:
        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        # Word positions refer to the original text; 'offset' tracks how much
        # earlier replacements have shifted the text being rebuilt.
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if not suggestions:
                # this word is useless. It may even indicate a bad
                # orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = nltk.metrics.distance.edit_distance(word,
                                                          main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # hm, this word looks like it's in a bad shape
                continue

            logging.debug("Spell checking: Replacing: %s -> %s",
                          word, main_suggestion)

            # let's replace the word by its suggestion
            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, fix near-misses, and compute a score.

    Words shorter than _MIN_WORD_LEN are ignored. For every other word:
      * found in the dictionary: score += 100
      * no suggestion available: score -= 10
      * first suggestion within _MAX_LEVENSHTEIN_DISTANCE edits: the word
        is replaced by that suggestion in the text and score += 5
      * otherwise: the word is left alone and the score is unchanged

    Arguments:
        spelling_lang: language tag given to enchant for both the
            dictionary and the tokenizer.
        txt: the text to check.

    Returns:
        A tuple : (fixed text, score)

    Raises:
        AssertionError: when called on Windows (os.name == "nt").
    """
    if os.name == "nt":
        # Raised explicitly instead of via 'assert': an assert statement is
        # stripped under 'python -O', which made this function silently
        # return None instead of a (text, score) tuple.
        raise AssertionError("check_spelling() not available on Windows")

    with _ENCHANT_LOCK:
        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        # Word positions refer to the original text; 'offset' tracks how much
        # earlier replacements have shifted the text being rebuilt.
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if not suggestions:
                # this word is useless. It may even indicate a bad
                # orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # hm, this word looks like it's in a bad shape
                continue

            logger.debug("Spell checking: Replacing: %s -> %s",
                         word, main_suggestion)

            # let's replace the word by its suggestion
            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
def check_spelling(ocr_lang, txt):
    """
    Check the spelling in the text, fix near-misses, and compute a score.

    The score is the number of words found in the dictionary, minus the
    number of words (of at least MIN_WORD_LEN characters) for which the
    dictionary has no suggestion at all. Words "almost" correct (first
    suggestion within MAX_LEVENSHTEIN_DISTANCE edits) are replaced by that
    suggestion in the text and remain neutral for the score.

    Arguments:
        ocr_lang: OCR language code; only its first 3 characters are used
            to look up the language in pycountry.
        txt: the text to check.

    Returns:
        A tuple : (fixed text, score)
    """
    # Maximum distance from the first suggestion from python-enchant
    MAX_LEVENSHTEIN_DISTANCE = 1
    MIN_WORD_LEN = 4

    # TODO(Jflesch): We are assuming here that we can figure out the best
    # dictionary based on the 3 letters OCR lang. This is a bad assumption
    try:
        language = pycountry.languages.get(terminology=ocr_lang[:3])
    except KeyError:
        language = pycountry.languages.get(bibliographic=ocr_lang[:3])
    spelling_lang = language.alpha2

    words_dict = enchant.request_dict(spelling_lang)
    try:
        tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
    except enchant.tokenize.TokenizerNotFoundError:
        # Fall back to default tokenization if no match for 'lang'
        tknzr = enchant.tokenize.get_tokenizer()

    score = 0
    # Word positions refer to the original text; 'offset' tracks how much
    # earlier replacements have shifted the text being rebuilt.
    offset = 0
    for (word, word_pos) in tknzr(txt):
        if words_dict.check(word):
            score += 1
            continue
        if len(word) < MIN_WORD_LEN:
            continue
        suggestions = words_dict.suggest(word)
        if not suggestions:
            score -= 1
            continue
        main_suggestion = suggestions[0]
        lv_dist = Levenshtein.distance(word, main_suggestion)
        if lv_dist > MAX_LEVENSHTEIN_DISTANCE:
            continue

        # Parenthesised single-argument form: prints the same text under
        # Python 2 and is valid syntax under Python 3.
        print("Spell checking: Replacing: %s -> %s" % (word, main_suggestion))

        # let's replace the word by its suggestion
        pre_txt = txt[:word_pos + offset]
        post_txt = txt[word_pos + len(word) + offset:]
        txt = pre_txt + main_suggestion + post_txt
        offset += (len(main_suggestion) - len(word))

    return (txt, score)
def find_missing_dict(lang):
    """
    Report the spell-checking dictionary as missing if enchant cannot load it.

    Arguments:
        lang: mapping whose 'aspell' entry is the dictionary tag passed to
            enchant.request_dict().

    Returns:
        An empty list on Windows (no check is performed there) or when the
        dictionary could be loaded. Otherwise a list holding a single tuple
        ('Dictionary', '(none)', packages) where packages maps a
        distribution name to the 'aspell-<tag>' package name.
    """
    if os.name == "nt":
        return []

    # Imported lazily so the module is only needed when the check runs.
    import enchant

    try:
        enchant.request_dict(lang['aspell'])
    except Exception:
        # Any failure to load the dictionary counts as "missing", but unlike
        # a bare except this lets KeyboardInterrupt / SystemExit propagate.
        package = 'aspell-%s' % lang['aspell']
        packages = {
            distribution: package
            for distribution in (
                'debian', 'fedora', 'gentoo', 'linuxmint', 'ubuntu',
            )
        }
        return [('Dictionary', '(none)', packages)]
    return []
def correct_spelling(word, lang=LANG_EN):
    """
    Return the dictionary's best guess at the correct spelling of a word.

    :param word: the word requiring correction
    :param lang: language tag of the enchant dictionary to use
    :return: the word itself when the dictionary accepts it; otherwise the
        first suggestion from the dictionary, or the word unchanged when
        the dictionary has no suggestion to offer
    """
    import enchant

    d = enchant.request_dict(lang)
    if d.check(word):
        return word
    suggestions = d.suggest(word)
    # suggest() may return an empty list; indexing it blindly raised
    # IndexError for words the dictionary cannot help with.
    return suggestions[0] if suggestions else word
def runThruDictionary(words):
    """
    Filter and correct a sequence of words against the en_US dictionary.

    Words the dictionary accepts are kept as-is. Other words are replaced
    by the dictionary's first suggestion; words without any suggestion are
    dropped from the result.

    Returns a new list; the input is not modified.
    """
    speller = enchant.request_dict("en_US")
    accepted = []
    for candidate in words:
        if speller.check(candidate):
            accepted.append(candidate)
            continue
        alternatives = speller.suggest(candidate)
        if alternatives:
            # just keep the first suggestion
            accepted.append(alternatives[0])
        # no suggestion at all: unrecoverable garbage, leave it out
    return accepted
def spell_check(self, post, lang):
    """
    Log the mis-spelt words of a post for one language.

    The post text (HTML stripped) is run through a SpellChecker that
    ignores e-mail addresses and URLs; every reported word that the
    language's dictionary also rejects is listed in a single LOGGER.notice
    message. When no dictionary exists for the language, a notice saying
    so is logged instead.
    """
    try:
        speller = enchant.request_dict(lang)
        scanner = SpellChecker(lang, filters=[EmailFilter, URLFilter])
        scanner.set_text(post.text(lang=lang, strip_html=True))
        # Drain the checker first, then re-validate each reported word.
        reported = [problem.word for problem in scanner]
        rejected = []
        for candidate in reported:
            if not speller.check(candidate):
                rejected.append(candidate)
        source = post.fragment_deps(lang)
        listing = ', '.join(rejected)
        LOGGER.notice('Mis-spelt words in %s: %s' % (source, listing))
    except enchant.DictNotFoundError:
        LOGGER.notice('No dictionary found for %s' % lang)
def correct_words(word_list):
    """Takes a list of strings and tries to correct them so that they are
    valid words (i.e. alphabetical).

    Correction procedure, applied to each string:
      (1) make lowercase
      (2) drop it unless it is letters optionally followed by trailing
          punctuation from the set ?,;.!
      (3) remove trailing punctuation
      (4) expand contractions and keep only the first resulting word
      (5) remove a possessive 's
      (6) if the word is not in the en_US dictionary, replace it by the
          first word of the dictionary's first suggestion, or drop it when
          there is no suggestion
      (7) drop the result if it is not purely alphabetical

    The list is modified in place and also returned.

    Returns list of corrected words"""
    english_dict = enchant.request_dict("en_US")  # open up the english dictionary

    corrected = []
    for raw in word_list:
        word = raw.lower()

        # skip values that aren't words (alphabetical, can end with punctuation)
        if not re.match('^[a-z]+[?,;.!]*$', word):
            continue

        # remove punctuation if it appears at the end of a word.
        word = word.rstrip(string.punctuation)

        # break up contractions using contraction library (chooses most
        # likely conversion - can be mistaken), then keep the first word
        word = contractions.fix(word)
        word = word.split()[0]

        # remove "'s" at the ends of words (contraction library doesn't
        # remove possessive 's)
        word = re.sub("\'s$", '', word)

        # spell check words
        if not english_dict.check(word):
            suggestions = english_dict.suggest(word)
            if not suggestions:
                continue
            word = suggestions[0].lower().split()[0]

        # Suggestions may contain hyphens, apostrophes, etc. The original
        # removed such items with list.remove() while iterating the same
        # list, which skips the element after each removal; filtering here
        # checks every word exactly once.
        if word.isalpha():
            corrected.append(word)

    # Slice assignment keeps the in-place contract: callers holding a
    # reference to word_list see the corrected content.
    word_list[:] = corrected
    return word_list
def __init__(self, document, parent=None):
    """
    Build the spell-check dialog and, when a document is given, scan it.

    After the UI is created, a None document ends initialisation early.
    Otherwise the document's plain text is tokenized (HTML chunks, e-mail
    addresses and URLs are filtered out), every distinct token is passed
    to checkWord(), and the misspelt words from Spellcheck.index onwards
    are highlighted.
    """
    super(Spellcheck, self).__init__(parent)
    self.createUI()

    if document is None:
        return

    self.doc = document.toPlainText()

    # copy the document text and strip out HTML, URL's and Email addresses
    tokenize = get_tokenizer("en_US", chunkers=(HTMLChunker,),
                             filters=[EmailFilter, URLFilter])
    self.editDoc = []  # tuples go into this list
    self.editDoc.extend(tokenize(self.doc))

    # First element of each token tuple -> index of its last occurrence.
    self.wordsToCheck = {
        token[0]: position for position, token in enumerate(self.editDoc)
    }
    # >>> Output self.wordsToCheck , unit Test with 10 cases

    self.wordlist = enchant.request_dict("en_GB")
    self.misspeltList = []
    for candidate in self.wordsToCheck.keys():
        self.checkWord(candidate)
    # >>> Plonk a test here

    remaining = self.misspeltList[Spellcheck.index:]
    self.highlightMisspelt(remaining)
import pygame #from sys import argv from gtts import gTTS from enchant import request_dict try: import Image except ImportError: from PIL import Image import pytesseract #from tesseract import image_to_string sentence = pytesseract.image_to_string(Image.open('saying.jpg')) gb_dict = request_dict('en_gb') us_dict = request_dict('en_us') new = ''.join(sentence) sen = new.split() for word in sen: if gb_dict.check(word) or us_dict.check(word): print word, " " tts = gTTS(text=word, lang='en') tts.save("result.mp3") file = 'result.mp3' pygame.init() pygame.mixer.init() pygame.mixer.music.load(file) pygame.mixer.music.play() while pygame.mixer.music.get_busy(): pygame.time.Clock().tick(10) else: print " %s " % \
sep=',', encoding='utf-8') print df #Summarizion text = '' for sent in df['original_sents'].values: text += '.' + sent summarize = Summarization(text, None, senti) final_summary1, final_summary2, final_summary3, neg_final_summary1, neg_final_summary2, neg_final_summary3, counts, eigen_explo = summarize.get_summaries( ) print final_summary1, final_summary2, final_summary3, neg_final_summary1, neg_final_summary2, neg_final_summary3, counts, eigen_explo if __name__ == "__main__": d = spell.request_dict("en_US") nlp = spacy.load('en') senti = SentimentAnalysis() start_time = time.time() filenames = [ f for f in listdir(input_entity_files) if isfile(join(input_entity_files, f)) ] #Parellel on CPU cores #Parallel(n_jobs=cpu_count() - 1, verbose=10, backend="multiprocessing", batch_size="auto")(delayed(processFiles)(fileName,input_entity_files) for fileName in filenames) #for (dirpath, dirnames, filenames) in walk(input_entity_files): for file in filenames: processFiles(file, input_entity_files)
#!/usr/bin/env python # -*- coding: utf-8 -*- import enchant d = enchant.request_dict("fr_FR") #x= "en organisation le savent — lk est d'une extrême banalité. Des person fait bonjour hello" x = "eu l’occasion d‘évo" #1.deliminate the ordinair errors, there are two kinds of ord err: 1,we have to delete it 2,we have to replace it # for the first kind we delete them first #2.for the second kind of err, we can correct them before check it by enchant! #2.check the word #4.replace the the word to abbreviation #this the first kind of err, we have to delete them first error_ens = {''} file = open("error_1") while 1: line = file.readline() if str(line) != '': error_ens.add(str(line).strip()) if not line: break replace_dic = {'bonjour': 'bg'} file = open("replace") while 1: line1 = file.readline() if not line1: break if str(line1).strip() != '':
import json
import sys
from collections import Counter

import enchant
import numpy as np
from gensim import corpora, models, similarities
from nltk import word_tokenize, pos_tag, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from numpy import genfromtxt

questions = []
vocabQuesCount = Counter()
tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
dictionary = enchant.request_dict("en_US")
stemmer = PorterStemmer()


def clean_ques(ques):
    """
    Normalise a question string into a list of stemmed tokens.

    The text is lower-cased, split into \\w+ tokens and each token is
    stemmed with the module-level Porter stemmer.
    """
    ques = ques.lower()
    ques = tokenizer.tokenize(ques)
    # Spell correction is currently disabled:
    # for i in range(len(ques)):
    #     if not enchant.dict_exists(ques[i]):
    #         ques[i] = dictionary.suggest(ques[i])[0]
    # NOTE(review): if this is re-enabled, dict_exists() tests whether a
    # dictionary *for that language tag* exists; dictionary.check() is the
    # call that tests a word's spelling.
    ques = [stemmer.stem(q) for q in ques]
    return ques


# 'json' was used here without being imported; the import is added above.
with open('../data/clean.json') as data_file:
    data = json.load(data_file)
import csv import matplotlib.pyplot as plt import enchant enchant.request_dict("en_US") # get the dictionary path_male = 'ANEW/male.csv' path_female = 'ANEW/female.csv' male_words = [] male_valence_mean = [] male_valence_std = [] male_arousal_mean = [] male_arousal_std = [] female_words = [] female_valence_mean = [] female_valence_std = [] female_arousal_mean = [] female_arousal_std = [] def create_data(): """ Adds all the data from csv files to the respective lists """ global male_words with open(path_male) as f: mreader = csv.reader(f) for row in mreader: male_words.append(row[0]) male_valence_mean.append(float(row[1])) male_valence_std.append(float(row[2])) male_arousal_mean.append(float(row[3]))
def check_spelling(ocr_lang, txt):
    """
    Check the spelling in the text, fix near-misses, and compute a score.

    Words shorter than MIN_WORD_LEN are ignored. For every other word:
      * found in the dictionary: score += 100
      * no suggestion available: score -= 10
      * first suggestion within MAX_LEVENSHTEIN_DISTANCE edits: the word
        is replaced by that suggestion in the text and score += 5
      * otherwise: the word is left alone and the score is unchanged

    Arguments:
        ocr_lang: OCR language code; only its first 3 characters are used
            to look up the language in pycountry.
        txt: the text to check.

    Returns:
        A tuple : (fixed text, score)
    """
    # _ENCHANT_LOCK is only read here, so no 'global' statement is needed.
    # The context manager releases the lock on every exit path.
    with _ENCHANT_LOCK:
        # Maximum distance from the first suggestion from python-enchant
        MAX_LEVENSHTEIN_DISTANCE = 1
        MIN_WORD_LEN = 4

        # TODO(Jflesch): We are assuming here that we can figure out the best
        # dictionary based on the 3 letters OCR lang. This is a bad assumption
        try:
            language = pycountry.languages.get(terminology=ocr_lang[:3])
        except KeyError:
            language = pycountry.languages.get(bibliographic=ocr_lang[:3])
        spelling_lang = language.alpha2

        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        # Word positions refer to the original text; 'offset' tracks how much
        # earlier replacements have shifted the text being rebuilt.
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if not suggestions:
                # this word is useless. It may even indicate a bad
                # orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if lv_dist > MAX_LEVENSHTEIN_DISTANCE:
                # hm, this word looks like it's in a bad shape
                continue

            # Parenthesised single-argument form: prints the same text under
            # Python 2 and is valid syntax under Python 3.
            print("Spell checking: Replacing: %s -> %s"
                  % (word, main_suggestion))

            # let's replace the word by its suggestion
            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
import pygame #from sys import argv from gtts import gTTS from enchant import request_dict try: import Image except ImportError: from PIL import Image import pytesseract #from tesseract import image_to_string sentence=pytesseract.image_to_string(Image.open('saying.jpg')) gb_dict = request_dict('en_gb') us_dict = request_dict('en_us') new=''.join(sentence) sen=new.split() for word in sen: if gb_dict.check(word) or us_dict.check(word): print word, " " tts=gTTS(text=word,lang='en') tts.save("result.mp3") file='result.mp3' pygame.init() pygame.mixer.init() pygame.mixer.music.load(file) pygame.mixer.music.play() while pygame.mixer.music.get_busy(): pygame.time.Clock().tick(10) else: print " %s " % \
import string import collections import copy import enchant from utils import * from constants import * from letter import Letter enchant = enchant.request_dict("en_US") alphabet = list(string.ascii_lowercase) transDict = {'a': "_", 'b': "_", 'c': "_", 'd': "_", 'e': "_", 'f': "_", 'g': "_", 'h': "_", 'i': "_", 'j': "_", 'k': "_", 'l': "_", 'm': "_", 'n': "_", 'o': "_", 'p': "_", 'q': "_", 'r': "_", 's': "_", 't': "_", 'u': "_", 'v': "_", 'w': "_", 'x': "_", 'y': "_", 'z': "_", " ": " "} transDict_de = {'a': "_", 'b': "_", 'c': "_", 'd': "_", 'e': "_", 'f': "_", 'g': "_", 'h': "_", 'i': "_", 'j': "_", 'k': "_", 'l': "_", 'm': "_", 'n': "_", 'o': "_", 'p': "_", 'q': "_", 'r': "_", 's': "_", 't': "_", 'u': "_", 'v': "_", 'w': "_", 'x': "_", 'y': "_", 'z': "_", 'ä': "_", 'ö': "_", 'ü': "_", 'ß': "_", " ": " "} def decrypt(cipher, dictionary): word_text = '' for character in cipher: if character in dictionary: word_text += dictionary[character]
def fetchSuggestion(keyword):
    """
    Return the en_US spelling suggestions for a keyword.

    Each suggestion is returned as a (word, word) pair. The result is
    always a list: map() only produced a list on Python 2, while on
    Python 3 it yields a one-shot iterator that cannot be indexed or
    traversed twice.
    """
    d = enchant.request_dict("en_US")
    return [(w, w) for w in d.suggest(keyword)]
import collections
import csv
import sys

import enchant
import nltk
import numpy as np
from nltk import word_tokenize, pos_tag, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# SnowballStemmer is instantiated below but was never imported.
from nltk.stem.snowball import SnowballStemmer
from numpy import genfromtxt

tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
dictionary = enchant.request_dict("en_US")
stemmer = SnowballStemmer("english")


def clean_ques(ques):
    """
    Normalise a question string into a list of stemmed, spell-corrected
    tokens.

    The text is lower-cased and split into \\w+ tokens. A token the en_US
    dictionary rejects is replaced by the dictionary's first suggestion
    (or kept as-is when there is none). Tokens present in the English
    stop-word list are then dropped and the rest are stemmed.
    """
    ques = ques.lower()
    corrected = []
    for token in tokenizer.tokenize(ques):
        # dictionary.check() tests the word's spelling. The previous
        # enchant.dict_exists(token) asked whether a dictionary exists for
        # a *language* named like the token, which is false for almost
        # every word.
        if not dictionary.check(token):
            suggestions = dictionary.suggest(token)
            # suggest() may return an empty list; keep the token then
            # instead of failing with IndexError.
            if suggestions:
                token = suggestions[0]
        corrected.append(token)
    return [stemmer.stem(q) for q in corrected if q not in stop]
def setDictionary(self):
    """
    Let the user pick a language and switch self.wordlist to its dictionary.

    A DictLangDialog is shown and its selectedLang value is compared with
    the known language names. When it matches and enchant has a dictionary
    for the corresponding tag, self.wordlist is replaced by that
    dictionary; otherwise self.wordlist is left untouched. If anything
    goes wrong while showing the dialog or loading the dictionary, the
    en_GB dictionary is used.
    """
    import dictLangDialog

    # (selectedLang value, enchant tag), tested in this order.
    choices = (
        ("United States", "en_US"),
        ("Chinese", "zh"),
        ("Russian", "ru"),
        ("German", "de_DE"),
        ("French", "fr_FR"),
        ("Norwegian", "no"),
        ("Zulu", "zu"),
        ("Arabic", "ar"),
        ("Hindi", "hi"),
        ("British", "en_GB"),
    )

    dialog = dictLangDialog.DictLangDialog()
    try:
        dialog.show()
        lang = dialog.selectedLang
        for label, tag in choices:
            # Compared with == (not a dict lookup) so that selectedLang
            # values that do not hash like a str still match.
            if lang == label and enchant.dict_exists(tag):
                self.wordlist = enchant.request_dict(tag)
                break
    except Exception:
        # Fall back to British English on any error, but unlike a bare
        # except let KeyboardInterrupt / SystemExit propagate.
        self.wordlist = enchant.request_dict("en_GB")
def main():
    """
    Entry point: queue one image-processing job per (page, algorithm set)
    for every editable page of every editable document, then wait for all
    jobs to finish.

    Sets the module globals g_lang, g_dictionnary, g_tknzr and
    g_start_time, and increments g_nb_total_pages once per queued page.
    Progress is reported with print().
    """
    global g_lang
    global g_dictionnary
    global g_tknzr
    global g_nb_total_pages
    global g_start_time

    print("Will use {} for OCR".format(OCR_TOOL.get_name()))

    print("Initializing dictionnary ...")
    # Default is English; passing any command-line argument switches to
    # French (the argument's value itself is not inspected).
    g_lang = "eng"
    if len(sys.argv) > 1:
        g_lang = "fra"

    # Only the first two letters of the language code are given to enchant.
    g_dictionnary = enchant.request_dict(g_lang[:2])
    try:
        g_tknzr = enchant.tokenize.get_tokenizer(g_lang[:2])
    except enchant.tokenize.TokenizerNotFoundError as exc:
        print("Warning: Falling back to default tokenizer ({})".format(exc))
        g_tknzr = enchant.tokenize.get_tokenizer()
    print("Done")

    print("Loading documents list ...")
    pconfig = config.PaperworkConfig()
    pconfig.read()
    work_dir = pconfig.settings['workdir'].value
    dsearch = docsearch.DocSearch(work_dir)
    dsearch.reload_index()
    print("Documents loaded")
    print("")

    print("Initalizing workers ...")
    manager = WorkerManager()
    manager.start()

    factory = JobFactoryImageProcessing()
    print("Done")

    g_start_time = datetime.datetime.now()

    # From here on the manager is stopped on every exit path.
    try:
        print("Queueing jobs ...")
        nb_docs = 0
        nb_pages = 0
        for doc in dsearch.docs:
            if not doc.can_edit:  # probably not an OCR-ized doc
                continue
            nb_docs += 1
            for page in doc.pages:
                if not page.can_edit:  # probably not an OCR-ized page
                    continue
                nb_pages += 1
                g_nb_total_pages += 1
                # One job per entry of ALGORITHMS for this page.
                for algos in ALGORITHMS:
                    job = factory.make(page, algos)
                    manager.schedule(job)

        print("Queued jobs : {} docs | {} pages".format(nb_docs, nb_pages))

        manager.wait_for_all()
    finally:
        manager.stop()
def __init__(self, languages=None):
    """
    Load one dictionary per language and set up text pre-processing.

    languages: iterable of language tags to load, in lookup order. When it
        is None or empty, the languages attribute found on the instance
        (self.languages) is used instead.
    """
    selected = languages if languages else self.languages

    # Insertion order is preserved so dictionaries keep the given order.
    self.dictionaries = OrderedDict()
    for tag in selected:
        self.dictionaries[tag] = request_dict(tag)

    self.normalizer = PunctuationRemover()
    self.tokenizer = SpaceTokenizer()