def spelling(file_name, contents, language="en_US"):
    """Report spelling errors found in already-rendered text.

    The text is passed in (rather than loaded from ``file_name``) because a
    template is typically run through the render process first; ``file_name``
    is only used to label the report lines.

    PyEnchant must be installed and configured. When it cannot be imported a
    message is printed and the check is reported as passing. Use
    "salmon spell" to make sure it works right.

    :param file_name: name shown in the printed report.
    :param contents: text to spell check.
    :param language: Enchant dictionary tag.
    :return: True when no errors were found or PyEnchant is unavailable,
        False otherwise.
    """
    try:
        from enchant.checker import SpellChecker
        from enchant.tokenize import EmailFilter, URLFilter
    except ImportError:
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Failed to load PyEnchant. Make sure it's installed and salmon spell works.")
        return True

    failures = 0
    chkr = SpellChecker(language, filters=[EmailFilter, URLFilter])
    chkr.set_text(contents)
    for err in chkr:
        # Clamp the start at 0: a negative start index would count from the
        # end of the string and give an empty or wrong context for words in
        # the first 20 characters.
        start = max(0, err.wordpos - 20)
        context = contents[start:err.wordpos + 20]
        print("%s: %s \t %r" % (file_name, err.word, context))
        failures += 1

    if failures:
        print("You have %d spelling errors in %s. Run salmon spell.." % (failures, file_name))
        return False
    return True
def spell_checker(text, dict_language='ru_RU', answer_type='cutted'):
    """Replace misspelled words in ``text`` with suggested corrections.

    The text is split on whitespace and each token is checked on its own.
    Errors of a single character are left alone. Suggestions come from the
    Enchant dictionary; words starting with an upper-case letter are first
    looked up with ``names.search_by_name`` and everything else is passed
    through ``freq.get_freq``.

    :param text: input string.
    :param dict_language: Enchant dictionary tag used for both checking and
        suggesting.
    :param answer_type: ``'cutted'`` substitutes the first suggestion,
        ``'full'`` substitutes ``str()`` of the whole suggestion list.
    :return: the processed tokens joined with single spaces.
    """
    import time  # local import: only needed for the timing printout

    word_checker = enchant.Dict(dict_language)
    # Use the requested language for the checker as well, so detection and
    # suggestions agree.
    text_checker = SpellChecker(dict_language)
    text_proc = text.split()
    start_time = time.time()
    print(text)
    # Iterate over every token, including the last one.
    for position in range(len(text_proc)):
        text_checker.set_text(text_proc[position])
        for err in text_checker:
            if len(err.word) <= 1:
                continue
            fixed = word_checker.suggest(err.word)
            if err.word[0].isupper():
                result = names.search_by_name(err.word, fixed)
                if result == 0:
                    fixed = freq.get_freq(fixed, err.word)
                elif fixed:
                    fixed[0] = result
                else:
                    # No dictionary suggestions: the name lookup result is
                    # the only candidate.
                    fixed = [result]
            else:
                fixed = freq.get_freq(fixed, err.word)
            if len(fixed) != 0:
                pattern = r'\b{}\b'.format(re.escape(err.word))
                if answer_type == 'cutted':
                    text_proc[position] = re.sub(
                        pattern, fixed[0], text_proc[position])
                if answer_type == 'full':
                    text_proc[position] = re.sub(
                        pattern, str(fixed), text_proc[position])
    print('CHECK TIME', time.time() - start_time)
    answer = ' '.join(text_proc)
    return answer
def __init__(self, language):
    """Create the Enchant dictionary and checker for ``language``.

    When no dictionary is installed for the language, both ``self.dict`` and
    ``self.checker`` are set to None so callers can test for availability.

    :param language: Enchant dictionary tag.
    """
    try:
        self.dict = enchant.Dict(language)
        self.checker = SpellChecker(
            language, filters=[EmailFilter, URLFilter, CamelCaseFilter])
    except enchant.DictNotFoundError:
        # Define both attributes; leaving self.dict unset would turn a
        # missing dictionary into an AttributeError on first use.
        self.dict = None
        self.checker = None
def __init__(self):
    """Initialise the Reddit client, spell-check helpers and empty buffers."""
    # Collected URLs, images and text start out empty.
    self._url_list = []
    self._image_url_list = []
    self._image_list = []
    self._text_array = []
    self._spellchecked_array = []
    self._word_conferences = []

    # Counters and running statistics.
    self._counter = 0
    self._wrong_words = 0
    self._new_average = 0

    # Target size as a (width, height) style pair.
    self._size = (1800, 3200)

    # Reddit access via the 'bot1' praw site configuration.
    reddit = praw.Reddit('bot1')
    self._reddit = reddit
    self._subreddit = reddit.subreddit("getmotivated")

    self._context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)

    # en_US dictionary for single words and a checker for running text.
    self._enchant_dict = enchant.Dict("en_US")
    self._chkr = SpellChecker("en_US")
class EnchantSpell:
    """Token-level spell checking backed by an en_US ``SpellChecker``."""

    def __init__(self):
        # Diagnostic output: dictionaries and languages Enchant can see.
        # Parenthesised single-argument print works on Python 2 and 3.
        print(enchant.Broker().list_dicts())
        print(enchant.Broker().list_languages())
        self.__spell_checker__ = SpellChecker(lang='en_US')

    def check_token(self, token):
        """
        Check spelling of a single word.

        The word is accepted if it is correct as given or after
        ``str.capitalize()``.
        :param token: input word
        :return: True/False
        """
        checker = self.__spell_checker__
        return checker.check(token) or checker.check(token.capitalize())

    def check_tokens(self, tokens):
        """
        Check spelling of a list of words
        :param tokens: input word list
        :return: a list of index of mis-spelled words
        """
        return [tid for tid, token in enumerate(tokens)
                if not self.check_token(token)]

    def suggest_correction(self, token):
        """
        Suggest correction to a mis-spelled word
        :param token: input word
        :return: list of suggestion
        """
        return self.__spell_checker__.suggest(token)
def __init__(self, text, dictionary): super(spellChecker, self).__init__() log.debug("Creating the SpellChecker object. Dictionary: %s" % (dictionary,)) = True try: if["app-settings"]["language"] == "system": log.debug("Using the system language") self.checker = SpellChecker(filters=[twitterFilter.TwitterFilter, tokenize.EmailFilter, tokenize.URLFilter]) else: log.debug("Using language: %s" % (languageHandler.getLanguage(),)) self.checker = SpellChecker(languageHandler.getLanguage(), filters=[twitterFilter.TwitterFilter, tokenize.EmailFilter, tokenize.URLFilter]) self.checker.set_text(text) except DictNotFoundError: log.exception("Dictionary for language %s not found." % (dictionary,)) wx_ui.dict_not_found_error() = False if == True: log.debug("Creating dialog...") self.dialog = wx_ui.spellCheckerDialog() widgetUtils.connect_event(self.dialog.ignore, widgetUtils.BUTTON_PRESSED, self.ignore) widgetUtils.connect_event(self.dialog.ignoreAll, widgetUtils.BUTTON_PRESSED, self.ignoreAll) widgetUtils.connect_event(self.dialog.replace, widgetUtils.BUTTON_PRESSED, self.replace) widgetUtils.connect_event(self.dialog.replaceAll, widgetUtils.BUTTON_PRESSED, self.replaceAll) self.check() self.dialog.get_response() self.fixed_text = self.checker.get_text()
def spell_check(request):
    """
    Handle a TinyMCE 4 spellchecker JSON-RPC request.

    The request body carries a language code and the text to check. Each
    misspelled word is mapped to its list of suggestions; on failure an
    ``error`` message is returned instead.

    :param request: Django http request with JSON-RPC payload from TinyMCE 4.
    :type request: django.http.request.HttpRequest
    :return: Django http response containing the JSON-RPC result payload.
    :rtype: django.http.HttpResponse
    """
    payload = json.loads(request.body.decode('utf-8'))
    response = {'id': payload['id']}
    error = None
    try:
        import enchant
        from enchant.checker import SpellChecker

        lang = payload['params']['lang']
        if lang not in enchant.list_languages():
            error = 'Missing {0} dictionary!'.format(lang)
            raise RuntimeError(error)

        checker = SpellChecker(lang)
        checker.set_text(strip_tags(payload['params']['text']))
        suggestions = {}
        for err in checker:
            suggestions[err.word] = err.suggest()
        response['result'] = suggestions
    except ImportError:
        error = 'The pyenchant package is not installed!'
        logger.exception(error)
    except RuntimeError:
        # Raised above for a missing dictionary; `error` is already set.
        logger.exception(error)
    except Exception:
        error = 'Unknown error!'
        logger.exception(error)

    if error is not None:
        response['error'] = error
    return HttpResponse(json.dumps(response),
                        content_type='application/json; charset=UTF-8')
def testAccuracy(inputString):
    """Return how many words in ``inputString`` the en_US checker flags."""
    checker = SpellChecker("en_US")
    checker.set_text(inputString)
    return sum(1 for _ in checker)
def correctText(text):
    """Print each word of ``text`` the en_US checker rejects, with hints.

    The text is case-folded first. For every flagged word up to five close
    matches from the keys of ``data`` are printed; a summary line follows.
    Nothing is returned.
    """
    banner = "*" * 78
    print(banner)
    checker = SpellChecker("en_US", filters=[EmailFilter, URLFilter])
    checker.set_text(text.casefold())

    mistakes = 0
    for mistakes, err in enumerate(checker, 1):
        candidates = get_close_matches(err.word, data.keys(), n=5)
        hint = ", ".join(candidates)
        if candidates:
            print("[-] " + err.word + " Is Not an English Word, Do U Mean : " + hint + " ?")
        else:
            print("[-] " + err.word + " Is Not an English Word, But I Don't Know What The F**k Do u Mean")

    if mistakes:
        print(banner)
        print("[-] You Have " + str(mistakes) + " Mistakes !")
    else:
        print("[+] Its All Correct !")
class SpellCheckHighlighter(QSyntaxHighlighter): def __init__(self, parent, language): QSyntaxHighlighter.__init__(self, parent) self._language = language if not enchantAvailable: self._checker = None return try: self._checker = SpellChecker(self._language) except enchant.DictNotFoundError: self._checker = None'SpellChecking: No dictionary available for language "%s"') % self._language) self._format = QTextCharFormat() self._format.setUnderlineColor(QColor(; self._format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline); def highlightBlock(self, text): if not enchantAvailable or not self._checker: return text = unicode(text) self._checker.set_text( text ) for error in self._checker: self.setFormat(error.wordpos, len(error.word), self._format)
def spellcheck(text):
    """Strip unsupported characters from ``text`` and auto-correct it.

    Every word flagged by the de_CH checker is replaced by its first
    suggestion, or by ``spellchecker_fail`` when there is none.

    :param text: input string.
    :return: the corrected text.
    """
    # Keep letters, digits, basic punctuation, whitespace and the characters
    # \xf6 \xfc \xe4 \xdf. A raw string avoids the invalid "\s" escape
    # warning and yields the same pattern.
    text = re.sub(r'[^A-Za-z0-9.,!?\s\xf6\xfc\xe4\xdf]', '', text)

    chkr = SpellChecker("de_CH", text)
    lasterror = ''
    lastlasterror = ''
    for err in chkr:
        # Skip a word that was already handled in one of the previous two
        # steps.
        if err.word in (lastlasterror, lasterror):
            continue
        lastlasterror = lasterror
        lasterror = err.word

        suggestions = err.suggest()
        if suggestions:
            err.replace(suggestions[0])
        else:
            err.replace(u"spellchecker_fail")
    return chkr.get_text()
class SpellCheckThread(threading.Thread): def __init__(self, root_widget: App, name: str = "<\'SpellCheckThread\'>"): threading.Thread.__init__(self, name=name) self.thread_name = name self.language = "en_US" self.dictionary = SpellChecker( self.language ) # or import enchant and use Dict() to check individual words self.thread_flag = True self.setDaemon(True) self.root_widget = root_widget self.lock_object = threading.Lock() # tag config for misspelled words root_widget.text_area.tag_config("misspelled", foreground="red", underline=True) def run(self): while self.thread_flag: time.sleep(1) self.lock_object.acquire() input_text: str = self.root_widget.text_area.get("1.0", "end-1c") self.dictionary.set_text(input_text) list_of_words = input_text.split() index = "1.0" for word in list_of_words: index =, index, nocase=1, stopindex="end") if not index: break last_index = "%s+%dc" % (index, len(word)) self.root_widget.text_area.tag_remove("misspelled", index, last_index) index = last_index index = "1.0" for error in self.dictionary: index =, index, nocase=1, stopindex="end") if not index: break last_index = "%s+%dc" % (index, len(error.word)) self.root_widget.text_area.tag_add("misspelled", index, last_index) index = last_index self.lock_object.release() def start_spell_check(self): self.start() def stop_spell_check(self): self.thread_flag = False
def spell(text):
    """<word/sentence> - Check spelling of a word or sentence."""
    if len(text.split(" ")) <= 1:
        # Single word: report correctness plus up to ten similar words.
        is_correct = en_dict.check(text)
        similar = ', '.join(en_dict.suggest(text)[:10])
        verdict = "$(green)correct$(c)" if is_correct else "$(red)incorrect$(c)"
        return '"{}" appears to be {} [div] [h1]Similar:[/h1] {}'.format(
            text, verdict, similar)

    # Sentence: replace each misspelled word in place with its top three
    # suggestions.
    checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
    checker.set_text(text)
    found_error = False
    shift = 0  # how far earlier replacements have moved later positions
    for err in checker:
        found_error = True
        begin = err.wordpos + shift
        end = begin + len(err.word)
        marked = "[h1]{}[/h1]".format('/'.join(err.suggest()[:3]))
        shift += len(marked) - len(err.word)
        text = text[:begin] + marked + text[end:]
    return text if found_error else "$(green)Correct$(c)"
def spellcheck(text):
    """Return the number of spelling errors the en_US checker finds in ``text``."""
    checker = SpellChecker("en_US")
    checker.set_text(text)
    return sum(1 for _ in checker)
def __init__(self, sc_language="en_US", bert_model="distilbert-base-uncased"): = SpellChecker(sc_language) self.tokenizer = AutoTokenizer.from_pretrained(bert_model) self.model = AutoModelForMaskedLM.from_pretrained(bert_model) print("> BERT loaded")
def retSpErrCount(self):
    """Return the number of words in ``self.file_content`` flagged by en_US.

    Only the en_US dictionary is consulted. ``SpellChecker``'s second
    positional parameter is the text to check, not another language, so a
    second language tag passed there is just throw-away text that
    ``set_text`` replaces immediately.
    """
    chkr = SpellChecker('en_US')
    chkr.set_text(self.file_content)
    return sum(1 for _ in chkr)
def control_noerrorwords(textInput, jdlist): """ """ words = [] checker = SpellChecker('en_US') checker.set_text(textInput) for errorword in checker: words.append(errorword.word) words = list(set(words)) crossWords = set(words).difference(set(jdlist)) count = len(crossWords) cvtext = textInput pWordsFile = open('Utility_MyWords.txt', 'r') pWords = set(list(lib_str.split(lib_re.sub('\n',' ', \,' '))) pWordsFile.close() crossWords = crossWords.difference(pWords) if count == 0: for e in words: cvtext = lib_re.sub('(\s+' + e + '\s)', ' ', cvtext) else: for e in crossWords: cvtext = lib_re.sub('(\s+' + e + '\s)', ' ', cvtext) return cvtext
def spell_check(request):
    """
    Handle a TinyMCE 4 spellchecker JSON-RPC request.

    The request body carries a language code and the text to check. Each
    misspelled word is mapped to its list of suggestions; on failure an
    ``error`` message is returned instead.

    :param request: Django http request with JSON-RPC payload from TinyMCE 4.
    :type request: django.http.request.HttpRequest
    :return: Django http response containing the JSON-RPC result payload.
    :rtype: django.http.HttpResponse
    """
    payload = json.loads(request.body.decode("utf-8"))
    response = {"id": payload["id"]}
    error = None
    try:
        import enchant
        from enchant.checker import SpellChecker

        lang = payload["params"]["lang"]
        if lang not in enchant.list_languages():
            error = "Missing {} dictionary!".format(lang)
            raise RuntimeError(error)

        checker = SpellChecker(lang)
        checker.set_text(strip_tags(payload["params"]["text"]))
        suggestions = {}
        for err in checker:
            suggestions[err.word] = err.suggest()
        response["result"] = suggestions
    except ImportError:
        error = "The pyenchant package is not installed!"
        logger.exception(error)
    except RuntimeError:
        # Raised above for a missing dictionary; `error` is already set.
        logger.exception(error)
    except Exception:
        error = "Unknown error!"
        logger.exception(error)

    if error is not None:
        response["error"] = error
    return JsonResponse(response)
def process_single_language(schema_name, language):
    """Count spelling errors per forum post and store them in batches.

    Every row of ``forum_posts`` in the given schema is checked with the
    dictionary for ``language``; one record per post (course id, post id,
    language, error count) is handed to ``batch_insert`` in groups of
    ``BATCH_SIZE``. If the dictionary cannot be loaded the language is
    skipped.

    :param schema_name: schema to read posts from; also stored as course id.
    :param language: Enchant dictionary tag.
    """
    logger.info(
        "Calculating spelling errors for schema {} using language {}".format(
            schema_name, language))

    # load spell checker for this language
    try:
        chkr = SpellChecker(language)
    except Exception:
        # Best effort: any failure to load the dictionary skips the language,
        # without swallowing KeyboardInterrupt/SystemExit.
        logger.warning("Error using dictionary {}, skipping".format(language))
        return

    query = """select id, post_text from forum_posts"""
    results = get_connection(schema_name).execute(query)

    insert_values = []
    for row in results:
        logger.debug(
            "Working on course {}, post {}, with a length of length {} using language {}"
            .format(schema_name, row[0], len(row[1]), language))
        chkr.set_text(row[1])
        cnt = sum(1 for _ in chkr)
        insert_values.append({
            "course_id": schema_name,
            "forum_posts_id": row[0],
            "language": language,
            "score": cnt
        })
        if len(insert_values) == BATCH_SIZE:
            batch_insert(conn, tbl_coursera_message_language, insert_values)
            insert_values = []

    # Flush whatever is left over from the last partial batch.
    batch_insert(conn, tbl_coursera_message_language, insert_values)
def spell(text):
    """spell <word/sentence> -- Check spelling of a word or sentence."""
    if len(text.split(" ")) <= 1:
        # Single word: report validity plus up to ten suggestions.
        is_correct = en_dict.check(text)
        s_string = ', '.join(en_dict.suggest(text)[:10])
        template = ('"{}" appears to be \x02valid\x02! '
                    '(suggestions: {})') if is_correct else \
                   ('"{}" appears to be \x02invalid\x02! '
                    '(suggestions: {})')
        return template.format(text, s_string)

    # Sentence: replace each misspelled word in place with its top three
    # suggestions.
    checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
    checker.set_text(text)
    shift = 0  # how far earlier replacements have moved later positions
    for err in checker:
        begin = err.wordpos + shift
        end = begin + len(err.word)
        marked = "\x02{}\x02".format('/'.join(err.suggest()[:3]))
        shift += len(marked) - len(err.word)
        text = text[:begin] + marked + text[end:]
    return text
def decipher(ciphertext):
    """Guess a Caesar-cipher key by counting spelling errors per shift.

    All 26 shifts are tried; each candidate plaintext is printed with its
    estimated number of English words, and the shift with the fewest en_US
    spelling errors is reported.

    :param ciphertext: text to decipher; non-alphabetic characters are kept.
    :return: a message string naming the most likely key (-1 when no shift
        produced fewer errors than ``len(ciphertext)``).
    """
    checker = SpellChecker("en_US")
    best_key = -1
    least_num_errors = len(ciphertext)
    for key in range(26):
        shifted = []
        for ch in ciphertext:
            if ch.isalpha():
                # 65 = ord('A'), 97 = ord('a')
                offset = 65 if ch.isupper() else 97
                new_pos = (ord(ch) - offset + 26 - key) % 26 + offset
                # chr() exists on Python 2 and 3; unichr() is Python 2 only.
                shifted.append(chr(new_pos))
            else:
                shifted.append(ch)
        plaintext = ''.join(shifted)

        checker.set_text(plaintext)
        num_errors = sum(1 for _ in checker)
        if num_errors < least_num_errors:
            least_num_errors = num_errors
            best_key = key

        en_words = len(plaintext.split()) - num_errors
        print("%i: %s English words: %i" % (key, plaintext, en_words))
    return "%s %i" % ("The key is most likely: ", best_key)
def CorrectBot(interface,command,args,messagetype):
    "!correct [n=previous] [words=all of them] - Try to correct the words in the nth message before this one."
    # NOTE(review): "en_UK" is not the usual tag for British English
    # (en_GB) - confirm a dictionary is installed under this name.
    chkr = SpellChecker("en_UK",filters=[EmailFilter,URLFilter])
    words = ''
    try:
        parts = args.split()
        n = int(parts[0])
        if len(parts) > 1:
            # Everything after the message index is the list of words.
            words = args.partition(" ")[2]
        else:
            words = interface.LastMessages[n].Body
    except Exception:
        # Missing or unparsable arguments: fall back to the latest message.
        n = 0
        words = interface.LastMessages[0].Body

    if not interface.LastMessages[n].IsEditable:
        # Cannot edit the message, so only report the errors.
        SpellBot(interface,'spell',words,messagetype,onlyerror=True)
        return

    text = interface.LastMessages[n].Body
    origtext = text
    chkr.set_text(words)
    for err in chkr:
        suggestions = err.suggest()
        # Compare the first suggestion (not the whole list) with the word.
        if suggestions and suggestions[0] != err.word:
            text = text.replace(err.word, suggestions[0])
    if origtext != text:
        interface.LastMessages[n].Body = text
def read_lang(self, image_name, text): if self.model.readLang(text) == 'en': filename = self.model.getFileName() text = image_to_string( text_original = str(text) # cleanup text text = text_replace(text) persons_list = get_persons_list(text) ignore_words = persons_list + ["!", ",", ".", "\"", "?", '(', ')', '*', '\''] # using enchant.checker.SpellChecker, identify incorrect words spell = SpellChecker("en_US") words = text.split() incorrect_words = [w for w in words if not spell.check(w) and w not in ignore_words] # using enchant.checker.SpellChecker, get suggested replacements suggested_words = [spell.suggest(w) for w in incorrect_words] # replace incorrect words with [MASK] text, text_original = replace_incorrect(incorrect_words, text, text_original) id_mask, model, segments_tensor, tokenizer, tokens_tensor = evaluate_tokens(text) # Predict all tokens with torch.no_grad(): predictions = model(tokens_tensor, segments_tensor) # Refine prediction by matching with proposals from SpellChecker text = predict_word(id_mask, tokenizer, suggested_words, text_original, predictions) else: text = pytesseract.image_to_string(, config='-l jpn') text = re.sub(" ", "", text) return text
def spellcheck_text(self):
    """Replace misspelled words in ``self.text`` with their first suggestion.

    Errors are collected with a de_DE checker over the whole text; each one
    is then substituted everywhere in ``self.text``. Words without any
    suggestion are left unchanged.
    """
    checker = SpellChecker('de_DE')
    dictionary = enchant.Dict('de_DE')

    checker.set_text(self.text)
    misspelled = [err.word for err in checker]

    for word in misspelled:
        suggestions = dictionary.suggest(word)
        if suggestions:
            self.text = self.text.replace(word, suggestions[0])
def filter_ngrams(terms, spelling=False, singletons=True, contains_numeric=False, contains_alpha=False, contains_non_alphanumeric=False): """ Filter n-grams by a variety of features """ chkr = SpellChecker("en_US") print(len(terms), "n-grams before filter") if spelling: for k in list(terms.keys()): chkr.set_text(k) errors = set() for err in chkr: errors.add(err.word) if len(errors) > 0: del terms[k] if singletons: for k,v in list(terms.items()): if len(v) == 1: del terms[k] if contains_numeric: for k in list(terms.keys()): if"[^0-9]",k): del terms[k] if contains_alpha: for k in list(terms.keys()): if"[^a-z]",k): del terms[k] if contains_non_alphanumeric: for k in list(terms.keys()): if"[^[:alnum:]]",k): del terms[k] print(len(terms), "n-grams after filter") return terms
def matchlocations(line): from enchant.checker import SpellChecker chkr = SpellChecker("en_US") global sdx global sdx4 global sdx3 global places i=0 chkr.set_text(line) for err in chkr: #print "error",err.word toprint = err.word word = err.word.upper() mind = 4 replace = [] flag = False soundFlag = False noFlag = False if word in places: line = line.replace(toprint,"<loc>"+word.lower()+"</loc>") else: for place in places: dist = minEditDist(word,place) if dist<mind: replace=[] replace.append(place) mind = dist flag = True else: if dist == mind: replace.append(place) flag == True if flag == True and len(word) > mind: if mind ==1 and len(replace)==1: line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>") else: if(soundex2(word,4) in sdx4 and len(word)>3): line = line.replace(toprint,"<loc>"+sdx4[soundex2(word,4)].lower()+"</loc>") elif(soundex2(word,3) in sdx3 and len(word)>3): line = line.replace(toprint,"<loc>"+sdx3[soundex2(word,3)].lower()+"</loc>") elif(dm(word)[0] in sdx and len(word)>3): line = line.replace(toprint,"<loc>"+sdx[dm(word)[0]].lower()+"</loc>") else: if len(replace) == 1: line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>") else: #print replace for ele in replace: if(dm(ele)[0] == dm(toprint)[0]): line = line.replace(toprint,"<loc>"+ele.lower()+"</loc>") soundFlag = True break if soundFlag == False: line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>") else: if (dm(word)[0] in sdx and len(word)>3): line = line.replace(toprint,"<loc>"+sdx[dm(word)[0]].lower()+"</loc>") line = line.replace('\r','') print line
def spellcheck_hints(args, packages):
    """Print a frequency table of words in package hints that fail spell check.

    The dictionary is extended with the entries of ``words.txt`` and with the
    components of every package name before the ``sdesc``, ``ldesc`` and
    ``message`` hints are checked. ``args`` is not used.
    """
    dictionary = DictWithPWL('en-US')
    checker = SpellChecker(dictionary, filters=[DescFilter])
    counts = {}

    # add technical words not in spell-checking dictionary
    known_words = []
    with open('words.txt') as wordfile:
        for entry in wordfile:
            # drop any trailing comment, then surrounding whitespace
            entry = re.sub(r'#.*$', '', entry).strip()
            dictionary.add(entry)
            known_words.append(entry.lower())

            # XXX: for the moment, to reduce the set of errors, ignore the
            # fact that words.txt gives a canonical capitalization, and
            # accept any capitalization
            dictionary.add(entry.lower())
            dictionary.add(entry.capitalize())

    # add all package names as valid words
    for pkg in packages:
        for part in re.split('[_-]', pkg):
            # remove punctuation characters
            part = re.sub(r'[+]', '', part)
            # strip off any trailing numbers
            part = re.sub(r'[\d.]*$', '', part)

            # both with and without any lib prefix
            for variant in (part, re.sub(r'^lib', '', part)):
                lowered = variant.lower()
                # words listed in words.txt keep their canonical
                # capitalization
                if lowered in known_words:
                    continue
                dictionary.add(lowered)
                dictionary.add(variant)
                dictionary.add(variant.capitalize())

    for pkg in sorted(packages.keys()):
        # debuginfo packages have uninteresting, auto-generated text which
        # contains the package name
        if pkg.endswith('-debuginfo'):
            continue

        # spell-check the spell-checkable keys
        for key in ('sdesc', 'ldesc', 'message'):
            if key not in packages[pkg].hints:
                continue
            checker.set_text(packages[pkg].hints[key])
            # XXX: this is doing all the work to generate suggestions, which
            # we then ignore, so could be written much more efficiently
            for err in checker:
                counts[err.word] = counts.get(err.word, 0) + 1

    # summarize, most frequent first
    for word in sorted(counts, key=counts.get, reverse=True):
        print('%16s: %4d' % (word, counts[word]))
def test_replace_with_empty_string():
    """Testcase for replacing with an empty string (bug #10)"""
    checker = SpellChecker("en_US", ". I Bezwaar tegen verguning.")
    seen = 0
    for err in checker:
        err.replace("")
        # No more than three errors may be reported for this text.
        assert seen < 3
        seen += 1
    assert checker.get_text() == ". I ."
def __init__(self, language="en_US"):
    """Set up dictionary, checker and tokenizer for ``language``.

    Falls back to ``en_US`` (after logging the available languages) when no
    Enchant dictionary exists for the requested language.

    :param language: Enchant dictionary tag.
    """
    if not enchant.dict_exists(language):
        # logging interpolates its arguments with %-formatting, so the
        # placeholders must be %s; "{}" would produce a formatting error
        # instead of the message.
        logging.warning("Spelling_Corrector: Don't have %s , Please check it!!!", language)
        logging.warning("Recommend same language for you: %s", enchant.list_languages())
        language = "en_US"
    self.dict = enchant.Dict(language)
    self.check = SpellChecker(language)
    self.tokenizer = get_tokenizer(language)
def spell_checker_featurizer(feature_counter, essay, essay_set=None):
    """Store the number of spelling errors in ``essay`` as a feature.

    The count is written to ``feature_counter['spell_checker']``; nothing is
    returned. ``essay_set`` is not used.

    Only one dictionary is consulted: ``SpellChecker``'s second positional
    parameter is the text to check, not another language.
    """
    # NOTE(review): "en_UK" is not the usual tag for British English
    # (en_GB) - confirm a dictionary is installed under this name.
    chkr = SpellChecker("en_UK")
    chkr.set_text(essay)
    feature_counter['spell_checker'] = sum(1 for _ in chkr)
def correct(text):
    """Return ``text`` with each en_US spelling error replaced by its first
    suggestion; words without suggestions are kept as they are."""
    checker = SpellChecker("en_US")
    checker.set_text(text)
    for mistake in checker:
        candidates = mistake.suggest()
        if not candidates:
            continue
        mistake.replace(candidates[0])
    return checker.get_text()
def retSpErrCount(self):
    """Return the number of words in ``self.file_content`` flagged by en_US.

    Only the en_US dictionary is consulted. ``SpellChecker``'s second
    positional parameter is the text to check, not another language, so a
    second language tag passed there is just throw-away text that
    ``set_text`` replaces immediately.
    """
    chkr = SpellChecker('en_US')
    chkr.set_text(self.file_content)
    return sum(1 for _ in chkr)
def performSpellCorrection(featureObj):
    """Store a spell-corrected copy of the object's text in its lexical features.

    Each word the en_US checker flags is replaced by ``spell(word)``. The
    same ``featureObj`` is returned.
    """
    original_text = featureObj.getText()
    checker = SpellChecker("en_US", original_text)
    for mistake in checker:
        replacement = spell(mistake.word)
        mistake.replace(replacement)
    corrected_text = checker.get_text()
    featureObj.getLexicalFeatures().setSpellCorrection(corrected_text)
    return featureObj
def test_default_language():
    """A bare SpellChecker uses the default language, or raises without one."""
    default = get_default_language()
    if default is not None:
        assert SpellChecker().lang == default
        return
    with pytest.raises(DefaultLanguageNotFoundError):
        SpellChecker()
def spell_check(response):
    """Return the number of en_US spelling errors found in ``response``."""
    checker = SpellChecker("en_US")
    checker.set_text(response)
    return sum(1 for err in checker if err)
def spellCheck(text):
    """Return ``text`` with each en_US spelling error replaced by the first
    suggestion; words without suggestions are left untouched."""
    checker = SpellChecker("en_US", text)
    for mistake in checker:
        candidates = checker.suggest(mistake.word)
        if not candidates:
            continue
        mistake.replace(candidates[0])
    return checker.get_text()
def spellcheck(sentence):
    """Fix only errors that differ from a suggestion by spaces.

    A flagged word is replaced by the first suggestion that is identical to
    it once spaces are removed from both; other errors are left alone.
    """
    checker = SpellChecker("en_US")
    checker.set_text(sentence)
    for error in checker:
        squashed = error.word.replace(' ', '')
        match = next(
            (candidate for candidate in error.suggest()
             if candidate.replace(' ', '') == squashed),
            None)
        if match is not None:
            error.replace(match)
    return checker.get_text()
def Errores(self):
    """Show the number of es_ES spelling errors in the ``errores`` widget.

    Only the first line of ``self.archivo`` is read and checked.
    NOTE(review): if the whole file was meant to be checked, ``readline``
    should be ``read`` - confirm the intent.
    """
    # The context manager closes the file handle after reading.
    with open(self.archivo, 'r') as texto:
        first_line = texto.readline()

    chkr = SpellChecker("es_ES")
    chkr.set_text(first_line)
    terrores = sum(1 for _ in chkr)
    self.ui.errores.setText(str(terrores))
def spellcheck(self):
    """Return ``[count, words]`` for the en_US spelling errors in ``self.text``."""
    checker = SpellChecker("en_US")
    checker.set_text(self.text)
    words = [err.word for err in checker]
    return [len(words), words]
def has_error(file_path):
    """return boolean indicating whether the file specified by the file_path
    contains spelling mistakes (checked against the en_US dictionary)
    """
    # Read the file contents; the checker must be given the text itself.
    with open(file_path, "r") as file_to_check:
        data = file_to_check.read()

    checker = SpellChecker("en_US")
    checker.set_text(data)
    # The first reported error is enough to answer the question.
    for _ in checker:
        return True
    return False
def user_stats(messages, usermap):
    """Keys: (posts, likes_recieved, likes_given, wordcount, images, misspellings, kicked)

    Build per-user statistics from a list of message dicts.

    :param messages: message dicts; iterated in reverse order.
    :param usermap: mapping of user id to display name.
    :return: dict mapping display name to that user's statistics.
    """
    stats = {}
    checker = SpellChecker('en_US')
    for user_id in usermap:
        stats[usermap[user_id]] = {
            'posts': [],
            'likes_recieved': 0,
            'likes_given': 0,
            'wordcount': 0,
            'images': 0,
            'misspellings': [],
            'kicked': 0,
            'been_kicked': 0
        }

    current_names = {}  # map user id to alias at the time of each message
    for m in reversed(messages):
        current_names[m['sender_id']] = m['name']
        if m['user_id'] == 'system':
            if m['text'] is not None:
                if ' changed name to ' in m['text']:
                    s = m['text'].split(' changed name to ')
                    for uid in current_names:
                        if current_names[uid] == s[0]:
                            current_names[uid] = s[1]
                elif ' removed ' in m['text']:
                    # the last 16 characters of the system text are dropped
                    # before splitting into remover / removed aliases
                    s = m['text'][:-16].split(' removed ')
                    remover = 0
                    removed = 0
                    for uid in current_names:
                        if current_names[uid] == s[0]:
                            remover = uid
                        if current_names[uid] == s[1]:
                            removed = uid
                    if remover != 0 and removed != 0:
                        stats[usermap[remover]]['kicked'] += 1
                        stats[usermap[removed]]['been_kicked'] += 1

        name = usermap[m['sender_id']]
        stats[name]['posts'].append(m)
        stats[name]['likes_recieved'] += len(m['favorited_by'])
        for liker in m['favorited_by']:
            try:
                likername = usermap[liker]
                stats[likername]['likes_given'] += 1
            except KeyError:
                # likes from users missing in usermap are not counted
                pass
        stats[name]['images'] += 1 if len(m['attachments']) > 0 else 0
        if m['text'] is not None:
            stats[name]['wordcount'] += len(m['text'].split(' '))
            checker.set_text(m['text'])
            # Read each word while iterating: the checker yields itself for
            # every error, so materialising it with list() first would
            # record the last word once per error.
            stats[name]['misspellings'] += [error.word for error in checker]

    del stats['GroupMe']
    del stats['GroupMe Calendar']
    del stats['Annie Hughey']
    del stats['Nicole Vergara']
    return stats
def very_rare_long_words(debate, longwords):
    """Return the correctly spelled long words that occur at most once.

    :param debate: lemmatised debate tokens, used for the frequency counts.
    :param longwords: candidate long words.
    :return: list of UTF-8 encoded words with a frequency below 2 that pass
        the en_GB check.
    """
    frequencies = FreqDist(debate)
    checker = SpellChecker("en_GB", debate)
    return [word.encode('utf-8')
            for word in longwords
            if frequencies[word] < 2 and checker.check(word)]
def suggest_correction(file_path):
    """return string representing the spell-corrected content of the file
    specified by the file_path

    Each en_US spelling error is replaced by its first suggestion; words
    without any suggestion are left unchanged.
    """
    # Read the file contents; the checker must be given the text itself.
    with open(file_path, "r") as file_to_check:
        data = file_to_check.read()

    checker = SpellChecker("en_US")
    checker.set_text(data)
    for err in checker:
        suggestions = err.suggest()
        # Guard against an empty suggestion list instead of indexing blindly.
        if suggestions:
            err.replace(suggestions[0])
    return checker.get_text()
def check_errors(self, langs, text):
    """Return the words of ``text`` that are misspelled in every language.

    Languages without an installed Enchant dictionary are ignored.

    :param langs: iterable of Enchant dictionary tags.
    :param text: text to check.
    :return: set of words flagged by all usable languages; an empty set when
        none of the languages is available.
    """
    per_language = []
    for lang in langs:
        if lang not in enchant.list_languages():
            continue
        chkr = SpellChecker(lang)
        chkr.set_text(text)
        per_language.append({err.word for err in chkr})

    # set.intersection() needs at least one argument.
    if not per_language:
        return set()
    return set.intersection(*per_language)
def doSpelling(str):
    """Replace each en_GB spelling error in ``str`` with its best suggestion.

    Words without suggestions are reported on stdout and left unchanged.

    :param str: text to correct (the name shadows the builtin but is part of
        the existing signature).
    :return: the corrected text.
    """
    chkr = SpellChecker("en_GB")  # any english dictionary
    chkr.set_text(str)
    # we can do more similarity ratio checks as well; right now pick the
    # best match
    for err in chkr:
        suggestions = chkr.suggest(err.word)
        if not suggestions:
            print('no suggestions: {0}'.format(err.word))
            continue
        # Literal replacement: the word is not treated as a regex pattern.
        str = str.replace(err.word, suggestions[0])
    # Hand the result back; rebinding the local alone is invisible to the
    # caller.
    return str
def on_data(self, text):
    """Update typo and word counters for each hashtag found in a message.

    ``text`` is a JSON document with a ``text`` field. For every tracked
    hashtag contained in it (case-insensitive), ``typos`` grows by the
    number of en_GB spelling errors and ``words`` by the number of
    space-separated tokens. Always returns True.
    """
    payload = json.loads(text)
    for tracked in self.battle_hashtags:
        needle = tracked.hashtag.value.lower()
        if needle not in payload['text'].lower():
            continue
        body = payload['text']
        checker = SpellChecker('en_GB')
        checker.set_text(body)
        for _ in checker:
            tracked.typos += 1
        tracked.words += len(body.split(' '))
    return True
def long_words(debate):
    """Return the distinct, correctly spelled words longer than 9 characters.

    :param debate: lemmatised debate tokens.
    :return: list of UTF-8 encoded long words that pass the en_GB check.
    """
    candidates = set([token for token in debate if len(token) > 9])
    checker = SpellChecker("en_GB", debate)
    return [word.encode('utf-8') for word in candidates if checker.check(word)]
def average_freqdist(argument, debate):
    """Return the average debate frequency of the argument's distinct words.

    Only words that pass the en_GB check contribute to the total, but the
    total is divided by the number of all distinct words in ``argument``.

    :param argument: iterable of words.
    :param debate: debate tokens used for the frequency distribution.
    :return: the average as a float; 0.0 for an empty argument.
    """
    argument = set(argument)
    # Nothing to average; avoids a division by zero below.
    if not argument:
        return 0.0

    fdist = FreqDist(debate)
    chkr = SpellChecker("en_GB", debate)
    frequency = sum(fdist[w] for w in argument if chkr.check(w))
    return float(frequency) / len(argument)
def __init__(self) : App.__init__(self) self.width = int(self.config.fetch('page_width')) # regarding adding words to the spell checker, I didn't # want to deal with the overhead of an user interface here # so I simply added a list of words to .config/enchant/en_US.dic # (local dictionary) I think that is what some other UI would # do anyway and now these words are available to other apps. self.chkr = SpellChecker("en_US") # keep a local copy cached to minimize the number of # requests to linkedin and to provide "offline" access # to the most recently downloaded version. # remember to delete this file after you edit your # linkedin profile try : f = open(self.home + '/.jitresume', 'r'); profile =; except IOError : conn = LinkedIn() profile = conn.getProfile() f = open(self.home + '/.jitresume', 'w'); f.write(profile) finally : = cjson.decode(profile) f.close()
def get_tweet_typos(tweet):
    """
    Collect the en_GB spelling errors in ``tweet.text``.

    The list is also stored on the tweet as a JSON string in ``tweet.typos``.
    :param tweet: object with a ``text`` attribute
    :return: list of misspelled words
    """
    checker = SpellChecker("en_GB", filters=[EmailFilter, URLFilter])
    checker.set_text(tweet.text)
    typos = [err.word for err in checker]
    tweet.typos = json.dumps(typos)
    return typos
def smart_search(self, spell_check_string):
    """File ``spell_check_string`` under its ``search_words`` result.

    A fresh checker for ``self.language`` is created, then the string is
    appended to the ``self.decrypted_list`` entry keyed by whatever
    ``self.search_words`` returns for it.
    """
    self.spell_check = SpellChecker(self.language)
    errors = self.search_words(spell_check_string)
    bucket = self.decrypted_list.get(errors)
    if bucket:
        bucket.append(spell_check_string)
    else:
        # Missing (or empty) entry: start a new list.
        self.decrypted_list[errors] = [spell_check_string]
def __init__(self, text, dictionary):
    """Build the spell checking dialog for ``text`` and start checking.

    The checker uses the system language or the one reported by
    ``languageHandler``, depending on the configuration. If no dictionary is
    found an error box is shown and the dialog is destroyed.
    ``dictionary`` is not used here.
    """
    super(spellCheckerDialog, self).__init__(None, 1)
    try:
        if config.main["general"]["language"] == "system":
            checker = SpellChecker()
        else:
            checker = SpellChecker(languageHandler.getLanguage())
        self.checker = checker
        self.checker.set_text(text)
    except DictNotFoundError:
        wx.MessageDialog(None, _(u"A bug has happened. There are no dictionaries available for the selected language in TW Blue"), _(u"Error"), wx.ICON_ERROR).ShowModal()
        self.Destroy()

    panel = wx.Panel(self)

    # Row 1: the misspelled word.
    wordLabel = wx.StaticText(panel, -1, _(u"Mis-spelled word"))
    self.word = wx.TextCtrl(panel, -1)
    wordRow = wx.BoxSizer(wx.HORIZONTAL)
    wordRow.Add(wordLabel)
    wordRow.Add(self.word)

    # Row 2: the context the word appears in.
    contextLabel = wx.StaticText(panel, -1, _(u"Context"))
    self.context = wx.TextCtrl(panel, -1)
    contextRow = wx.BoxSizer(wx.HORIZONTAL)
    contextRow.Add(contextLabel)
    contextRow.Add(self.context)

    # Row 3: suggested replacements.
    suggestLabel = wx.StaticText(panel, -1, _(u"Suggestions"))
    self.suggestions = wx.ListBox(panel, -1, choices=[], style=wx.LB_SINGLE)
    suggestionsRow = wx.BoxSizer(wx.HORIZONTAL)
    suggestionsRow.Add(suggestLabel)
    suggestionsRow.Add(self.suggestions)

    # Row 4: action buttons, each bound to its handler, then the close
    # button.
    buttonRow = wx.BoxSizer(wx.HORIZONTAL)
    actions = (
        (_(u"Ignore"), self.onIgnore),
        (_(u"Ignore all"), self.onIgnoreAll),
        (_(u"Replace"), self.onReplace),
        (_(u"Replace all"), self.onReplaceAll),
    )
    for label, handler in actions:
        button = wx.Button(panel, -1, label)
        self.Bind(wx.EVT_BUTTON, handler, button)
        buttonRow.Add(button)
    buttonRow.Add(wx.Button(panel, wx.ID_CANCEL))

    layout = wx.BoxSizer(wx.VERTICAL)
    for row in (wordRow, contextRow, suggestionsRow, buttonRow):
        layout.Add(row)
    panel.SetSizerAndFit(layout)
    self.check()
def __init__(self, textwidget, **kw): lang = kw.pop('language', 'en_US') filters = kw.pop('filters', [EmailFilter, URLFilter]) chunkers = kw.pop('chunkers', (HTMLChunker,)) self.checker = SpellChecker(lang=lang, filters=filters, chunkers=chunkers) = textwidget'sp_err', background="#CCFB5D", underline=False)
def incorrect_words(request):
    """
    Return the misspelled words of the posted ``text`` as JSON.

    Responds with HTTP 400 when spell checking is disabled in the settings
    or no text was posted.
    """
    if not settings.SPELLCHECK_ENABLED:
        return HttpResponseBadRequest(_("spell checking not supported"))

    text = request.POST.get("text")
    if not text:
        return HttpResponseBadRequest(_("text not provided"))

    checker = SpellChecker(settings.LANGUAGE_CODE)
    checker.set_text(text)
    misspelled = []
    for err in checker:
        misspelled.append(err.word)
    # The word list is wrapped in an outer list under the "data" key.
    return HttpResponse(json.dumps({"data": [misspelled]}))
def spellchecker(data): tweet = [] for row in data: val = row.split("\t") if val[0] == 'Finding0###': tweet.append(val[1].strip('\n')) tweets = pd.DataFrame() tweets['text'] = [tweet for tweet in tweet] for i in tweets['text']: text = i chkr = SpellChecker("en_US", text) for err in chkr: print("{0}\t{1}".format("Findingres6###",err.word + " at position " + str(err.wordpos))) #<---- err.replace("SPAM") print("Text replaced by spam at wrong spelling or words") t = chkr.get_text() print("\n" + t) #<----
def spellCheck(argument):
    """Return the share of words in ``argument`` that are misspelled.

    A word counts as an error only if it fails the en_GB check as given, in
    upper case, capitalised and with a trailing ".", and is not one of the
    known abbreviations (``abb1 + abb2 + abb3``).

    :param argument: list of words.
    :return: errors divided by the number of words; 0.0 for an empty list.
    """
    words = len(argument)
    # Nothing to check; avoids a division by zero below.
    if words == 0:
        return 0.0

    abbr = abb1 + abb2 + abb3
    chkr = SpellChecker("en_GB", argument)
    errors = 0
    for word in argument:
        if chkr.check(word):
            continue
        # Accept alternative spellings and known abbreviations.
        if (chkr.check(word.upper())
                or chkr.check(word.capitalize())
                or chkr.check(word + ".")
                or word in abbr):
            continue
        errors += 1
    return float(errors) / words