def suggestions(word):
    """Return spelling suggestions for *word*.

    Words that are entirely upper-case or that contain a digit are skipped
    and produce an empty dict.

    :param word: the token to check.
    :return: ``{}`` for skipped words, otherwise
        ``{word: {"correction": <most probable candidate>,
        "suggestion": <set of candidates>}}``.
    """
    errorList = {}
    # Decide whether the word is skipped *before* building the checker:
    # constructing SpellChecker loads its word list, which is wasted work
    # for tokens that are never looked up.
    if word.isupper() or any(char.isdigit() for char in word):
        return errorList

    spell = SpellChecker()
    # Look the candidates up once and reuse them. Some pyspellchecker
    # releases return None when nothing is found; fall back to the word
    # itself so max() always has something to choose from.
    candidateWords = spell.candidates(word) or {word}
    errorList[word] = {
        "correction": max(candidateWords, key=spell.word_probability),
        "suggestion": candidateWords,
    }
    return errorList
def test_candidates(self):
    """Check the candidate sets for a typo, a known word and a symbol."""
    checker = SpellChecker()
    expected_for_ths = {
        'tes', 'tps', 'th', 'thi', 'tvs', 'tds', 'tbs', 'bhs', 'thf',
        'chs', 'tis', 'thes', 'tls', 'tho', 'thu', 'thr', 'dhs', "th'",
        'thus', 'ts', 'ehs', 'tas', 'ahs', 'thos', 'thy', 'tcs', 'nhs',
        'the', 'tss', 'hs', 'lhs', 'vhs', "t's", 'tha', 'whs', 'ghs',
        'rhs', 'this',
    }
    cases = (
        ('ths', expected_for_ths),
        ('the', {'the'}),
        ('-', {'-'}),
    )
    for query, expected in cases:
        self.assertEqual(checker.candidates(query), expected)
def test_candidates(self):
    """Check the candidate sets for a typo, a known word, a symbol and nonsense."""
    checker = SpellChecker()
    expected_for_ths = {
        'tes', 'thas', 'tis', 'thse', 'thes', 'thus', 'ohs', 'thu', 'thy',
        'thi', 'tas', 'tus', 'thos', 'ahs', 'tho', 'tha', 'thsi', 'tos',
        'the', 'this', 'iths',
    }
    cases = (
        ('ths', expected_for_ths),
        ('the', {'the'}),
        ('-', {'-'}),
        # A string with no plausible correction should come back unchanged.
        ('manasaeds', {'manasaeds'}),
    )
    for query, expected in cases:
        self.assertEqual(checker.candidates(query), expected)
def spellCorrect(word):
    """Print and return the candidate corrections for a misspelled word.

    :param word: the word to look up.
    :return: the candidates reported by the spell checker when *word* has a
        frequency of 0 in the checker's dictionary, otherwise ``None``.
    """
    # NOTE(review): pyspellchecker documents a default distance of 2, so both
    # branches appear to build the same checker - confirm whether short words
    # were meant to use distance=1.
    if len(word) < 8:
        spell = SpellChecker()
    else:
        spell = SpellChecker(distance=2)

    # A non-zero frequency means the word is in the dictionary.
    if spell[word] != 0:
        return None

    # Compute the candidates once; the lookup is the expensive part.
    options = spell.candidates(word)
    print(options)
    return options
def test_candidates(self):
    """Check the candidate sets for a typo, a known word, a symbol and nonsense."""
    checker = SpellChecker()
    expected_for_ths = {
        'tes', 'tps', 'th', 'thi', 'tvs', 'tds', 'tbs', 'bhs', 'thf',
        'chs', 'tis', 'thes', 'tls', 'tho', 'thu', 'thr', 'dhs', "th'",
        'thus', 'ts', 'ehs', 'tas', 'ahs', 'thos', 'thy', 'tcs', 'nhs',
        'the', 'tss', 'hs', 'lhs', 'vhs', "t's", 'tha', 'whs', 'ghs',
        'rhs', 'this',
    }
    cases = (
        ('ths', expected_for_ths),
        ('the', {'the'}),
        ('-', {'-'}),
        # A string with no plausible correction should come back unchanged.
        ('manasaeds', {'manasaeds'}),
    )
    for query, expected in cases:
        self.assertEqual(checker.candidates(query), expected)
class SpellChecking():
    """Correct a sentence word by word, ranking alternatives with a scorer."""

    def __init__(self, stc, lm_scorer):
        # stc is the sentence that should be checked by the spell checker.
        self.stc = stc
        self.lm_scorer = lm_scorer
        self.checker = SpellChecker()

    def puncRemove(self, sentence):
        """Return *sentence* with every ``string.punctuation`` character removed."""
        strip_table = str.maketrans('', '', string.punctuation)
        return sentence.translate(strip_table)

    def errorFind(self, sentence):
        """Locate the tokens the checker would change.

        The sentence is split on single spaces. Returns a pair of parallel
        lists: the positions of the suspicious tokens and, for each of them,
        how many candidates the checker offers.
        """
        positions = []  # indexes of the tokens considered wrong
        counts = []     # number of candidates for each of those tokens
        for position, token in enumerate(sentence.split(' ')):
            # A token is considered correct when its correction is itself.
            if self.checker.correction(token) == token:
                continue
            positions.append(position)
            counts.append(len(self.checker.candidates(token)))
        return (positions, counts)

    def suggest(self, sentence):
        """Replace each suspicious token by its best-scoring candidate.

        For every flagged token, one sentence variant per candidate is built
        and handed to ``lm_scorer.score``; the variant whose key has the
        highest score becomes the new working sentence. The scorer is
        expected to return a mapping from variant index to score.
        """
        positions, counts = self.errorFind(sentence)
        known_length = len(sentence.split(' '))
        for slot, count in enumerate(counts):
            variants = []
            current_length = len(sentence.split(' '))
            drift = current_length - known_length
            # A previous replacement may have changed the token count;
            # shift the recorded positions by the difference.
            if drift != 0:
                known_length = current_length
                for i in range(len(positions)):
                    positions[i] += drift
            for choice in range(count):
                tokens = sentence.split(' ')
                target = positions[slot]
                tokens[target] = list(
                    self.checker.candidates(tokens[target]))[choice]
                variants.append(' '.join(tokens))
            scores = self.lm_scorer.score(variants)
            best = max(scores, key=scores.get)
            sentence = variants[best]
        return sentence
def spell_check(self, query):
    """
    Replace misspelt words of the query in place, using 'pyspellchecker'.

    For each word reported as unknown, the first candidate that is present
    in ``docFreq`` is used; when none of the candidates is, the checker's
    most likely correction is used instead.

    :param query: list of words which may contain misspelt words.
    :return: the same list, with the misspelt words replaced.
    """
    checker = SpellChecker()
    unknown_words = checker.unknown(query)
    for position, term in enumerate(query):
        if term not in unknown_words:
            continue
        for option in checker.candidates(term):
            if option in docFreq:
                query[position] = option
                break
        else:
            # No candidate occurs in docFreq: take the most likely answer.
            query[position] = checker.correction(term)
    return query
def check_spelling(word_list):
    """Print and return the possibly misspelled words with their suggestions.

    If the ``spellchecker`` module is missing, the user is asked whether it
    may be installed with pip; answering "n" exits the process.

    :param word_list: iterable of words to check.
    :return: dict mapping each unknown word to the candidates suggested for it.
    """
    try:
        from spellchecker import SpellChecker
    except ModuleNotFoundError as missing:
        while True:
            choice = input(
                "Fixing '{}' for you by installing SpellChecker.\n"
                " do you agree? [Y/N]\n".format(missing)
            ).lower()
            if choice == "n":
                sys.exit(0)
            elif choice == "y":
                # Pass the command as a list: a single command string only
                # works without a shell on Windows. Running pip through the
                # current interpreter installs into the right environment.
                subprocess.call(
                    [sys.executable, "-m", "pip", "install", "pyspellchecker"])
                from spellchecker import SpellChecker
                break
            else:
                print("Not a valid choice.")

    spell = SpellChecker(distance=1)
    misspelled_dict = {
        word: spell.candidates(word) for word in spell.unknown(word_list)
    }

    print("\nWord :: Suggestions\n")
    for key, value in misspelled_dict.items():
        print(f"{key} :: {value}")
    print()
    return misspelled_dict
def spellcorrect(document):
    """Collect spelling violations for the tokens of *document*.

    Registers the ``spelling_correction`` and ``spelling_candidates``
    extensions on ``Token``, fills them in for every token the checker does
    not know (unless the token text starts with one of the ignored
    patterns), and returns one violation dict per such token.
    """
    Token.set_extension('spelling_correction', default=None, force=True)
    Token.set_extension('spelling_candidates', default=None, force=True)
    checker = SpellChecker()
    # Quotes, whitespace escapes and clitics that should never be flagged.
    skip_pattern = r"(“|”|’|\n|\t|\\n|\\t| |'s|’s|n't|n’t)"
    violations = []
    for token in document:
        text = token.text
        if re.match(skip_pattern, text):
            continue
        if not checker.unknown([text]):
            continue
        token._.spelling_correction = checker.correction(text)
        token._.spelling_candidates = checker.candidates(text)
        start = token.idx
        violations.append({
            'rule_id': 'spellCorrect',
            'word': text,
            'positions': [[start, start + len(text)]],
            'correction': token._.spelling_correction,
            'candidates': list(token._.spelling_candidates)
        })
    return violations
class SpellCheckerML:
    """Spell corrector combining dictionary candidates with autocomplete predictions."""

    def __init__(self):
        self.spell_checker = SpellChecker()
        self.autocomplete = autocomplete
        self.autocomplete.load()

    def train(self, text, model_name=''):
        """Train the autocomplete models on *text* and reload them.

        An empty *model_name* is passed on as ``False``.
        """
        if model_name == '':
            self.autocomplete.models.train_models(text, model_name=False)
        else:
            self.autocomplete.models.train_models(text, model_name=model_name)
        self.autocomplete.load()

    def correction(self, previous_word, word):
        """Return the best correction for *word* given the word before it.

        Known words are returned unchanged. Otherwise the spell-checker
        candidates are intersected with the autocomplete predictions for
        ``(previous_word, word[0])`` and the prediction with the highest
        second element wins; if there is no overlap, a random spell-checker
        candidate is returned.
        """
        if self.spell_checker.known([word]):
            return word

        spell_checker_candidates = self.spell_checker.candidates(word)
        autocomplete_predictions = self.autocomplete.predict(previous_word, word[0])
        autocomplete_candidates = [elem[0] for elem in autocomplete_predictions]

        best_choices = []
        for candidate in spell_checker_candidates:
            # list.index raises ValueError when the candidate was not
            # predicted; catch only that instead of swallowing everything.
            try:
                candidate_index = autocomplete_candidates.index(candidate)
            except ValueError:
                continue
            best_choices.append(autocomplete_predictions[candidate_index])

        if best_choices:
            # Keep sorted()[-1] (rather than max) so ties resolve as before.
            best_choices = sorted(best_choices, key=lambda t: t[1])
            return list(best_choices[-1])[0]
        return random.choice(list(spell_checker_candidates))
def pyspell_dict(input_dict, name):
    """
    Adds suggested corrections to provided dictionary

    The words from the pickled target-word file that the checker does not
    know are loaded into its word frequency list before any suggestion is
    computed.

    :param input_dict: input dictionary template, keyed by misspelling; each
        value is a dict that receives 'suggested' and 'candidates' entries.
        The outer dict is copied shallowly, so the inner dicts of
        *input_dict* are updated as well.
    :param name: name of dataset, used to build the returned file name
    :return: Updated dictionary with suggestions and candidates, and the file
        name derived from *name* (this function does not write the file)
    """
    file_name = name + "_pyspell_dict.txt"
    spell = SpellChecker()
    # NOTE: pickle.load can execute arbitrary code; this file must be trusted.
    # The context manager makes sure the handle is closed again.
    with open('../spelling_mistakes/target_words_all.txt', 'rb') as handle:
        target_words = pickle.load(handle)
    misspelled = spell.unknown(target_words)
    spell.word_frequency.load_words(misspelled)

    working_dict = dict(input_dict)
    for counter, (misspelling, details) in enumerate(working_dict.items()):
        if counter % 100 == 0:
            print(counter)  # progress indicator
        details['suggested'] = spell.correction(misspelling)
        details['candidates'] = list(spell.candidates(misspelling))
    return working_dict, file_name
def correct_instructions(instruction):
    """Spell-correct a directive and return its tokens in upper case.

    Tokens whose candidates include one of the dosage keywords are replaced
    by that keyword; afterwards every alphabetic token is replaced by the
    checker's correction, e.g. "TAGE 1 TAGLET" becomes "TAKE 1 TABLET".
    """
    checker = SpellChecker()
    checker.distance = 3
    keywords = [
        'every', 'once', 'twice', 'thrice', 'daily', 'hours', 'hour', 'day',
        'days', 'weeks', 'morning', 'afternoon', 'night', 'one', 'tablet'
    ]

    # Tokenize the directive and gather the candidates of every token.
    tokens = instruction.split()
    candidate_sets = [checker.candidates(piece) for piece in tokens]

    # Snap tokens that are close to a keyword onto that keyword.
    for position, options in enumerate(candidate_sets):
        for option in options:
            if option in keywords:
                tokens[position] = option

    # General spell correction of the purely alphabetic tokens.
    for position, piece in enumerate(tokens):
        if piece.isalpha():
            tokens[position] = checker.correction(piece)

    return [piece.upper() for piece in tokens]
class SC:
    """Thin wrapper around SpellChecker that reports whether a word is known."""

    def __init__(self):
        self.spell = SpellChecker()

    def check(self, word):
        """Check a single word.

        :param word: the word to check.
        :return: ``(True, {})`` when the word is known, otherwise
            ``(False, res)`` where ``res`` holds an "error" message and the
            list of candidates under "candi".
        """
        res = {}
        # unknown() yields at most one entry for a single input word; use a
        # separate name so the parameter is not shadowed.
        for unknown_word in self.spell.unknown([word]):
            # Look the candidates up once and reuse them for both outputs.
            # Some pyspellchecker releases return None when nothing is found.
            candidates = list(self.spell.candidates(unknown_word) or ())
            print("Oops...You may spell the wrong word. Here're possible candidates:")
            print("\t" + ", ".join(candidates))
            res["error"] = "Oops...You may spell the wrong word."
            res["candi"] = candidates
            return False, res
        return True, res
def locationSpellCheck(location):
    """Spell-correct every word of a location.

    :param location: iterable of words making up the location.
    :return: the corrected words, each followed by a space, or ``-1`` when a
        word is unknown and the checker has no correction for it.
    """
    if not location:
        return ""

    # Build the checker once instead of once per word.
    spell = SpellChecker()
    parts = []
    for sep in location:
        misspelled = spell.unknown([sep])
        if not misspelled:
            # Word is correctly spelled: keep it as is.
            parts.append(sep)
            continue
        for word in misspelled:
            # Most likely answer and the full list of likely options.
            candidateCorrection = spell.correction(word)
            candidates = spell.candidates(word)
            # No usable correction means an invalid word. The original
            # compared the whole `location` to the correction, which can
            # never match for a list of words, so -1 was unreachable. Some
            # pyspellchecker releases signal "nothing found" with None.
            no_correction = (
                candidateCorrection is None
                or not candidates
                or (len(candidates) == 1 and word == candidateCorrection)
            )
            if no_correction:
                return -1
            parts.append(candidateCorrection)
    return "".join(part + " " for part in parts)
class SpellcheckFilter(AbstractFilter):
    """Filter that proposes spellchecked candidates through PySpellChecker."""

    NAME = "spellcheck"
    spellchecker = None

    def __init__(self):
        self.spellchecker = SpellChecker()

    def info(self):
        """Return the human-readable description of this filter."""
        return """Spellcheck Filter Takes a candidate line and feeds it into PySpellChecker (https://pypi.org/project/pyspellchecker/) in an attempt to generate potential word candidates based upon Levenshtein distance. This is similar to wordlist expansion done by iphelix's PACK. """

    def filter_line(self, line):
        """Return one candidate set for each unknown entry of *line*."""
        unknown_words = self.spellchecker.unknown(line)
        return [self.spellchecker.candidates(entry) for entry in unknown_words]
def test_edit_distance_one(self):
    """With distance=1 and the small dictionary, 'hike' only yields 'bike'."""
    base_dir = os.path.dirname(__file__)
    dictionary_path = '{}/resources/small_dictionary.json'.format(base_dir)
    checker = SpellChecker(
        language=None,
        local_dictionary=dictionary_path,
        distance=1,
    )
    found = checker.candidates('hike')
    self.assertEqual(found, {'bike'})
def non_pronounceable_id(self, identifier):
    """Print the parts of *identifier* the spell checker does not know.

    Reports how many parts of ``identifier.parts`` are unknown and, for each
    of them, the candidates suggested by the checker.
    """
    checker = SpellChecker()
    unknown_parts = checker.unknown(identifier.parts)
    if unknown_parts is None:
        return
    print('{0} misspelled part found in the name'.format(len(unknown_parts)))
    for part in unknown_parts:
        # NOTE(review): `id` resolves to the builtin function here unless the
        # module defines its own `id` - presumably the identifier itself was
        # meant to be printed; confirm.
        print(str(id), 'misspelled part in name:', part)
        print('\tcandidates', checker.candidates(part))
def read_sentence():
    """Read one command line from stdin and expand it with spelling candidates.

    Returns a tuple ``(command, raw_command, spell_command)``: the cleaned
    lower-case words, the words as typed (split on single spaces), and the
    flat list of candidates for every cleaned word.
    """
    print("@ ", end="")
    raw_command = input().strip().split(" ")
    # Drop every run of non-word characters and normalise the case.
    command = [re.sub(r'\W+', '', piece).lower() for piece in raw_command]
    checker = SpellChecker()
    spell_command = [
        option
        for piece in command
        for option in checker.candidates(piece)
    ]
    return (command, raw_command, spell_command)
class SpellingSuggestor(object):
    """Suggest spellings for a word, including two-word splits of it."""

    def __init__(self, word):
        self.word = word
        self.spell = SpellChecker()

    def pre_process(self):
        """Return the word with runs of punctuation, underscores or dashes replaced by a space."""
        return re.sub(r'([^\s\w]|_|-)+', ' ', self.word)

    def reduce_lengthening(self):
        """Return the word with any character repeated three or more times cut down to two."""
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", self.word)

    def spell_checker_result(self):
        """Clean the word and return spelling suggestions for it.

        ``self.word`` is replaced by its cleaned, lower-cased form. If the
        checker knows the cleaned word, a one-element list with it is
        returned. Otherwise the word is split at every position and the known
        candidates of both halves are collected into a set: every left-hand
        candidate, plus every right-hand candidate longer than two characters
        and its combination "left right".
        """
        self.word = self.pre_process()
        self.word = self.reduce_lengthening().lower()
        print("word after cleaning ", self.word)
        if not self.spell.unknown([self.word]):
            return [self.word]

        result = set()
        for i in range(1, len(self.word)):
            head = self.word[:i].strip()
            # The right half is the rest of the word; the original indexed a
            # single character (self.word[i]) here.
            tail = self.word[i:].strip()
            if not head or not tail:
                continue
            # Some pyspellchecker releases return None when nothing is found.
            r1 = self.spell.known(self.spell.candidates(head) or ())
            r2 = self.spell.known(self.spell.candidates(tail) or ())
            if not r1 or not r2:
                continue
            for v1 in r1:
                result.add(v1)
                for v2 in r2:
                    if len(v2) > 2:
                        result.add(v2)
                        result.add(v1 + " " + v2)
        return result
def run(targTrainFile, targetOutputFile=""):
    """Write spelling corrections for the words of a comma-separated file.

    The first field of every line of *targTrainFile* is treated as a word.
    For each word the checker does not know, one line is produced:

    * ``del <word>`` when the checker has no correction for it,
    * ``<key>:<correction>`` when the correction is itself one of the words,
    * otherwise ``<key>:<candidate>`` for the first candidate that is one of
      the words (nothing if there is none).

    :param targTrainFile: path of the input file.
    :param targetOutputFile: path of the output file; defaults to
        ``../data/spellCorrections.txt`` relative to this module.
    """
    if targetOutputFile == "":
        targetOutputFile = os.path.dirname(os.path.abspath(__file__)) + \
            "/../data/spellCorrections.txt"
    spell = SpellChecker()

    with open(targTrainFile, mode='r') as source:
        allText = source.read()

    # Map each word (first CSV field) to its full line. Lines starting with a
    # comma have an empty first field and are skipped by the same check.
    # (The leftover debug `print(line); exit(0)` that terminated the process
    # on the first line has been removed.)
    allData = {}
    for line in allText.split("\n"):
        word = line.split(",")[0]
        if not word:
            continue
        allData[word] = line

    numeric = re.compile(r'^[\d\.]+$')
    outputLines = []
    wordsSeen = 0
    for key in allData:
        wordsSeen += 1
        if numeric.search(key):
            continue
        for word in spell.unknown([key]):
            trans = spell.correction(word)
            if trans is None or word == trans:
                # Drop from allData, it's incorrect without replacement.
                # (Some pyspellchecker releases return None in that case.)
                outputLines.append("del " + word + "\n")
            elif trans in allData:
                outputLines.append(key + ":" + trans + "\n")
            else:
                for option in spell.candidates(word) or ():
                    if option in allData:
                        outputLines.append(key + ":" + option + "\n")
                        break
        if wordsSeen % 10000 == 0:
            print(wordsSeen)  # progress indicator

    with open(targetOutputFile, mode="w") as output:
        output.write("".join(outputLines))
def demo_spellchecker(): """演示如何使用spellchecker库 官方介绍文档 pyspellchecker · PyPI: https://pypi.org/project/pyspellchecker/ 190909周一15:58,from 陈坤泽 """ # 1 创建对象 # 可以设置语言、大小写敏感、拼写检查的最大距离 # 默认'en'英语,大小写不敏感 spell = SpellChecker() # 如果是英语,SpellChecker会自动加载语言包site-packages\spellchecker\resources\en.json.gz,大概12万个词汇,包括词频权重 d = spell.word_frequency # 这里的d是WordFrequency对象,其底层用了Counter类进行数据存储 dprint(d.unique_words, d.total_words) # 词汇数,权重总和 # 2 修改词频表 spell.word_frequency dprint(d['ckz']) # 不存在的词汇直接输出0 d.add('ckz') # 可以添加ckz词汇的一次词频 d.load_words(['ckz', 'ckz', 'lyb']) # 可以批量添加词汇 dprint(d['ckz'], d['lyb']) # d['ckz']=3 d['lyb']=1 d.load_words(['ckz'] * 100 + ['lyb'] * 500) # 可以用这种技巧进行大权重的添加 dprint(d['ckz'], d['lyb']) # d['ckz']=103 d['lyb']=501 # 同理,去除也有remove和remove_words两种方法 d.remove('ckz') # d.remove_words(['ckz', 'lyb']) # 不过注意不能删除已经不存在的key('ckz'),否则会报KeyError dprint(d['ckz'], d['lyb']) # d['ckz']=0 d['lyb']=501 # remove是完全去除单词,如果只是要减权重可以访问底层的_dictionary对象操作 d._dictionary['lyb'] -= 100 # 当然不太建议直接访问下划线开头的成员变量~~ dprint(d['lyb']) # ['lyb']=401 # 还可以按阈值删除词频不超过设置阈值的词汇 d.remove_by_threshold(5) # 3 spell的基本功能 # (1)用unknown可以找到可能拼写错误的单词,再用correction可以获得最佳修改意见 misspelled = spell.unknown(['something', 'is', 'hapenning', 'here']) dprint(misspelled) # misspelled<set>={'hapenning'} for word in misspelled: # Get the one `most likely` answer dprint(spell.correction(word)) # <str>='happening' # Get a list of `likely` options dprint(spell.candidates( word)) # <set>={'henning', 'happening', 'penning'} # 注意默认的spell不区分大小写,如果词库存储了100次'ckz' # 此时判断任意大小写形式组合的'CKZ'都是返回原值 # 例如 spell.correction('ckZ') => 'ckZ' # (2)可以通过修改spell.word_frequency影响correction的计算结果 dprint(d['henning'], d['happening'], d['penning']) # d['henning']<int>=53 d['happening']<int>=4538 d['penning']<int>=23 d._dictionary['henning'] += 10000 dprint(spell.correction('hapenning')) # <str>='henning' # (3)词汇在整个字典里占的权重 dprint(spell.word_probability('henning')) # <float>=0.0001040741914298211
def word_check2(s):
    """Print a report for every word of *s* the checker would correct.

    :param s: iterable of words.
    """
    spell_corrector = SpellChecker()
    for word in s:
        # Get the one `most likely` answer - once, it is an expensive lookup.
        best_guess = spell_corrector.correction(word)
        if word == best_guess:
            continue
        print("Word incorrectly spelt:" + word)
        # str() keeps the report working when no correction is found; some
        # pyspellchecker releases return None in that case.
        print("Most likely: " + str(best_guess))
        # Get a list of `likely` options
        print("Other possibilities: " + str(spell_corrector.candidates(word)))
def spell_check(extracted):
    """Map every word of *extracted* that the checker would change to its candidates.

    The text is split on whitespace; a word already recorded is not looked
    up again.
    """
    checker = SpellChecker()
    alternatives = {}
    for token in extracted.split():
        if token in alternatives:
            continue
        if checker.correction(token) == token:
            continue
        alternatives[token] = checker.candidates(token)
    return alternatives
def get_correct(word):
    """Return the candidates for *word*, or None when there is nothing to fix.

    None is returned for falsy or boolean input and for words whose
    correction is the word itself.
    """
    if isinstance(word, bool) or not word:
        return None
    checker = SpellChecker()
    options = checker.candidates(word)
    unchanged = checker.correction(word) == word
    return None if unchanged else options
def index():
    """Handle the upload form and render the spell-check results.

    On POST the uploaded file is validated (present, named, allowed
    extension), saved into ``UPLOAD_FOLDER`` and read back. Its text is
    corrected with TextBlob, and every word the spell checker does not know
    is listed with its most likely correction and its candidates. On any
    other method the template is rendered with empty values.
    """
    correct = ""
    a = ""
    list1 = []
    list2 = []
    misspelled = []

    if request.method == "POST":
        print("FORM DATA RECEIVED")

        if "file" not in request.files:
            flash('No file part')
            return redirect(request.url)

        file = request.files["file"]
        if file.filename == "":
            flash('No selected file')
            return redirect(request.url)

        file_ext = os.path.splitext(file.filename)[1]
        if file_ext not in app.config['UPLOAD_EXTENSIONS']:
            print("Please upload a .txt file type only")
            return abort(400)

        # The filename comes from the client: keep only its last path
        # component so it cannot point outside the upload folder.
        # NOTE(review): werkzeug.utils.secure_filename is the usual, stricter
        # choice if it can be imported in this module.
        safe_name = os.path.basename(file.filename.replace("\\", "/"))
        if not safe_name:
            flash('No selected file')
            return redirect(request.url)

        # Read the file back from where it was saved, not from a hard-coded
        # Desktop path that only matches one particular UPLOAD_FOLDER.
        saved_path = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
        file.save(saved_path)
        print(saved_path)
        with open(saved_path, "r") as f:
            filecontent = f.read()

        a = str(filecontent)
        correct = str(TextBlob(a).correct())

        # remove all punctuations before finding possible misspelled words
        wordlist = re.sub(r'[^\w\s]', '', filecontent).split()

        spell = SpellChecker()
        # find those words that may be misspelled
        misspelled = list(spell.unknown(wordlist))
        for word in misspelled:
            # Get the one `most likely` answer
            list1.append(spell.correction(word))
            # Get a list of `likely` options
            list2.append(spell.candidates(word))

    return render_template('index.html', a=a, correct=correct,
                           misspelled=misspelled, list1=list1, list2=list2,
                           len=len(misspelled), len1=len(list1))
def checkspelling(wordList):
    """Print every unknown word of *wordList* with its suggested candidates.

    The checker is created with an edit distance of 1.
    """
    checker = SpellChecker(distance=1)
    suggestions_by_word = {
        unknown: checker.candidates(unknown)
        for unknown in checker.unknown(wordList)
    }
    print("Word : Suggestions")
    for unknown, options in suggestions_by_word.items():
        print(f"{unknown} :: {options}")
def test_capitalization_when_case_sensitive_true(self):
    """Capitalization must matter when the checker is case sensitive."""
    checker = SpellChecker(language=None, case_sensitive=True)
    checker.word_frequency.add('Bob')

    # Only the exact spelling that was added is contained in the checker.
    membership = (('Bob', True), ('BOb', False), ('BOB', False), ('bob', False))
    for variant, present in membership:
        self.assertEqual(variant in checker, present)

    self.assertEqual(checker.unknown(['Bb', 'bb', 'BB']), {'Bb', 'bb', 'BB'})
    self.assertEqual(checker.known(['BOB', 'bOb']), set())

    for typo in ('Bb', 'bob'):
        self.assertEqual(checker.candidates(typo), {'Bob'})
    for typo in ('Bb', 'bob'):
        self.assertEqual(checker.correction(typo), 'Bob')

    self.assertEqual(checker.unknown(['bob']), {'bob'})
def check(word):
    """Return the first candidate of *word* found in ``spellings_to_check``.

    The word is lower-cased first; when no candidate matches, the
    lower-cased word itself is returned.
    """
    checker = SpellChecker()
    lowered = word.lower()
    matches = (
        option
        for option in checker.candidates(lowered)
        if option in spellings_to_check
    )
    return next(matches, lowered)
def spell_correct(query_str):
    """Build a corrected version of the query and print a suggestion.

    Every word obtained from ``get_list(query_str)`` is replaced by its most
    likely correction; the candidates of each word are printed along the
    way. The returned string has one leading space per word.
    """
    checker = SpellChecker()
    pieces = []
    for token in get_list(query_str):
        # The one `most likely` answer.
        pieces.append(str(" " + checker.correction(token)))
        # The list of `likely` options.
        print(token + ' ->', checker.candidates(token))
    new_str = "".join(pieces)
    print("Did you mean " + "'" + new_str + "'" + "?")
    return new_str
class Spell_Searcher:
    """Replace misspelled query terms by their most frequent indexed candidate."""

    def __init__(self, indexer):
        self._indexer = indexer
        self.spell = None

    def query_expansion(self, query):
        """
        Replace misspelled query terms by their closest indexed spelling.

        A term is examined when neither its lower-case nor its upper-case
        form is in the inverted index and the spell checker does not know
        it. Its candidates are gathered and the one with the highest value
        in the inverted index replaces the term as key of the query
        dictionary (keeping the term's value).

        :param query: object whose ``query_dict`` is updated in place
        :return: the query dictionary with the replaced terms
        """
        query_dict = query.query_dict

        # Build the checker once and reuse it on later calls. If it cannot be
        # created (e.g. the local dictionary is missing), skip the expansion
        # instead of failing later on a None checker.
        if self.spell is None:
            try:
                self.spell = SpellChecker(local_dictionary='spell_dict.json',
                                          distance=1)
            except Exception:
                return query_dict

        inverted_idx = self._indexer.inverted_idx
        # Iterate over a snapshot of the keys: the dict is modified below.
        for term in list(query_dict):
            if term.lower() in inverted_idx or term.upper() in inverted_idx:
                continue
            if not self.spell.unknown([term]):
                continue

            candidates = list(self.spell.edit_distance_1(term))
            # Some pyspellchecker releases return None when nothing is found.
            candidates.extend(self.spell.candidates(term) or ())

            max_freq_in_corpus = 0
            max_freq_name = ''
            for candidate in candidates:
                if candidate in inverted_idx:
                    curr_freq = inverted_idx[candidate]
                elif candidate.upper() in inverted_idx:
                    curr_freq = inverted_idx[candidate.upper()]
                else:
                    continue
                if curr_freq > max_freq_in_corpus:
                    max_freq_in_corpus = curr_freq
                    max_freq_name = candidate

            if max_freq_name != '':
                print(max_freq_name)
                query_dict[max_freq_name] = query_dict.pop(term)
        return query_dict
#!/usr/bin/env python
'''
Small pyspellchecker demo.

pip install pyspellchecker
'''
from spellchecker import SpellChecker

spell = SpellChecker()

# Words of the sample sentence that the checker does not know.
misspelled = spell.unknown(['let', 'us', 'wlak', 'on', 'the', 'groun'])

for unknown_word in misspelled:
    best_guess = spell.correction(unknown_word)  # the one `most likely` answer
    print(best_guess)
    options = spell.candidates(unknown_word)     # all `likely` options
    print(options)