class SpellChecker():
    """Suggest corrections for misspelled words in tokenized sentences.

    A channel model scores an (error, correction) pair through ``align`` and
    a language model provides ``unigram_score`` / ``bigram_score``.  The
    language model doubles as the vocabulary: it is iterated to enumerate
    known words and queried with ``in`` to test membership.
    """

    def __init__(self, channel_model=None, language_model=None, max_distance=1):
        """Store the two models and the maximum number of edit steps.

        ``max_distance`` defaults to 1 so that it can legally follow the
        defaulted model parameters; existing positional calls such as
        ``SpellChecker(cm, lm, 1)`` keep working unchanged.
        """
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        """Replace the channel model with an ``EditDistanceFinder`` trained on ``fp``."""
        self.channel_model = EditDistanceFinder()
        self.channel_model.train(fp)

    def load_language_model(self, fp):
        """Replace the language model with a ``LanguageModel`` loaded from ``fp``."""
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        """Return the mean of the (prev, focus) and (focus, next) bigram scores."""
        left = self.language_model.bigram_score(prev_word, focus_word)
        right = self.language_model.bigram_score(focus_word, next_word)
        return (left + right) / 2

    def unigram_score(self, word):
        """Return the language model's unigram score for ``word``."""
        return self.language_model.unigram_score(word)

    def cm_score(self, error_word, corrected_word):
        """Return the first element of the channel model's alignment result."""
        return self.channel_model.align(error_word, corrected_word)[0]

    @staticmethod
    def isSubstring(w1, w2):
        """Return True if the letters of ``w1`` occur in ``w2`` in order.

        Despite the name this is a subsequence test (gaps are allowed).
        Each matched letter of ``w2`` is consumed, so a repeated letter in
        ``w1`` needs a separate occurrence in ``w2``.
        """
        position = 0
        for letter in w1:
            # find() yields -1 when the letter is absent, which becomes 0 here.
            position = w2.find(letter, position) + 1
            if position == 0:
                return False
        return True

    def inserts(self, word):
        """Return known words obtainable by inserting one letter into ``word``."""
        return [
            w for w in self.language_model
            if len(w) == len(word) + 1 and self.isSubstring(word, w)
        ]

    def deletes(self, word):
        """Return known words obtainable by deleting one letter from ``word``."""
        return [
            w for w in self.language_model
            if len(w) == len(word) - 1 and self.isSubstring(w, word)
        ]

    def substitutions(self, word):
        """Return known words that differ from ``word`` in exactly one position."""
        return [
            w for w in self.language_model
            if len(w) == len(word)
            and sum(a != b for a, b in zip(w, word)) == 1
        ]

    def generate_candidates(self, word):
        """Return known words reachable from ``word`` in up to ``max_distance`` edits.

        Each step applies one insert, delete or substitution to every word
        found in the previous step, so intermediate words must themselves be
        known to the language model.  Results are de-duplicated and listed in
        discovery order (closest first).  With ``max_distance`` below 1 no
        step is taken and ``[word]`` is returned.
        """
        if self.max_distance < 1:
            return [word]

        seen = {word}
        candidates = []
        frontier = [word]
        for _ in range(self.max_distance):
            next_frontier = []
            for w in frontier:
                neighbours = self.inserts(w) + self.deletes(w) + self.substitutions(w)
                for candidate in neighbours:
                    if candidate not in seen:
                        seen.add(candidate)
                        candidates.append(candidate)
                        next_frontier.append(candidate)
            frontier = next_frontier
        return candidates

    def check_non_words(self, sentence, fallback=False):
        """Return one list of suggestions per word of ``sentence``.

        A word known to the language model yields ``[word]``.  An unknown
        word yields its candidates sorted in ascending order of
        ``unigram_score(candidate) + cm_score(word, candidate)``.  If there
        are no candidates the entry is ``[word]`` when ``fallback`` is true
        and ``[]`` otherwise.
        """
        output = []
        for word in sentence:
            if word in self.language_model:
                output.append([word])
                continue

            candidates = self.generate_candidates(word)
            if fallback and not candidates:
                output.append([word])
            else:
                # NOTE(review): ascending order is kept as written; whether a
                # lower combined score means a better candidate depends on the
                # models -- confirm.
                candidates.sort(
                    key=lambda w: self.unigram_score(w) + self.cm_score(word, w)
                )
                output.append(candidates)
        return output

    def check_sentence(self, sentence, fallback=False):
        """Return suggestions for every word; delegates to ``check_non_words``."""
        return self.check_non_words(sentence, fallback)

    # NOTE(review): at this point the original text holds two truncated
    # headers -- ``def check_text`` and a second
    # ``def check_sentence(self, sentence, fallback=False):`` -- whose bodies
    # are not present in this section, so they could not be reproduced here.
elif current_index == 0: # start of sentence bigram_score = self.bigram_score("<s>", words[i], sentence[current_index + 1]) else: bigram_score = self.bigram_score(sentence[current_index - 1], words[i], sentence[current_index + 1]) scores.append(self.cm_score(error_word, words[i]) + 0.5 * bigram_score + 0.5 * self.unigram_score(words[i])) i += 1 return_list.append(scores) current_index += 1 return list_of_words, return_list if __name__ == "__main__": lm = AddAlphaBigramModel(alpha=.1) lm.train() cm = EditDistanceFinder() cm.train("wikipedia_misspellings.txt") s = SpellChecker(cm, lm, 1) print (s.check_sentence(["how", "are", "yoo", "sir"])) print () print (s.check_sentence(["they", "did", "not", "yb", "any", "menas"])) print () print (s.autocorrect_sentence(["they", "did", "not", "yb", "any", "menas"])) print () print (s.autocorrect_sentence(["menas"])) print () print (s.__score_probabilities__(["they", "did", "not", "yb", "any", "menas"])) print () print (s.suggest_sentence(["they", "did", "not", "yb", "any", "menas"], 3)) print () print (s.generate_candidates("hi"))