def edits1(word): splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [a + b[1:] for a, b in splits if b] transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1] replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b] inserts = [a + c + b for a, b in splits for c in alphabet] return set(deletes + transposes + replaces + inserts) def datafile(name, sep='\t'): """Read key,value pairs from file.""" for line in file(name): yield line.split(sep) edit_frequencies = WordFrequency.from_freq_file('Norvig/edits/count_1edit.txt') p_spell_error = 1./20. def p_edit(edit): """The probability of an edit; can be '' or 'a|b' """ if edit == '': return 1. - p_spell_error return p_spell_error * edit_frequencies.get_probability(edit) PREFIXES = set(w[:i] for w in Pw for i in range(len(w) + 1)) def edits(word, dictionary, d=2):
from Context import Context
from EditDistance import edits
from WordFrequency import WordFrequency
import re
import Readers

# Word-frequency tables, loaded when this module is imported.
count_1w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_1w.txt")
count_2w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_2w.txt")


def corrections(text):
    """Spell-correct all words in text.

    Every maximal run of ASCII letters in ``text`` is replaced with whatever
    ``correct`` returns for that run.
    """
    # NOTE(review): the matched string is handed to correct(), which calls
    # .word() on its argument -- confirm which argument type is intended.
    def _replacement(match):
        return correct(match.group(0))

    return re.sub('[a-zA-Z]+', _replacement, text)


def correct(context):
    """Collect the edit candidates for the word held by ``context``.

    Intended to return the most likely spelling correction, but the selection
    step is still commented out, so the function currently returns ``None``.
    """
    # NOTE(review): edits() is called with a single argument here -- confirm
    # this matches the signature exported by EditDistance.
    candidates = edits(context.word()).items()
    # Selection step, not yet enabled:
    # c, edit = max(candidates, key=lambda (c,e): Pedit(e) * Pw(c))
    # return c


# Runs at import time: feed every context built from the words of doc.txt
# through correct().
for context in Context.gen_context_sequence_from_word_sequence(
        Readers.gen_words_from_file("doc.txt")):
    correct(context)