def words_train(in_file, word_file, graph_file):
    """Learn words from a text file and build a spelling word graph.

    When ``in_file`` is not None, words are learned from it and saved to
    ``word_file`` first.  When ``in_file`` is None, an already existing
    ``word_file`` is used as-is and only ``graph_file`` is produced.

    in_file    -- path of the UTF-8 text to learn words from, or None.
    word_file  -- path of the word list; each usable line holds a word
                  followed by a space and at least one more field.
    graph_file -- handed unchanged to
                  ``whoosh_spelling.wordlist_to_graph_file``.

    Progress messages are written to stderr.
    """
    if in_file is not None:
        # Auto create words from in_file.
        if os.path.getsize(in_file) > 3*1024*1024:
            # For inputs over 3 MiB: a little quicker but less accurate
            # than WordDict1.
            word_dict = WordDict2()
            print >> sys.stderr, 'please wait, getting words from file', in_file
        else:
            word_dict = WordDict1()

        with codecs.open(in_file, 'r', 'utf-8') as corpus:
            for line in corpus:
                for sentence in re_line.split(line):
                    word_dict.learn(sentence)
        word_dict.learn_flush()

        print >> sys.stderr, 'get all words, save word to file', word_file
        word_dict.save_to_file(word_file)
        print >> sys.stderr, 'save all words completely, create word graphp', graph_file

    words = []
    with codecs.open(word_file, 'r', 'utf-8') as listing:
        for line in listing:
            tokens = line.split(" ")
            if len(tokens) < 2:
                continue
            word = tokens[0].strip()
            # A line that starts with a space yields an empty first token;
            # skip it, since an empty string is not a usable graph key
            # (Whoosh's graph writer is expected to reject a null key).
            if word:
                words.append(word)

    # The word graph has to be built from words in sorted order.
    words.sort()
    whoosh_spelling.wordlist_to_graph_file(words, graph_file)
    print >> sys.stderr, 'words_train ok'
def test_find_self():
    """A word stored in the graph must not be its own top suggestion."""
    storage = RamStorage()
    graph_file = storage.create_file("test")
    spelling.wordlist_to_graph_file(
        sorted(u("book bake bike bone").split()), graph_file)

    reader = fst.GraphReader(storage.open_file("test"))
    corrector = spelling.GraphCorrector(reader)

    # Query with native string literals, in the same order as the word list
    # was originally written out.
    for word in ("book", "bake", "bike", "bone"):
        assert corrector.suggest(word)[0] != word
def test_find_self():
    """Check that suggesting for a known word does not rank that word first."""
    vocabulary = u("book bake bike bone").split()
    vocabulary.sort()

    ram = RamStorage()
    out_file = ram.create_file("test")
    spelling.wordlist_to_graph_file(vocabulary, out_file)

    graph_corrector = spelling.GraphCorrector(
        fst.GraphReader(ram.open_file("test")))

    for query in ("book", "bake", "bike", "bone"):
        best = graph_corrector.suggest(query)[0]
        assert best != query
def words_to_corrector(words):
    """Return a GraphCorrector backed by an in-memory graph of ``words``.

    The graph is written to a file named "test" in a fresh RamStorage and
    then read back from that same storage.
    """
    storage = RamStorage()
    spelling.wordlist_to_graph_file(words, storage.create_file("test"))
    reader = fst.GraphReader(storage.open_file("test"))
    return spelling.GraphCorrector(reader)
from whoosh import spelling
from whoosh.filedb.filestore import FileStorage

# GraphReader lives in Whoosh's fst module, which the script used without
# importing it.
# NOTE(review): the module location differs between Whoosh releases --
# confirm the paths below against the installed version.
try:
    from whoosh.automata import fst
except ImportError:
    from whoosh import fst

# The word list is read line by line; the context manager closes it once the
# graph has been written.  The words are expected to already be in sorted
# order.
with open("/Users/amckenzie/Documents/data/scowl-7.1/final/english-words.60") as wordfile:
    # Use a Storage object to get a file to write the graph into
    st = FileStorage("/Users/amckenzie/Tools/pythonScripts")
    f = st.create_file("wordgraph")

    # Write a graph of the words into the file
    spelling.wordlist_to_graph_file(wordfile, f)

# Create a graph reader from the file and wrap it with a corrector
f = st.open_file("wordgraph")
gr = fst.GraphReader(f)
cor = spelling.GraphCorrector(gr)

# See docs for whoosh.spelling.Corrector.suggest()
cor.suggest("aple")
def words_to_corrector(words):
    """Build a spelling corrector from ``words`` via a RAM-only graph file."""
    ram = RamStorage()

    # Write the word graph into the storage under the name "test" ...
    graph_out = ram.create_file("test")
    spelling.wordlist_to_graph_file(words, graph_out)

    # ... then reopen it for reading and wrap it with a corrector.
    graph_in = ram.open_file("test")
    graph_reader = fst.GraphReader(graph_in)
    corrector = spelling.GraphCorrector(graph_reader)
    return corrector
def _create_word_graph_file(name, file_storage, word_set):
    """Write a word graph named ``name`` and return it opened from RAM.

    The words of ``word_set`` are sorted and written as a graph to a file
    created in ``file_storage``.  The returned file is opened by the same
    name from the storage that ``copy_to_ram(file_storage)`` gives back.
    """
    graph_out = file_storage.create_file(name)
    ordered_words = sorted(word_set)
    spelling.wordlist_to_graph_file(ordered_words, graph_out)

    ram_storage = copy_to_ram(file_storage)
    return ram_storage.open_file(name)