def main():
    """Build all model artifacts end to end.

    Steps: download the corpus via ``download.sh``, install the NEologd
    dictionary, preprocess ``articles.txt`` into DATA_PATH, then pickle the
    tone list, 2-gram counter, word->POS map, prefix searcher, and trained
    learner to their respective paths.

    Returns False on any setup failure; implicitly returns None on success.
    """
    # NOTE(review): shell=True runs the command through a shell; tolerable
    # here because the command is a fixed local script, but keep it constant.
    ret = subprocess.call("./download.sh", shell=True)
    if ret != 0:
        return False
    ret = _install_neologd()
    if not ret:
        return False
    # Normalize every raw article line and write the processed corpus.
    with open(DATA_PATH, 'w') as w:
        with open('articles.txt', 'r') as f:
            for line in f:
                w.write(_process_syntax(line) + '\n')
    tone_list, lcounter_2gram, word2pos = _create_tone_list()
    with open(TONE_PATH, 'wb') as w:
        pickle.dump(tone_list, w, pickle.HIGHEST_PROTOCOL)
    with open(COUNTER_2GRAM_PATH, 'wb') as w:
        pickle.dump(lcounter_2gram, w, pickle.HIGHEST_PROTOCOL)
    with open(WORD2POS_PATH, 'wb') as w:
        pickle.dump(word2pos, w, pickle.HIGHEST_PROTOCOL)
    # Double-array trie over the tone keys enables fast prefix lookup later.
    prefix_searcher = trie.DoubleArray(tone_list.keys())
    with open(PREFIX_SEARCHER_PATH, 'wb') as w:
        pickle.dump(prefix_searcher, w, pickle.HIGHEST_PROTOCOL)
    learner = _train_graph(prefix_searcher, tone_list, lcounter_2gram, word2pos)
    with open(LEARNER_PATH, 'wb') as w:
        pickle.dump(learner, w, pickle.HIGHEST_PROTOCOL)
def test_prefix_search_multichars(self):
    """Every stored key that is a prefix of the query is returned, for a
    vocabulary whose tokens are multi-character kana strings."""
    vocabulary = [('しゃ', ), ('しゃ', 'か'), ('しゃ', 'か', 'い')]
    searcher = trie.DoubleArray(vocabulary)
    # Each longer query picks up exactly one more matching prefix.
    eq_(searcher.prefix_search(['しゃ']), [('しゃ', )])
    eq_(searcher.prefix_search(['しゃ', 'か']), [('しゃ', ), ('しゃ', 'か')])
    eq_(searcher.prefix_search(['しゃ', 'か', 'い']),
        [('しゃ', ), ('しゃ', 'か'), ('しゃ', 'か', 'い')])
def test_prefix_search_case_non_vocabulary(self):
    """Queries that diverge from the vocabulary return only the prefixes
    that were actually stored (possibly none at all)."""
    # NOTE(review): ('a') is just the string 'a', not a 1-tuple — this
    # vocabulary is strings, unlike the multichar test's genuine tuples.
    searcher = trie.DoubleArray([('a'), ('a', 'b'), ('a', 'b', 'c')])
    eq_(searcher.prefix_search(['b']), [])
    eq_(searcher.prefix_search(['a', 'd']), [('a')])
    eq_(searcher.prefix_search(['a', 'b', 'a']), [('a'), ('a', 'b')])
def test_prefix_search(self):
    """Happy path: every stored prefix of the query comes back, shortest
    first, growing by one as the query lengthens."""
    # NOTE(review): ('a') here is the string 'a', not a 1-tuple.
    searcher = trie.DoubleArray([('a'), ('a', 'b'), ('a', 'b', 'c')])
    eq_(searcher.prefix_search(['a']), [('a')])
    eq_(searcher.prefix_search(['a', 'b']), [('a'), ('a', 'b')])
    eq_(searcher.prefix_search(['a', 'b', 'c']),
        [('a'), ('a', 'b'), ('a', 'b', 'c')])
def test_search_max_len(self):
    """search honours max_len: only completions whose length does not
    exceed the cap are returned."""
    searcher = trie.DoubleArray([('a', 'd'), ('a', 'b', 'c')])
    # Cap shorter than every stored key -> no results at all.
    eq_(searcher.search(['a'], max_len=1), [])
    # Raising the cap admits keys in order of their length.
    eq_(searcher.search(['a'], max_len=2), [('a', 'd')])
    eq_(searcher.search(['a'], max_len=3), [('a', 'd'), ('a', 'b', 'c')])
def test_search_case_two_words(self):
    """search returns every stored key extending the query prefix; a full
    exact match returns just that key."""
    searcher = trie.DoubleArray([('a', 'b', 'c'), ('a', 'b', 'd')])
    cases = [
        (['a'], [('a', 'b', 'c'), ('a', 'b', 'd')]),
        (['a', 'b'], [('a', 'b', 'c'), ('a', 'b', 'd')]),
        (['a', 'b', 'c'], [('a', 'b', 'c')]),
    ]
    for query, expected in cases:
        eq_(searcher.search(query), expected)
def test_search_case_non_result(self):
    """Queries that leave the stored key path produce no completions."""
    searcher = trie.DoubleArray([('a', 'b', 'c')])
    # Diverging at any depth — or starting off-vocabulary — yields [].
    for query in (['a', 'b', 'a'], ['a', 'c'], ['b']):
        eq_(searcher.search(query), [])
('a', 'o'): ['あお', 'かお', 'さと'], ('a', 'o', 'i'): ['あおい', 'さとみ'], ('o', ): ['と'], ('o', 'i'): ['とい', 'こい', 'とし'], ('o', 'i', 'o'): ['たいよ', 'はいりょ'], ('i', ): ['き', 'し'], ('i', 'o'): ['みこ', 'しお'], ('o', 'a'): ['もか', 'もさ'], } word2pos = {} for t in tone_list: for w in tone_list[t]: word2pos[w] = w prefix_searcher = trie.DoubleArray(tone_list.keys()) def test_construct_graph(): g = graph.Graph.construct_graph(prefix_searcher, tone_list, ('a', 'o', 'i', 'o', 'a')) eq_([g.BOS], g.nodes[0]) eq_(set([graph.Node(0, 'か'), graph.Node(0, 'あ'), graph.Node(0, 'さ')]), set(g.nodes[1])) eq_( set([ graph.Node(0, 'あお'), graph.Node(0, 'かお'), graph.Node(0, 'さと'), graph.Node(1, 'と')
def test_create_multichars(self):
    """A single key of multi-character tokens builds the same base/check
    arrays as the single-character case — tokens are treated atomically."""
    searcher = trie.DoubleArray([('しゃ', 'か', 'い')])
    eq_(searcher._base, [0, 0, 0, -1])
    eq_(searcher._check, [-1, 0, 1, 2])
def test_create_contain_common_postfix(self):
    """Two keys sharing the ('b', 'c') suffix still get separate state
    chains — the double array does not merge common postfixes."""
    searcher = trie.DoubleArray([('a', 'b', 'c'), ('d', 'b', 'c')])
    eq_(searcher._base, [0, 0, 0, -1, 3, 3, -1])
    eq_(searcher._check, [-1, 0, 1, 2, 0, 4, 5])
def test_create(self):
    """One three-token key produces a linear chain of internal states.
    (-1 in _base appears to mark a terminal; _check seems to hold the
    parent state index — inferred from these fixtures.)"""
    searcher = trie.DoubleArray([('a', 'b', 'c')])
    eq_(searcher._base, [0, 0, 0, -1])
    eq_(searcher._check, [-1, 0, 1, 2])
def test_prefix_search_over_checklen(self):
    """A query walking past the stored transition chain must not blow up;
    it simply yields no matches."""
    searcher = trie.DoubleArray([('a', 'b', 'c'), ('a', 'd', 'c')])
    eq_(searcher.prefix_search(['a', 'd', 'd']), [])