def train_data(sample_rate=0.01):
    """Yield randomly sampled training pairs read from ``DATA_PATH``.

    Every line of the file is a tab-separated list of entries; each entry
    is split on whitespace, its first field is collected as the word and
    its second field is handed to ``tone.convert_tones``.

    Args:
        sample_rate (float): probability with which a line is kept. The
            default of 0.01 reproduces the previously hard-coded sampling.

    Yields:
        (List[String], List[String]): the concatenated tones of all usable
            entries of one line (the last tone of every entry is replaced
            by its last kana) and the first fields of those entries.
            Lines that yield no tones at all are skipped.
    """
    # The tone groups do not depend on the line, so look them up only once.
    tone_groups = tone.tone_types.values()

    def is_known(char):
        return any(char in group for group in tone_groups)

    with open(DATA_PATH, 'r') as f:
        for line in f:
            # Draw exactly one random number per line, before any parsing.
            if random.random() > sample_rate:
                continue

            # Split every non-blank entry once and reuse the fields below.
            entries = [
                w.split() for w in line.strip().split('\t')
                if w.strip() != ''
            ]
            # Keep only entries whose second field consists solely of
            # characters that appear in one of the tone groups.
            entries = [
                fields for fields in entries
                if all(is_known(c) for c in fields[1])
            ]

            line_tones = []
            line_words = []
            for fields in entries:
                tones, kanas = tone.convert_tones(fields[1])
                if not tones:
                    continue
                # The final position keeps the kana itself instead of its tone.
                tones[-1] = kanas[-1]
                line_tones.extend(tones)
                line_words.append(fields[0])

            if line_tones:
                yield line_tones, line_words
def _create_tone_list():
    """Build the tone lookup tables from the corpus at ``DATA_PATH``.

    Return:
        tone_list (Hash[key, List[String]]): maps every key produced by
            ``_mix_tone_and_kana`` to the words that share it.
        lcounter_2gram (LossyCounter): counter fed with the
            "<third field>_<third field>" pairs of adjacent entries.
        word2pos (Hash[String, String]): third field of the first counted
            entry seen for each word.
    """

    def train_data():
        # Stream every non-blank, tab-separated entry of the corpus.
        with open(DATA_PATH, 'r') as f:
            for line in f:
                for entry in line.strip().split('\t'):
                    if entry.strip() != '':
                        yield entry

    def train_data_2gram():
        # Stream "<a>_<b>" for each pair of neighbouring entries, where <a>
        # and <b> are the third fields; every line is padded with BOS/EOS.
        g = graph.Graph()
        with open(DATA_PATH, 'r') as f:
            for line in f:
                entries = [
                    e for e in line.strip().split('\t') if e.strip() != ''
                ]
                head = ' '.join([g.BOS.word] * 3)
                tail = ' '.join([g.EOS.word] * 3)
                padded = [head] + entries + [tail]
                thirds = [entry.split()[2] for entry in padded]
                yield from [
                    left + '_' + right
                    for left, right in zip(thirds, thirds[1:])
                ]

    lcounter = counter.LossyCounter(epsilon=1e-6)
    lcounter.count(train_data())
    lcounter_2gram = counter.LossyCounter(epsilon=1e-7)
    lcounter_2gram.count(train_data_2gram())
    print(len(lcounter._items))
    print(len(lcounter_2gram._items))

    tone_list = {}
    word2pos = {}
    removed = 0
    for item in lcounter._items:
        fields = item.split()
        tones, kana = tone.convert_tones(fields[1])
        # Skip entries without tones or with mismatched tone/kana lengths.
        if len(tones) == 0 or len(tones) != len(kana):
            removed += 1
            continue
        word = fields[0]
        if word not in word2pos:
            word2pos[word] = fields[2]
        for key in _mix_tone_and_kana(tones, kana):
            tone_list.setdefault(key, []).append(word)

    print("Remove Count:", removed)
    print('Total Count:', sum(len(words) for words in tone_list.values()))
    return tone_list, lcounter_2gram, word2pos
def _process_syntax(line):
    """Return the kana, pronunciation and part of speech groups of a line.

    The line is parsed with MeCab and consecutive words are merged into
    groups: a word that follows a verb group and is not itself one of the
    independent parts of speech is appended to that group, any other word
    starts a new group. Symbols are ignored, and a word for which
    ``tone.convert_tones`` yields no tones closes the current group and is
    dropped.

    Args:
        line (String): parse target strings
    Return:
        result (String): groups formatted as "kana pronounce pos" and
            joined with tab characters. Empty groups are omitted.
    """
    # NOTE: the tagger is re-created on every call, as in the original code.
    mecab.tagger = mecab.MeCab.Tagger("-d ./lib")
    sentence = mecab.parse(line)

    # Parts of speech that always start a new group, even after a verb.
    independent_pos = ('接続詞', '形容詞', '動詞', '名詞', '連体詞', '副詞')
    # Parts of speech whose label also carries the first sub-category.
    detailed_pos = ('名詞', '助詞', '動詞')

    result = []
    ret_kana = ""
    ret_pronounce = ""
    ret_pos = ""
    for word in sentence.words:
        if word.pos == '記号':
            continue
        if len(tone.convert_tones(word.pronounce)[0]) == 0:
            # No tones: close the current group and start from scratch.
            result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)
            ret_kana = ""
            ret_pronounce = ""
            ret_pos = ""
        elif ret_pos.startswith('動詞') and word.pos not in independent_pos:
            # Attach the word to the preceding verb group. ret_pos is
            # necessarily non-empty here, so it keeps the verb label.
            ret_kana += word.surface
            ret_pronounce += word.pronounce
        else:
            result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)
            ret_kana = word.surface
            ret_pronounce = word.pronounce
            if word.pos in detailed_pos:
                ret_pos = word.pos + '-' + word.pos1
            else:
                ret_pos = word.pos
    result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)

    # An empty group is rendered as two spaces, never as "", so the groups
    # have to be compared after stripping to actually drop the empty ones.
    return "\t".join(r for r in result if r.strip() != "")
def get_match_word(yomi, tone_list):
    """Return the words whose tone key best matches the given reading.

    Only keys with the same length as the joined tone string of ``yomi``
    are considered. Each candidate is scored with the larger of
    ``measure_tail_match_num`` and ``measure_initial_match_num``; the first
    candidate with the highest score wins.

    Args:
        yomi (str): target word yomi.
        tone_list (Hash[key, List[String]]): tone dictionary.
    Return:
        words (List[String]): match word list, or an empty list when no
            key of the same length exists.
    """
    tones, _ = tone.convert_tones(yomi)
    tones = "".join(tones)

    candidates = [t for t in tone_list if len(t) == len(tones)]
    if not candidates:
        # Previously this case raised IndexError; returning an empty list
        # mirrors get_match_word_with_searcher.
        return []

    def score(t):
        return max(measure_tail_match_num(tones, t),
                   measure_initial_match_num(tones, t))

    # max() returns the first maximal item, exactly like
    # sorted(..., reverse=True)[0] did.
    best = max(candidates, key=score)
    return tone_list[best]
def get_match_word_with_searcher(yomi, tone_list, prefix_searcher):
    """Return words for a randomly chosen key found by the prefix searcher.

    The tone sequence of ``yomi`` is searched first in full and then with
    one trailing tone removed at a time, until the searcher reports at
    least one hit.

    Args:
        yomi (str): target word yomi.
        tone_list (Hash[key, List[String]]): tone dictionary.
        prefix_searcher (TrieBase): Trie prefix searcher class
    Return:
        words (List[String]): match word list, empty when nothing is found.
    """
    tones, _ = tone.convert_tones(yomi)
    total = len(tones)

    # Try progressively shorter prefixes, longest first.
    for end in range(total, 0, -1):
        hits = prefix_searcher.search(tones[:end], max_len=total)
        if len(hits) != 0:
            return tone_list[random.choice(hits)]
    return []
def generate_rapv2(s, tone_list, prefix_searcher, learner, N=1):
    """Return rap lines generated for the tone sequence of a sentence.

    Args:
        s (String): target sentence.
        tone_list (Hash[Tuple[String], List[String]]): string to string
            dictionary.
        prefix_searcher (TrieBase): Trie Prefix Searcher class
        learner (StructuredLearner): pre-trained structured learner
        N (Int): response numbers.
    Return:
        rap (List[String]): generated rap. Note that an empty string (not
            a list) is returned when the graph search raises
            ``graph.SearchShortestPathError``.
    """
    content_pos = ('名詞', '形容詞', '動詞')

    # Walk the words from last to first and collect one tone chunk per word.
    chunks = []
    keep_tone = False
    for w in reversed(mecab.parse(s).words):
        tones, kana = tone.convert_tones(w.pronounce)
        if len(tones) == 0:
            continue
        if not keep_tone:
            # Replace the word-final tone by the kana itself.
            tones[-1] = kana[-1]
        # A single-tone, non-content word lets the word before it (the next
        # one visited) keep its final tone.
        keep_tone = w.pos not in content_pos and len(tones) == 1
        chunks.append(tones)

    # Restore sentence order: chunks were gathered back to front.
    t = [item for chunk in reversed(chunks) for item in chunk]

    g = graph.Graph.construct_graph(prefix_searcher, tone_list, t)
    g.learner = learner
    try:
        if N == 1:
            paths = [g.search_shortest_path()]
        else:
            paths = g.search_nbest_path(N)
    except graph.SearchShortestPathError:
        return ""

    # The last node of every path is left out of the text.
    return ["".join(node.word for node in path[:-1]) for path in paths]
def testconvert_tones():
    """Check ``tone.convert_tones`` against a table of known readings."""
    # (input reading, expected (tones, kana) pair)
    cases = [
        ('セイタイ',
         (['e', 'i', 'a', 'i'], ['セ', 'イ', 'タ', 'イ'])),
        ('ヤブッ',
         (['a', 'u', 'xtu'], ['ヤ', 'ブ', 'ッ'])),
        ('カリフォルニア',
         (['a', 'i', 'o', 'u', 'i', 'a'],
          ['カ', 'リ', 'フォ', 'ル', 'ニ', 'ア'])),
        ('チョウセツ',
         (['o', 'u', 'e', 'u'], ['チョ', 'ウ', 'セ', 'ツ'])),
        ('ローマテイコク',
         (['o', 'o', 'a', 'e', 'i', 'o', 'u'],
          ['ロ', 'ー', 'マ', 'テ', 'イ', 'コ', 'ク'])),
        ('、',
         ([], [])),
        ('カンタン',
         (['a', 'n', 'a', 'n'], ['カ', 'ン', 'タ', 'ン'])),
        ('カラー',
         (['a', 'a', 'a'], ['カ', 'ラ', 'ー'])),
        ('ー',
         ([], [])),
        ('ヴァンパイア',
         (['a', 'n', 'a', 'i', 'a'], ['ヴァ', 'ン', 'パ', 'イ', 'ア'])),
        ('ョ',
         (['o'], ['ョ'])),
    ]
    for reading, expected in cases:
        eq_(expected, tone.convert_tones(reading))
def testconvert_tones_error_case():
    """Input without any kana is expected to give empty tone and kana lists."""
    expected = ([], [])
    actual = tone.convert_tones('aaaa')
    eq_(expected, actual)