def test_compose_token_and_lexicon_fst(workdir, words_without_homophones):
    """Without homophones, T∘L composes and determinizes cleanly."""
    vocab = get_vocabulary_table(workdir, words_without_homophones)
    lexicon = get_lexicon(words_without_homophones)
    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(phonemes)
    lexicon_fst = lexicon.create_fst(phoneme_table, vocab)
    token = Token()
    token_fst = token.create_fst(phoneme_table)
    # Arc-sort the token FST on output labels so it can be composed
    # with the lexicon FST.
    fst = pywrapfst.compose(token_fst.arcsort('olabel'), lexicon_fst)
    # Determinization succeeds because every phoneme sequence maps to
    # exactly one word.
    fst = pywrapfst.determinize(fst)
def test_compose_token_and_lexicon_fst_with_homophones(workdir, words_with_homophones):
    """With homophones, T∘L is non-functional and determinization fails."""
    vocab = get_vocabulary_table(workdir, words_with_homophones)
    lexicon = get_lexicon(words_with_homophones)
    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(phonemes)
    lexicon_fst = lexicon.create_fst(phoneme_table, vocab, min_freq=0)
    token = Token()
    token_fst = token.create_fst(phoneme_table)
    fst = pywrapfst.compose(token_fst.arcsort('olabel'), lexicon_fst)
    # Homophones give one phoneme sequence two distinct word outputs,
    # so the composed transducer is not functional and cannot be
    # determinized.
    with pytest.raises(pywrapfst.FstOpError):
        pywrapfst.determinize(fst)
def test_wfst_decoder_decode(workdir, words_for_corpus_with_homophones):
    corpus_path = os.path.join(workdir, 'corpus.txt')
    create_corpus(corpus_path, words_for_corpus_with_homophones)
    vocab_path = os.path.join(workdir, 'vocab.syms')
    vocab = create_vocabulary_symbol_table(vocab_path, corpus_path)

    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(phonemes)
    lexicon = get_lexicon(words_for_corpus_with_homophones)
    lexicon_fst = lexicon.create_fst(phoneme_table, vocab, min_freq=0)
    token = Token()
    token_fst = token.create_fst(phoneme_table)
    grammar_path = os.path.join(workdir, 'grammar.fst')
    grammar = Grammar()
    grammar_fst = grammar.create_fst(grammar_path, vocab_path, corpus_path)

    wfst_decoder = WFSTDecoder()
    wfst_decoder.create_fst(token_fst, lexicon_fst, grammar_fst)

    blank_id = phoneme_table.get_blank_id()
    a = phoneme_table.get_label_id('a')
    i = phoneme_table.get_label_id('i')
    d = phoneme_table.get_label_id('d')
    e = phoneme_table.get_label_id('e')
    s = phoneme_table.get_label_id('s')
    o = phoneme_table.get_label_id('o')
    m = phoneme_table.get_label_id('m')
    r = phoneme_table.get_label_id('r')
    u = phoneme_table.get_label_id('u')
    # Frame-level CTC labels for the phoneme sequence 'a i d e s o m e r u';
    # repeated labels and blanks collapse during decoding.
    frame_labels = [
        blank_id, blank_id, a, a, i, i, i, d, e, blank_id,
        s, s, o, o, o, m, e, r, r, u,
    ]
    got = wfst_decoder.decode(frame_labels, vocab)
    assert got == '藍で染める'
def test_wfst_decoder_create_fst(workdir, words_for_corpus_without_homophones):
    """Smoke test: building the decoder FST from T, L, and G succeeds."""
    corpus_path = os.path.join(workdir, 'corpus.txt')
    create_corpus(corpus_path, words_for_corpus_without_homophones)
    vocab_path = os.path.join(workdir, 'vocab.syms')
    vocab = create_vocabulary_symbol_table(vocab_path, corpus_path)

    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(phonemes)
    lexicon = get_lexicon(words_for_corpus_without_homophones)
    lexicon_fst = lexicon.create_fst(phoneme_table, vocab, min_freq=0)
    token = Token()
    token_fst = token.create_fst(phoneme_table)
    grammar_path = os.path.join(workdir, 'grammar.fst')
    grammar = Grammar()
    grammar_fst = grammar.create_fst(grammar_path, vocab_path, corpus_path)

    wfst_decoder = WFSTDecoder()
    wfst_decoder.create_fst(token_fst, lexicon_fst, grammar_fst)
def test_token_create_fst():
    """The token FST implements CTC collapsing: each label state loops on
    its own label emitting epsilon, and blanks map to epsilon."""
    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(['a', 'i'])
    epsilon_id = phoneme_table.get_epsilon_id()
    blank_id = phoneme_table.get_blank_id()
    a = phoneme_table.get_label_id('a')
    i = phoneme_table.get_label_id('i')

    fst = Token().create_fst(phoneme_table)
    assert fst.num_states() == 5

    # start state
    state = 0
    assert fst.num_arcs(state) == 3
    gen_arc = fst.arcs(state)
    is_expected_arc(next(gen_arc), blank_id, epsilon_id, state)
    is_expected_arc(next(gen_arc), a, a, 3)
    is_expected_arc(next(gen_arc), i, i, 4)

    # second state
    state = 1
    assert fst.num_arcs(state) == 2
    gen_arc = fst.arcs(state)
    is_expected_arc(next(gen_arc), blank_id, epsilon_id, state)
    is_expected_arc(next(gen_arc), epsilon_id, epsilon_id, 2)

    # final (auxiliary) state
    state = 2
    assert fst.num_arcs(state) == 1
    gen_arc = fst.arcs(state)
    is_expected_arc(next(gen_arc), epsilon_id, epsilon_id, 0)

    # state for label 'a'
    state = 3
    assert fst.num_arcs(state) == 2
    gen_arc = fst.arcs(state)
    is_expected_arc(next(gen_arc), a, epsilon_id, state)
    is_expected_arc(next(gen_arc), epsilon_id, epsilon_id, 1)

    # state for label 'i'
    state = 4
    assert fst.num_arcs(state) == 2
    gen_arc = fst.arcs(state)
    is_expected_arc(next(gen_arc), i, epsilon_id, state)
    is_expected_arc(next(gen_arc), epsilon_id, epsilon_id, 1)
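# The tests above call an `is_expected_arc` helper that is not shown in this
# section. A minimal sketch of what it presumably checks, assuming pywrapfst
# arcs with `ilabel`/`olabel`/`nextstate` attributes (the name and signature
# are taken from usage; the real helper may differ):
def is_expected_arc(arc, ilabel, olabel, nextstate):
    """Assert a pywrapfst arc carries the expected labels and destination."""
    assert arc.ilabel == ilabel
    assert arc.olabel == olabel
    assert arc.nextstate == nextstate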
phoneme_table.add_labels(phonemes)

print('Creating vocabulary symbol table ...')
corpus_path = os.path.join(args.workdir, args.corpus_file)
vocabulary_symbol_path = os.path.join(args.workdir, args.vocabulary_symbol_file)
VocabularySymbolTable.create_symbol(vocabulary_symbol_path, corpus_path)
vocabulary_symbol_table = VocabularySymbolTable.load_symbol(
    vocabulary_symbol_path)

print('Creating lexicon FST ...')
lexicon_path = os.path.join(args.workdir, args.lexicon_file)
lexicon_fst_path = os.path.join(args.workdir, args.lexicon_fst_file)
lexicon = Lexicon()
lexicon.load(lexicon_path)
lexicon_fst = lexicon.create_fst(phoneme_table, vocabulary_symbol_table)
lexicon_fst.write(lexicon_fst_path)

print('Creating token FST ...')
token_fst_path = os.path.join(args.workdir, args.token_fst_file)
token = Token()
token_fst = token.create_fst(phoneme_table)
token_fst.write(token_fst_path)

print('Creating grammar FST ...')
grammar_fst_path = os.path.join(args.workdir, args.grammar_fst_file)
grammar = Grammar()
grammar_fst = grammar.create_fst(grammar_fst_path, vocabulary_symbol_path,
                                 corpus_path)

print('Creating decoder ...')
decoder_fst_path = os.path.join(args.workdir, args.decoder_fst_file)
wfst_decoder = WFSTDecoder()
wfst_decoder.create_fst(token_fst, lexicon_fst, grammar_fst)
wfst_decoder.write_fst(decoder_fst_path)
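# A hedged sketch of how the artifacts written above might be used at decode
# time, mirroring the decode test earlier in this section. `read_fst` is
# assumed here to mirror `write_fst`; the actual loading API may differ.
def decode_utterance(frame_labels, workdir, decoder_fst_file,
                     vocabulary_symbol_file):
    """Map frame-level CTC labels to text with a prebuilt decoder FST."""
    vocab = VocabularySymbolTable.load_symbol(
        os.path.join(workdir, vocabulary_symbol_file))
    decoder = WFSTDecoder()
    decoder.read_fst(os.path.join(workdir, decoder_fst_file))  # assumed API
    return decoder.decode(frame_labels, vocab)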