# Example 1
def test_compose_token_and_lexicon_fst(workdir, words_without_homophones):
    """Token FST composed with a homophone-free lexicon FST determinizes.

    With no homophones in the vocabulary the composed transducer is
    functional, so ``pywrapfst.determinize`` must succeed without raising.
    """
    vocabulary = get_vocabulary_table(workdir, words_without_homophones)

    table = PhonemeTable()
    table.add_labels(phonemes)

    lexicon_fst = get_lexicon(words_without_homophones).create_fst(
        table, vocabulary)
    token_fst = Token().create_fst(table)

    # Composition requires the left operand sorted on output labels.
    composed = pywrapfst.compose(token_fst.arcsort('olabel'), lexicon_fst)
    pywrapfst.determinize(composed)
# Example 2
def test_compose_token_and_lexicon_fst_with_homophones(workdir,
                                                       words_with_homophones):
    """Homophones make the composed FST non-determinizable.

    A lexicon containing homophones maps one phoneme sequence to several
    words, so determinization of the token∘lexicon composition must fail
    with ``pywrapfst.FstOpError``.
    """
    vocabulary = get_vocabulary_table(workdir, words_with_homophones)

    table = PhonemeTable()
    table.add_labels(phonemes)

    lexicon_fst = get_lexicon(words_with_homophones).create_fst(
        table, vocabulary, min_freq=0)
    token_fst = Token().create_fst(table)

    composed = pywrapfst.compose(token_fst.arcsort('olabel'), lexicon_fst)
    with pytest.raises(pywrapfst.FstOpError):
        pywrapfst.determinize(composed)
# Example 3
def test_wfst_decoder_decode(workdir, words_for_corpus_with_homophones):
    """End-to-end decode: frame-level phoneme labels produce the right text."""
    corpus_path = os.path.join(workdir, 'corpus.txt')
    create_corpus(corpus_path, words_for_corpus_with_homophones)

    vocab_path = os.path.join(workdir, 'vocab.syms')
    vocab = create_vocabulary_symbol_table(vocab_path, corpus_path)

    table = PhonemeTable()
    table.add_labels(phonemes)

    lexicon_fst = get_lexicon(words_for_corpus_with_homophones).create_fst(
        table, vocab, min_freq=0)
    token_fst = Token().create_fst(table)

    grammar_fst = Grammar().create_fst(
        os.path.join(workdir, 'grammar.fst'), vocab_path, corpus_path)

    decoder = WFSTDecoder()
    decoder.create_fst(token_fst, lexicon_fst, grammar_fst)

    # Frame-synchronous label sequence (CTC-style, with blanks and
    # repeated labels) that should decode to the sentence below.
    blank = table.get_blank_id()
    label = {p: table.get_label_id(p) for p in 'aidesomru'}
    frame_labels = [
        blank, blank,
        label['a'], label['a'],
        label['i'], label['i'], label['i'],
        label['d'], label['e'],
        blank,
        label['s'], label['s'],
        label['o'], label['o'], label['o'],
        label['m'], label['e'],
        label['r'], label['r'],
        label['u'],
    ]
    assert decoder.decode(frame_labels, vocab) == '藍で染める'
# Example 4
def test_wfst_decoder_create_fst(workdir, words_for_corpus_without_homophones):
    """Building the full decoder FST from token, lexicon and grammar succeeds."""
    corpus_path = os.path.join(workdir, 'corpus.txt')
    create_corpus(corpus_path, words_for_corpus_without_homophones)

    vocab_path = os.path.join(workdir, 'vocab.syms')
    vocab = create_vocabulary_symbol_table(vocab_path, corpus_path)

    table = PhonemeTable()
    table.add_labels(phonemes)

    lexicon_fst = get_lexicon(words_for_corpus_without_homophones).create_fst(
        table, vocab, min_freq=0)
    token_fst = Token().create_fst(table)

    grammar_fst = Grammar().create_fst(
        os.path.join(workdir, 'grammar.fst'), vocab_path, corpus_path)

    # Smoke test: creation must complete without raising.
    WFSTDecoder().create_fst(token_fst, lexicon_fst, grammar_fst)
# Example 5
def test_token_create_fst():
    """Verify the full topology of a token FST built over two phonemes."""
    table = PhonemeTable()
    table.add_labels(['a', 'i'])
    eps = table.get_epsilon_id()
    blank = table.get_blank_id()
    a = table.get_label_id('a')
    i = table.get_label_id('i')

    fst = Token().create_fst(table)
    assert fst.num_states() == 5

    # Expected (ilabel, olabel, next_state) triples for every state,
    # in the order the arcs are stored.
    expected = {
        0: [(blank, eps, 0), (a, a, 3), (i, i, 4)],  # start state
        1: [(blank, eps, 1), (eps, eps, 2)],         # second state
        2: [(eps, eps, 0)],                          # final (auxiliary) state
        3: [(a, eps, 3), (eps, eps, 1)],             # self-loop for 'a'
        4: [(i, eps, 4), (eps, eps, 1)],             # self-loop for 'i'
    }
    for state, arcs in expected.items():
        assert fst.num_arcs(state) == len(arcs)
        for arc, (ilabel, olabel, nextstate) in zip(fst.arcs(state), arcs):
            is_expected_arc(arc, ilabel, olabel, nextstate)
# Example 6
# Build the complete WFST decoder pipeline: vocabulary symbols -> lexicon
# FST -> token FST -> grammar FST -> composed decoder FST, writing each
# artifact under args.workdir.
# NOTE(review): `phoneme_table`, `phonemes`, and `args` are assumed to be
# created earlier in this script (outside this excerpt) — confirm.
phoneme_table.add_labels(phonemes)
print('Creating vocabulary symbol ...')
corpus_path = os.path.join(args.workdir, args.corpus_file)
vocabulary_symbol_path = os.path.join(args.workdir,
                                      args.vocabulary_symbol_file)
# Create the symbol table on disk, then load it back for use below.
VocabularySymbolTable.create_symbol(vocabulary_symbol_path, corpus_path)
vocabulary_symbol_table = VocabularySymbolTable.load_symbol(
    vocabulary_symbol_path)
print('Creating lexicon FST ...')
lexicon_path = os.path.join(args.workdir, args.lexicon_file)
lexicon_fst_filepath = os.path.join(args.workdir, args.lexicon_fst_file)
lexicon = Lexicon()
lexicon.load(lexicon_path)
lexicon_fst = lexicon.create_fst(phoneme_table, vocabulary_symbol_table)
lexicon_fst.write(lexicon_fst_filepath)
print('Creating token FST ...')
token_fst_path = os.path.join(args.workdir, args.token_fst_file)
token = Token()
token_fst = token.create_fst(phoneme_table)
token_fst.write(token_fst_path)
print('Creating grammar FST ...')
grammar_fst_path = os.path.join(args.workdir, args.grammar_fst_file)
grammar = Grammar()
# Grammar.create_fst takes file paths (not loaded objects) and is given
# the grammar output path directly; it is not written separately here.
grammar_fst = grammar.create_fst(grammar_fst_path, vocabulary_symbol_path,
                                 corpus_path)
print('Creating decoder ...')
decoder_fst_path = os.path.join(args.workdir, args.decoder_fst_file)
# Compose token, lexicon and grammar FSTs into the final decoder graph.
wfst_decoder = WFSTDecoder()
wfst_decoder.create_fst(token_fst, lexicon_fst, grammar_fst)
wfst_decoder.write_fst(decoder_fst_path)