Esempio n. 1
0
 def test_tokenized(tok, pathin, pathout, exp, weight=0):
     """Check that tokenizing the input yields the transducer described by ``exp``.

     Args:
         tok: Tokenizer object; must provide ``tokenize_one_level(pathin)``
             and ``tokenize(pathin, pathout)``.
         pathin: Input string to tokenize.
         pathout: Output string to tokenize together with ``pathin``, or
             ``None`` to tokenize ``pathin`` alone on one level.
         exp: Regular expression, compiled with ``libhfst.regex``, that the
             tokenized transducer is compared against.
         weight: Passed as the second argument to ``libhfst.tokenized_fst``.

     Raises:
         RuntimeError: If the transducer built from the tokenization does not
             compare equal to the one compiled from ``exp``.
     """
     # Identity test: ``None`` is a singleton, so ``is`` is the correct check.
     one_level = pathout is None
     if one_level:
         tokenized = tok.tokenize_one_level(pathin)
     else:
         tokenized = tok.tokenize(pathin, pathout)

     actual = libhfst.tokenized_fst(tokenized, weight)
     if actual.compare(libhfst.regex(exp)):
         return

     # Report the offending input; the output side is appended only when given.
     message = 'test_tokenized failed with input: ' + pathin
     if not one_level:
         message += ", " + pathout
     raise RuntimeError(message)
Esempio n. 2
0
        libhfst.regex('{foo}::-1|{bar}|{baz}::3.5')):
    raise RuntimeError('')
# A dictionary
# Per the expected regex below, each key is an input string and its value
# gives the output(s): a plain string, an (output, weight) tuple, or a tuple
# combining several such alternatives.
if not libhfst.fst({
        'foo': 'foo',
        'bar': ('foo', 1.4),
        'baz': (('foo', -1), 'BAZ')
}).compare(
        libhfst.regex('{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}')):
    # Name the failing check; an empty message gives no hint of what broke.
    raise RuntimeError(
        'libhfst.fst(dict) does not match the expected regex')

# tokenized_fst
# Register 'foo' and 'bar' as multicharacter symbols; the expected regex
# below treats each of them as a single symbol on the input side.
tok = libhfst.HfstTokenizer()
tok.add_multichar_symbol('foo')
tok.add_multichar_symbol('bar')
# Tokenize the input/output string pair and build a transducer from it.
tr = libhfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
if not tr.compare(libhfst.regex('[foo:foo bar:b 0:a 0:z]')):
    # Name the failing check; an empty message gives no hint of what broke.
    raise RuntimeError(
        "tokenized_fst for ('foobar', 'foobaz') does not match "
        "the expected regex")

# HfstBasicTransducer
# Build a small transducer by hand, state by state.
# Create an empty transducer
# The transducer has initially one start state (number zero)
# that is not final
fsm = libhfst.HfstBasicTransducer()
# Add two states to the transducer
fsm.add_state(1)
fsm.add_state(2)
# Create a transition [foo:bar] leading to state 1 with weight 0.1
# (this assignment rebinds the module-level name `tr`)
tr = libhfst.HfstBasicTransition(1, 'foo', 'bar', 0.1)
# and add it to state zero
fsm.add_transition(0, tr)
Esempio n. 3
0
    libhfst.regex("{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4")
):
    raise RuntimeError("")
# A list of (string, weight) tuples; per the expected regex, a weight of 0
# corresponds to an unweighted path.
if not libhfst.fst([("foo", -1), ("bar", 0), ("baz", 3.5)]).compare(libhfst.regex("{foo}::-1|{bar}|{baz}::3.5")):
    # Name the failing check; an empty message gives no hint of what broke.
    raise RuntimeError("libhfst.fst(list of weighted strings) does not match the expected regex")
# A dictionary
# Per the expected regex below, each key is an input string and its value
# gives the output(s): a plain string, an (output, weight) tuple, or a tuple
# combining several such alternatives.
if not libhfst.fst({"foo": "foo", "bar": ("foo", 1.4), "baz": (("foo", -1), "BAZ")}).compare(
    libhfst.regex("{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}")
):
    raise RuntimeError("libhfst.fst(dict) does not match the expected regex")

# tokenized_fst
# Register "foo" and "bar" as multicharacter symbols; the expected regex
# below treats each of them as a single symbol on the input side.
tok = libhfst.HfstTokenizer()
tok.add_multichar_symbol("foo")
tok.add_multichar_symbol("bar")
# Tokenize the input/output string pair and build a transducer from it.
tr = libhfst.tokenized_fst(tok.tokenize("foobar", "foobaz"))
if not tr.compare(libhfst.regex("[foo:foo bar:b 0:a 0:z]")):
    # Name the failing check; an empty message gives no hint of what broke.
    raise RuntimeError(
        "tokenized_fst for ('foobar', 'foobaz') does not match the expected regex"
    )

# HfstBasicTransducer
# Build a small transducer by hand, state by state.
# Create an empty transducer
# The transducer has initially one start state (number zero)
# that is not final
fsm = libhfst.HfstBasicTransducer()
# Add two states to the transducer
fsm.add_state(1)
fsm.add_state(2)
# Create a transition [foo:bar] leading to state 1 with weight 0.1
# (this assignment rebinds the module-level name `tr`)
tr = libhfst.HfstBasicTransition(1, "foo", "bar", 0.1)
# and add it to state zero
fsm.add_transition(0, tr)
Esempio n. 4
0
    # NOTE(review): `tok` is configured above this excerpt; presumably it has
    # multicharacter symbols registered -- confirm against the full file.
    test_tokenized(tok, 'föbär', 'foofö', '[b:f ä:o r:o]')
    test_tokenized(tok, 'fööbär', 'föbar', '[fööbär:b 0:a 0:r]')
    test_tokenized(tok, 'föfööfö', 'föföföföö', '[föö]')

    # Skip symbols: with '?' and ' ' registered, the expected one-level
    # result contains neither of them.
    tok = libhfst.HfstTokenizer()
    tok.add_skip_symbol('?')
    tok.add_skip_symbol(' ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w i s t h i s t o k e n i z e d]')
    # A multi-character skip symbol: once ' is ' is registered, the expected
    # result no longer contains "i s".
    tok.add_skip_symbol(' is ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w t h i s t o k e n i z e d]')

    # With the epsilon symbol registered as a multicharacter symbol, the
    # expected result for the epsilon-prefixed input is just [f o o].
    tok = libhfst.HfstTokenizer()
    tok.add_multichar_symbol(libhfst.EPSILON) # TODO: should this be included by default???
    test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

    # tokenized_fst is also called directly with a list of (input, output)
    # symbol pairs; libhfst.EPSILON on the input side is expected to match
    # 0 in the regex.
    # NOTE(review): get_linenumber is defined outside this excerpt;
    # presumably it returns the current line number -- confirm.
    if not libhfst.tokenized_fst([(libhfst.EPSILON,'b'),('f','a'),('o','a'),('o','r')]).compare(libhfst.regex('[0:b f:a o:a o:r]')):
        raise RuntimeError(get_linenumber())

    # Is this ok???
    # Each special-symbol constant, quoted inside a regex, is expected to be
    # equivalent to the corresponding regex notation (0, ?, ?:?).
    if not libhfst.regex('"' + libhfst.EPSILON + '"').compare(libhfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
    if not libhfst.regex('"' + libhfst.IDENTITY + '"').compare(libhfst.regex('[?]')):
        raise RuntimeError(get_linenumber())
    if not libhfst.regex('"' + libhfst.UNKNOWN + '":"' + libhfst.UNKNOWN + '"').compare(libhfst.regex('[?:?]')):
        raise RuntimeError(get_linenumber())

    # other python functions
    # empty_fst() is compared against '[0-0]' and epsilon_fst() against '[0]'.
    if not libhfst.empty_fst().compare(libhfst.regex('[0-0]')):
        raise RuntimeError(get_linenumber())
    if not libhfst.epsilon_fst().compare(libhfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
Esempio n. 5
0
import libhfst
# Minimal example: tokenize a pair of strings and print the resulting
# transducer.
tok = libhfst.HfstTokenizer()
# Register 'foo' and 'bar' as multicharacter symbols of the tokenizer.
tok.add_multichar_symbol('foo')
tok.add_multichar_symbol('bar')
# Tokenize the input/output string pair and build a transducer from it.
tr = libhfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
print(tr)