def test_tokenized(tok, pathin, pathout, exp, weight=0):
    """Check that tokenizing the input yields a transducer equal to ``exp``.

    When ``pathout`` is None, ``pathin`` is tokenized on its own with
    ``tok.tokenize_one_level``; otherwise ``pathin`` and ``pathout`` are
    tokenized together with ``tok.tokenize``. The tokenization is turned into
    a transducer with ``libhfst.tokenized_fst`` and compared against the
    transducer compiled from ``exp`` by ``libhfst.regex``.

    Args:
        tok: Tokenizer object providing ``tokenize_one_level`` and
            ``tokenize``.
        pathin: Input string to tokenize.
        pathout: Output string, or None to tokenize ``pathin`` alone.
        exp: Regular expression describing the expected transducer.
        weight: Forwarded as the second argument of
            ``libhfst.tokenized_fst``. Defaults to 0.

    Raises:
        RuntimeError: If the two transducers do not compare equal. The
            message names ``pathin`` and, when given, ``pathout``.
    """
    # Identity test: `is None` is the idiomatic singleton check and cannot be
    # influenced by a custom __eq__ on the argument.
    one_level = pathout is None

    if one_level:
        tokenized = tok.tokenize_one_level(pathin)
    else:
        tokenized = tok.tokenize(pathin, pathout)

    if libhfst.tokenized_fst(tokenized, weight).compare(libhfst.regex(exp)):
        return

    # Build the failure message once; the output string is appended only when
    # one was supplied.
    message = 'test_tokenized failed with input: ' + pathin
    if not one_level:
        message += ", " + pathout
    raise RuntimeError(message)
# NOTE(review): this line is a whitespace-collapsed excerpt of what looks like
# a libhfst test script.
# - It opens with the closing part of a `.compare(libhfst.regex(...))` check
#   whose beginning is not on this line, so it cannot be parsed on its own.
# - With the line breaks gone, Python reads everything after the first '#' as
#   a single comment; the original line breaks must be restored before any of
#   the statements below can run.
# Statements visible in the excerpt, in order:
#   1. A transducer built by libhfst.fst() from a dictionary is compared with
#      a regex; RuntimeError('') is raised when they differ.
#   2. An HfstTokenizer with multichar symbols 'foo' and 'bar' tokenizes the
#      pair 'foobar' / 'foobaz'; the resulting tokenized_fst is compared with
#      the regex '[foo:foo bar:b 0:a 0:z]'.
#   3. An HfstBasicTransducer gets states 1 and 2, and the transition
#      HfstBasicTransition(1, 'foo', 'bar', 0.1) is added to state 0.
libhfst.regex('{foo}::-1|{bar}|{baz}::3.5')): raise RuntimeError('') # A dictionary if not libhfst.fst({ 'foo': 'foo', 'bar': ('foo', 1.4), 'baz': (('foo', -1), 'BAZ') }).compare( libhfst.regex('{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}')): raise RuntimeError('') # tokenized_fst tok = libhfst.HfstTokenizer() tok.add_multichar_symbol('foo') tok.add_multichar_symbol('bar') tr = libhfst.tokenized_fst(tok.tokenize('foobar', 'foobaz')) if not tr.compare(libhfst.regex('[foo:foo bar:b 0:a 0:z]')): raise RuntimeError('') # HfstBasicTransducer # Create an empty transducer # The transducer has initially one start state (number zero) # that is not final fsm = libhfst.HfstBasicTransducer() # Add two states to the transducer fsm.add_state(1) fsm.add_state(2) # Create a transition [foo:bar] leading to state 1 with weight 0.1 tr = libhfst.HfstBasicTransition(1, 'foo', 'bar', 0.1) # and add it to state zero fsm.add_transition(0, tr)
# NOTE(review): this line is a whitespace-collapsed excerpt of what looks like
# a libhfst test script.
# - It opens with the closing part of a `.compare(libhfst.regex(...))` check
#   whose beginning is not on this line, so it cannot be parsed on its own.
# - With the line breaks gone, Python reads everything after the first '#' as
#   a single comment; the original line breaks must be restored before any of
#   the statements below can run.
# Statements visible in the excerpt, in order:
#   1. A transducer built by libhfst.fst() from a list of (string, number)
#      tuples is compared with a regex; RuntimeError("") is raised when they
#      differ.
#   2. The same kind of check for a transducer built from a dictionary.
#   3. An HfstTokenizer with multichar symbols "foo" and "bar" tokenizes the
#      pair "foobar" / "foobaz"; the resulting tokenized_fst is compared with
#      the regex "[foo:foo bar:b 0:a 0:z]".
#   4. An HfstBasicTransducer gets states 1 and 2, and the transition
#      HfstBasicTransition(1, "foo", "bar", 0.1) is added to state 0.
libhfst.regex("{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4") ): raise RuntimeError("") if not libhfst.fst([("foo", -1), ("bar", 0), ("baz", 3.5)]).compare(libhfst.regex("{foo}::-1|{bar}|{baz}::3.5")): raise RuntimeError("") # A dictionary if not libhfst.fst({"foo": "foo", "bar": ("foo", 1.4), "baz": (("foo", -1), "BAZ")}).compare( libhfst.regex("{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}") ): raise RuntimeError("") # tokenized_fst tok = libhfst.HfstTokenizer() tok.add_multichar_symbol("foo") tok.add_multichar_symbol("bar") tr = libhfst.tokenized_fst(tok.tokenize("foobar", "foobaz")) if not tr.compare(libhfst.regex("[foo:foo bar:b 0:a 0:z]")): raise RuntimeError("") # HfstBasicTransducer # Create an empty transducer # The transducer has initially one start state (number zero) # that is not final fsm = libhfst.HfstBasicTransducer() # Add two states to the transducer fsm.add_state(1) fsm.add_state(2) # Create a transition [foo:bar] leading to state 1 with weight 0.1 tr = libhfst.HfstBasicTransition(1, "foo", "bar", 0.1) # and add it to state zero fsm.add_transition(0, tr)
# Two-level tokenization checks. `tok` is whatever tokenizer was set up before
# this point in the script.
test_tokenized(tok, 'föbär', 'foofö', '[b:f ä:o r:o]')
test_tokenized(tok, 'fööbär', 'föbar', '[fööbär:b 0:a 0:r]')
test_tokenized(tok, 'föfööfö', 'föföföföö', '[föö]')

# Fresh tokenizer with '?' and ' ' registered as skip symbols.
tok = libhfst.HfstTokenizer()
for skip_symbol in ('?', ' '):
    tok.add_skip_symbol(skip_symbol)
test_tokenized(tok, 'How is this tokenized?', None, '[H o w i s t h i s t o k e n i z e d]')

# Additionally register the multi-character sequence ' is ' as a skip symbol.
tok.add_skip_symbol(' is ')
test_tokenized(tok, 'How is this tokenized?', None, '[H o w t h i s t o k e n i z e d]')

# Fresh tokenizer that knows the epsilon symbol as a multichar symbol.
tok = libhfst.HfstTokenizer()
tok.add_multichar_symbol(libhfst.EPSILON)  # TODO: should this be included by default???
test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

# tokenized_fst built directly from a list of symbol pairs.
symbol_pairs = [(libhfst.EPSILON, 'b'), ('f', 'a'), ('o', 'a'), ('o', 'r')]
if not libhfst.tokenized_fst(symbol_pairs).compare(libhfst.regex('[0:b f:a o:a o:r]')):
    raise RuntimeError(get_linenumber())

# Quoted special symbols inside regular expressions.
# Is this ok???
quoted_epsilon = '"' + libhfst.EPSILON + '"'
if not libhfst.regex(quoted_epsilon).compare(libhfst.regex('[0]')):
    raise RuntimeError(get_linenumber())

quoted_identity = '"' + libhfst.IDENTITY + '"'
if not libhfst.regex(quoted_identity).compare(libhfst.regex('[?]')):
    raise RuntimeError(get_linenumber())

quoted_unknown_pair = '"' + libhfst.UNKNOWN + '":"' + libhfst.UNKNOWN + '"'
if not libhfst.regex(quoted_unknown_pair).compare(libhfst.regex('[?:?]')):
    raise RuntimeError(get_linenumber())

# other python functions
if not libhfst.empty_fst().compare(libhfst.regex('[0-0]')):
    raise RuntimeError(get_linenumber())

if not libhfst.epsilon_fst().compare(libhfst.regex('[0]')):
    raise RuntimeError(get_linenumber())
import libhfst

# Set up a tokenizer with 'foo' and 'bar' registered as multichar symbols.
tok = libhfst.HfstTokenizer()
for multichar_symbol in ('foo', 'bar'):
    tok.add_multichar_symbol(multichar_symbol)

# Tokenize the string pair, build a transducer from the result and print it.
tr = libhfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
print(tr)