Ejemplo n.º 1
0
 def apply(self, line):
     """Tokenize *line*, compose it with the fallback transducer and return the minimized result."""
     tokenizer = hfst.HfstTokenizer()
     # Work on a copy so the stored fallback transducer is not mutated.
     fallback = hfst.HfstTransducer(self.fallbackTransducer)
     fallback.push_weights_to_end()
     result = hfst.tokenized_fst(tokenizer.tokenize(line))
     result.compose(fallback)
     result.minimize()
     return result
Ejemplo n.º 2
0
def str2fst(str):
    """Compile a string into an FST.

    Tokenizes the input with the module-level tokenizer ``tok`` and builds
    a transducer from the token sequence.

    NOTE(review): the parameter shadows the builtin ``str``; it is kept
    unchanged for backward compatibility with keyword callers, but a rename
    (e.g. ``text``) would be preferable.
    """
    tokenized = tok.tokenize(str)
    return hfst.tokenized_fst(tokenized)
Ejemplo n.º 3
0
 def _compose_block(block, delenv, right_tr, tokenizer):
     """Disjunct the tokenized words of *block* into one FST, then compose it
     with *delenv* and *right_tr*, minimizing after every step.
     """
     acc = hfst.empty_fst()
     for word in block:
         acc.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
     acc.minimize()
     # Apply both composition stages in order, keeping the result small.
     for stage in (delenv, right_tr):
         acc.compose(stage)
         acc.minimize()
     return acc
Ejemplo n.º 4
0
 def test_tokenized(tok, pathin, pathout, exp, weight=0):
     """Tokenize *pathin* (paired with *pathout* unless it is None), build an
     FST with the given *weight*, and compare it against the regex *exp*.

     Raises:
         RuntimeError: if the tokenized FST does not equal the expected one.
     """
     # `is None` instead of `== None`; no need to pre-initialize `tokenized`.
     if pathout is None:
         tokenized = tok.tokenize_one_level(pathin)
     else:
         tokenized = tok.tokenize(pathin, pathout)
     if not hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
         # Build the message once instead of duplicating the raise statement.
         msg = 'test_tokenized failed with input: ' + pathin
         if pathout is not None:
             msg += ", " + pathout
         raise RuntimeError(msg)
Ejemplo n.º 5
0
 def test_tokenized(tok, pathin, pathout, exp, weight=0):
     """Tokenize *pathin* (paired with *pathout* unless it is None), build an
     FST with the given *weight*, and compare it against the regex *exp*.

     Raises:
         RuntimeError: if the tokenized FST does not equal the expected one.
     """
     # `is None` instead of `== None`; no need to pre-initialize `tokenized`.
     if pathout is None:
         tokenized = tok.tokenize_one_level(pathin)
     else:
         tokenized = tok.tokenize(pathin, pathout)
     if not hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
         # Build the message once instead of duplicating the raise statement.
         msg = 'test_tokenized failed with input: ' + pathin
         if pathout is not None:
             msg += ", " + pathout
         raise RuntimeError(msg)
Ejemplo n.º 6
0
    # Multichar/pair tokenization cases; expected results are regex strings.
    test_tokenized(tok, 'föbär', 'foofö', '[b:f ä:o r:o]')
    test_tokenized(tok, 'fööbär', 'föbar', '[fööbär:b 0:a 0:r]')
    test_tokenized(tok, 'föfööfö', 'föföföföö', '[föö]')

    # Skip symbols are omitted from the tokenization (the '?' and spaces
    # disappear from the expected outputs below).
    tok = hfst.HfstTokenizer()
    tok.add_skip_symbol('?')
    tok.add_skip_symbol(' ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w i s t h i s t o k e n i z e d]')
    tok.add_skip_symbol(' is ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w t h i s t o k e n i z e d]')

    # The epsilon symbol must be registered as a multichar symbol for the
    # tokenizer to treat '@_EPSILON_SYMBOL_@' as one token.
    tok = hfst.HfstTokenizer()
    tok.add_multichar_symbol(hfst.EPSILON) # TODO: should this be included by default???
    test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

    # tokenized_fst also accepts an explicit list of (input, output) pairs.
    if not hfst.tokenized_fst([(hfst.EPSILON,'b'),('f','a'),('o','a'),('o','r')]).compare(hfst.regex('[0:b f:a o:a o:r]')):
        raise RuntimeError(get_linenumber())

    # Is this ok???
    if not hfst.regex('"' + hfst.EPSILON + '"').compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.IDENTITY + '"').compare(hfst.regex('[?]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.UNKNOWN + '":"' + hfst.UNKNOWN + '"').compare(hfst.regex('[?:?]')):
        raise RuntimeError(get_linenumber())

    # other python functions
    if not hfst.empty_fst().compare(hfst.regex('[0-0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.epsilon_fst().compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
Ejemplo n.º 7
0
            raise RuntimeError('')
        # hfst.fst on a tuple/list disjuncts its members; a (string, weight)
        # pair attaches the weight to that member.
        if not hfst.fst(('foo', ('bar',5.0))).compare(hfst.regex('{foo}|{bar}::5.0')):
            raise RuntimeError('')
        if not hfst.fst(('foo', ('bar',5.0), 'baz', 'Foo', ('Bar',2.4))).compare(hfst.regex('{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4')):
            raise RuntimeError('')
        if not hfst.fst([('foo',-1), ('bar',0), ('baz',3.5)]).compare(hfst.regex('{foo}::-1|{bar}|{baz}::3.5')):
            raise RuntimeError('')
        # A dictionary
        if not hfst.fst({'foo':'foo', 'bar':('foo',1.4), 'baz':(('foo',-1),'BAZ')}).compare(hfst.regex('{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}')):
            raise RuntimeError('')

        # tokenized_fst
        # Registered multichar symbols are kept whole while the remaining
        # characters are tokenized one by one.
        tok = hfst.HfstTokenizer()
        tok.add_multichar_symbol('foo')
        tok.add_multichar_symbol('bar')
        tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
        if not tr.compare(hfst.regex('[foo:foo bar:b 0:a 0:z]')):
            raise RuntimeError('')

        # HfstBasicTransducer
        # Create an empty transducer
        # The transducer has initially one start state (number zero)
        # that is not final
        fsm = hfst.HfstBasicTransducer()
        # Add two states to the transducer
        fsm.add_state(1)
        fsm.add_state(2)
        # Create a transition [foo:bar] leading to state 1 with weight 0.1
        tr = hfst.HfstBasicTransition(1, 'foo', 'bar', 0.1)
        # and add it to state zero
        fsm.add_transition(0, tr)
Ejemplo n.º 8
0
    # Multichar/pair tokenization cases; expected results are regex strings.
    test_tokenized(tok, 'föbär', 'foofö', '[b:f ä:o r:o]')
    test_tokenized(tok, 'fööbär', 'föbar', '[fööbär:b 0:a 0:r]')
    test_tokenized(tok, 'föfööfö', 'föföföföö', '[föö]')

    # Skip symbols are omitted from the tokenization (the '?' and spaces
    # disappear from the expected outputs below).
    tok = hfst.HfstTokenizer()
    tok.add_skip_symbol('?')
    tok.add_skip_symbol(' ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w i s t h i s t o k e n i z e d]')
    tok.add_skip_symbol(' is ')
    test_tokenized(tok, 'How is this tokenized?', None, '[H o w t h i s t o k e n i z e d]')

    # The epsilon symbol must be registered as a multichar symbol for the
    # tokenizer to treat '@_EPSILON_SYMBOL_@' as one token.
    tok = hfst.HfstTokenizer()
    tok.add_multichar_symbol(hfst.EPSILON) # TODO: should this be included by default???
    test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

    # tokenized_fst also accepts an explicit list of (input, output) pairs.
    if not hfst.tokenized_fst([(hfst.EPSILON,'b'),('f','a'),('o','a'),('o','r')]).compare(hfst.regex('[0:b f:a o:a o:r]')):
        raise RuntimeError(get_linenumber())

    # Is this ok???
    if not hfst.regex('"' + hfst.EPSILON + '"').compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.IDENTITY + '"').compare(hfst.regex('[?]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.UNKNOWN + '":"' + hfst.UNKNOWN + '"').compare(hfst.regex('[?:?]')):
        raise RuntimeError(get_linenumber())

    # other python functions
    if not hfst.empty_fst().compare(hfst.regex('[0-0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.epsilon_fst().compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
Ejemplo n.º 9
0
            raise RuntimeError('')
        # hfst.fst on a tuple/list disjuncts its members; a (string, weight)
        # pair attaches the weight to that member.
        if not hfst.fst(('foo', ('bar',5.0))).compare(hfst.regex('{foo}|{bar}::5.0')):
            raise RuntimeError('')
        if not hfst.fst(('foo', ('bar',5.0), 'baz', 'Foo', ('Bar',2.4))).compare(hfst.regex('{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4')):
            raise RuntimeError('')
        if not hfst.fst([('foo',-1), ('bar',0), ('baz',3.5)]).compare(hfst.regex('{foo}::-1|{bar}|{baz}::3.5')):
            raise RuntimeError('')
        # A dictionary
        if not hfst.fst({'foo':'foo', 'bar':('foo',1.4), 'baz':(('foo',-1),'BAZ')}).compare(hfst.regex('{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}')):
            raise RuntimeError('')

        # tokenized_fst
        # Registered multichar symbols are kept whole while the remaining
        # characters are tokenized one by one.
        tok = hfst.HfstTokenizer()
        tok.add_multichar_symbol('foo')
        tok.add_multichar_symbol('bar')
        tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
        if not tr.compare(hfst.regex('[foo:foo bar:b 0:a 0:z]')):
            raise RuntimeError('')

        # HfstBasicTransducer
        # Create an empty transducer
        # The transducer has initially one start state (number zero)
        # that is not final
        fsm = hfst.HfstBasicTransducer()
        # Add two states to the transducer
        fsm.add_state(1)
        fsm.add_state(2)
        # Create a transition [foo:bar] leading to state 1 with weight 0.1
        tr = hfst.HfstBasicTransition(1, 'foo', 'bar', 0.1)
        # and add it to state zero
        fsm.add_transition(0, tr)
Ejemplo n.º 10
0
     weight = float(line_and_weight[1])
     line = line_and_weight[0]
 tr = None
 # Not in pair-string mode: an "input:output" line builds a cross product of
 # the two sides; a line without ':' is compiled directly.
 if not pairstrings:
     input_and_output = line.split(':')
     if len(input_and_output) == 2:
         # NOTE(review): .strip() already removes trailing whitespace, so the
         # chained .rstrip() calls are redundant.
         input_and_output[0] = input_and_output[0].strip().rstrip()
         input_and_output[1] = input_and_output[1].strip().rstrip()
         if not has_spaces:
             tr = hfst.fst(input_and_output[0])
             tr2 = hfst.fst(input_and_output[1])
             tr.cross_product(tr2)
         else:
             # Space-separated tokens: build each side from its token list.
             inputstr = input_and_output[0].split(' ')
             outputstr = input_and_output[1].split(' ')
             tr = hfst.tokenized_fst(inputstr)
             tr2 = hfst.tokenized_fst(outputstr)
             tr.cross_product(tr2)
     else:
         if not has_spaces:
             tr = hfst.fst(line)
         else:
             line = line.split(' ')
             tr = hfst.tokenized_fst(line)
 # Pair-string mode with spaces: each space-separated token is "input:output";
 # a bare token is treated as an identity pair.
 elif has_spaces:
     line = line.split(' ')
     symbols = []
     for pair in line:
         symbol_pair = pair.split(':')
         if len(symbol_pair) == 1: # HFST 4.0: accept also single symbols (identities)
             symbol_pair.append(symbol_pair[0])