def apply(self, line):
    """Build an FST from *line* and compose it with the fallback transducer.

    The input line is tokenized into a one-path FST, composed with a
    weight-pushed copy of ``self.fallbackTransducer``, minimized, and
    returned.
    """
    tokenizer = hfst.HfstTokenizer()
    # Copy first so the cached fallback transducer itself is never mutated.
    fallback = hfst.HfstTransducer(self.fallbackTransducer)
    fallback.push_weights_to_end()
    result = hfst.tokenized_fst(tokenizer.tokenize(line))
    result.compose(fallback)
    result.minimize()
    return result
def str2fst(str):
    """Compile a string into an FST via the module-level tokenizer ``tok``.

    NOTE(review): the parameter name shadows the builtin ``str``; it is kept
    unchanged here to preserve the public interface for keyword callers.
    """
    return hfst.tokenized_fst(tok.tokenize(str))
def _compose_block(block, delenv, right_tr, tokenizer):
    """Disjunct the tokenized words of *block* into one FST, then compose
    it with *delenv* and *right_tr*, minimizing after each stage."""
    result = hfst.empty_fst()
    for word in block:
        result.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
    result.minimize()
    # Apply both composition stages in order, keeping the FST minimal.
    for stage in (delenv, right_tr):
        result.compose(stage)
        result.minimize()
    return result
def test_tokenized(tok, pathin, pathout, exp, weight=0):
    """Tokenize *pathin* (one level, or paired with *pathout* when given),
    build an FST with the given *weight*, and compare it against the
    regular expression *exp*.

    Raises RuntimeError naming the offending input(s) on mismatch.
    """
    # `is None` instead of `== None`: identity check is the correct idiom
    # and avoids invoking a custom __eq__.
    if pathout is None:
        tokenized = tok.tokenize_one_level(pathin)
    else:
        tokenized = tok.tokenize(pathin, pathout)
    if not hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
        # Build the message once instead of duplicating the raise in
        # each branch (messages are byte-identical to the originals).
        if pathout is None:
            inputs = pathin
        else:
            inputs = pathin + ", " + pathout
        raise RuntimeError('test_tokenized failed with input: ' + inputs)
# Tokenizer tests: exercise test_tokenized() with multichar/UTF-8 input,
# skip symbols, and the epsilon multicharacter symbol.  Rebinds the global
# `tok` between test groups, so statement order matters.
test_tokenized(tok, 'föbär', 'foofö', '[b:f ä:o r:o]')
test_tokenized(tok, 'fööbär', 'föbar', '[fööbär:b 0:a 0:r]')
test_tokenized(tok, 'föfööfö', 'föföföföö', '[föö]')
# Fresh tokenizer that skips '?' and space characters.
tok = hfst.HfstTokenizer()
tok.add_skip_symbol('?')
tok.add_skip_symbol(' ')
test_tokenized(tok, 'How is this tokenized?', None, '[H o w i s t h i s t o k e n i z e d]')
# A multicharacter skip symbol removes the whole substring ' is '.
tok.add_skip_symbol(' is ')
test_tokenized(tok, 'How is this tokenized?', None, '[H o w t h i s t o k e n i z e d]')
# Epsilon must be registered as a multichar symbol to be recognized in input.
tok = hfst.HfstTokenizer()
tok.add_multichar_symbol(hfst.EPSILON) # TODO: should this be included by default???
test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')
# tokenized_fst also accepts an explicit list of (input, output) pairs.
if not hfst.tokenized_fst([(hfst.EPSILON,'b'),('f','a'),('o','a'),('o','r')]).compare(hfst.regex('[0:b f:a o:a o:r]')):
    raise RuntimeError(get_linenumber())
# Is this ok???
# Special symbols quoted inside a regex string must parse to their
# regex equivalents (0, ?, ?:?).
if not hfst.regex('"' + hfst.EPSILON + '"').compare(hfst.regex('[0]')):
    raise RuntimeError(get_linenumber())
if not hfst.regex('"' + hfst.IDENTITY + '"').compare(hfst.regex('[?]')):
    raise RuntimeError(get_linenumber())
if not hfst.regex('"' + hfst.UNKNOWN + '":"' + hfst.UNKNOWN + '"').compare(hfst.regex('[?:?]')):
    raise RuntimeError(get_linenumber())
# other python functions
if not hfst.empty_fst().compare(hfst.regex('[0-0]')):
    raise RuntimeError(get_linenumber())
if not hfst.epsilon_fst().compare(hfst.regex('[0]')):
    raise RuntimeError(get_linenumber())
# NOTE(review): this bare `raise` appears to be the body of a conditional
# truncated from the preceding (not visible) context — confirm against the
# full original script before relying on this statement standing alone.
raise RuntimeError('')
# hfst.fst() accepts a tuple/list of strings or (string, weight) pairs;
# the comparisons below show each element becomes one disjuncted path.
if not hfst.fst(('foo', ('bar',5.0))).compare(hfst.regex('{foo}|{bar}::5.0')):
    raise RuntimeError('')
if not hfst.fst(('foo', ('bar',5.0), 'baz', 'Foo', ('Bar',2.4))).compare(hfst.regex('{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4')):
    raise RuntimeError('')
if not hfst.fst([('foo',-1), ('bar',0), ('baz',3.5)]).compare(hfst.regex('{foo}::-1|{bar}|{baz}::3.5')):
    raise RuntimeError('')
# A dictionary
if not hfst.fst({'foo':'foo', 'bar':('foo',1.4), 'baz':(('foo',-1),'BAZ')}).compare(hfst.regex('{foo}|{bar}:{foo}::1.4|{baz}:{foo}::-1|{baz}:{BAZ}')):
    raise RuntimeError('')
# tokenized_fst
tok = hfst.HfstTokenizer()
tok.add_multichar_symbol('foo')
tok.add_multichar_symbol('bar')
tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))
if not tr.compare(hfst.regex('[foo:foo bar:b 0:a 0:z]')):
    raise RuntimeError('')
# HfstBasicTransducer
# Create an empty transducer
# The transducer has initially one start state (number zero)
# that is not final
fsm = hfst.HfstBasicTransducer()
# Add two states to the transducer
fsm.add_state(1)
fsm.add_state(2)
# Create a transition [foo:bar] leading to state 1 with weight 0.1
tr = hfst.HfstBasicTransition(1, 'foo', 'bar', 0.1)
# and add it to state zero
fsm.add_transition(0, tr)
weight = float(line_and_weight[1]) line = line_and_weight[0] tr = None if not pairstrings: input_and_output = line.split(':') if len(input_and_output) == 2: input_and_output[0] = input_and_output[0].strip().rstrip() input_and_output[1] = input_and_output[1].strip().rstrip() if not has_spaces: tr = hfst.fst(input_and_output[0]) tr2 = hfst.fst(input_and_output[1]) tr.cross_product(tr2) else: inputstr = input_and_output[0].split(' ') outputstr = input_and_output[1].split(' ') tr = hfst.tokenized_fst(inputstr) tr2 = hfst.tokenized_fst(outputstr) tr.cross_product(tr2) else: if not has_spaces: tr = hfst.fst(line) else: line = line.split(' ') tr = hfst.tokenized_fst(line) elif has_spaces: line = line.split(' ') symbols = [] for pair in line: symbol_pair = pair.split(':') if len(symbol_pair) == 1: # HFST 4.0: accept also single symbols (identities) symbol_pair.append(symbol_pair[0])