def symbol_pair_to_fst(insym, outsym):
    """Return an FST which accepts exactly the one pair string 'insym:outsym'

    insym -- the input side symbol of the pair
    outsym -- the output side symbol of the pair

    Returns a transducer built from a single path that consists of the
    one symbol pair (insym, outsym) with weight 0.
    """
    bfst = hfst.HfstBasicTransducer()
    # A path is a tuple of (input, output) pairs.  The trailing comma is
    # essential: without it the outer parentheses are redundant and the
    # value is just the flat tuple (insym, outsym).
    string_pair_path = ((insym, outsym),)
    bfst.disjunct(string_pair_path, 0)
    # Convert the basic transducer into an ordinary HFST transducer.
    fst = hfst.HfstTransducer(bfst)
    return fst
def symbol_to_fsa(sym):
    """Return a FSA which accepts the one letter string 'sym'

    The symbol 'sym' may be e.g. a composed Unicode grapheme, i.e. a
    string of two or more Unicode characters.

    sym -- the symbol that the automaton accepts

    Returns a transducer built from a single path that consists of the
    one identity pair (sym, sym) with weight 0.
    """
    bfsa = hfst.HfstBasicTransducer()
    # A path is a tuple of (input, output) pairs.  The trailing comma is
    # essential: without it the outer parentheses are redundant and the
    # value is just the flat tuple (sym, sym), not a one-pair path.
    string_pair_path = ((sym, sym),)
    bfsa.disjunct(string_pair_path, 0)
    # Convert the basic transducer into an ordinary HFST transducer.
    fsa = hfst.HfstTransducer(bfsa)
    return fsa
def main(): """Invoke a simple CLI analyser.""" argp = ArgumentParser() argp.add_argument('-a', '--analyser', metavar='FSA', required=True, help="Path to FSA analyser") argp.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data in CONLLU") options = argp.parse_args() analyser = load_analyser(options.analyser) sentence = hfst.epsilon_fst() if not options.infile: options.infile = stdin for line in options.infile: line = line.strip() if not line or line == '': print("@SENTENCE_SEPARATOR@") elif line.startswith('#'): print(line) else: refs = line.strip().split('\t') anals = analyse(analyser, refs[1]) if anals: lattice = hfst.empty_fst() for anal in anals: surf = refs[1] deep = anal[0] weight = anal[1] print(surf, deep) bleh = hfst.fst({surf: deep}) lattice.disjunct(bleh) sentence.concatenate(lattice) else: surf = refs[1] deep = refs[1] + "|NOUN|Case=Nom|Number=Sing|Guess=Yes|nsubj" print(surf, deep) bleh = hfst.fst({surf: deep}) sentence.concatenate(bleh) print("@TOKEN SEPARATOR@") foo = hfst.fst("@TOKEN_SEPARATOR@") sentence.concatenate(foo) exit(0)
def align_two_words(in_word, out_word, aligner_fst, zero, number):
    """Return the best alignments of two words as raw paths

    in_word -- the word on the input side
    out_word -- the word on the output side
    aligner_fst -- transducer that is composed between the two words
    zero -- symbol which is freely inserted into both words
    number -- argument given to n_best() to limit the result

    Returns the value of extract_paths(output='raw') of the composed
    and pruned transducer.
    """
    def with_free_zeros(word):
        # Automaton for the word where zero:zero may occur anywhere.
        fsa = hfst.fst(word)
        fsa.insert_freely((zero, zero))
        fsa.minimize()
        return fsa

    padded_in = with_free_zeros(in_word)
    padded_out = with_free_zeros(out_word)

    # in_word .o. aligner .o. out_word, pruned with n_best().
    candidates = hfst.HfstTransducer(padded_in)
    candidates.compose(aligner_fst)
    candidates.compose(padded_out)
    candidates.n_best(number)
    candidates.minimize()

    raw_paths = candidates.extract_paths(output='raw')
    if cfg.verbosity >= 10:
        print("raw_paths:", raw_paths)
    return raw_paths
def shuffle_with_zeros(string, target_length):
    """Return a fsa where zeros are inserted in all possible ways

    string -- the string to which zero symbols are inserted
    target_length -- how long the strings after insertions must be

    Returns a fsa which accepts the string shuffled with as many 'Ø'
    symbols as len(string) falls short of target_length.  If the string
    is not shorter than target_length, no zeros are added.
    """
    result_fsa = hfst.fst(string)
    missing = target_length - len(string)
    if missing > 0:
        # Regular expression of `missing` zero symbols separated by spaces.
        zeros_regex = ' '.join(['Ø'] * missing)
        result_fsa.shuffle(hfst.regex(zeros_regex))
    result_fsa.minimize()
    result_fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(result_fsa)
    return result_fsa
def to_fst(self) -> hfst.HfstTransducer:
    """Return the result of ``hfst.fst`` applied to ``self.symstr``."""
    transducer = hfst.fst(self.symstr)
    return transducer
def test_fst(input, result):
    """Check that hfst.fst(input) equals the transducer of regex `result`.

    input -- argument passed to hfst.fst(); callers pass strings as well
        as lists, tuples and dicts
    result -- regular expression compiled with hfst.regex()

    Raises RuntimeError if the two transducers do not compare equal.
    """
    tr1_ = hfst.fst(input)
    tr2_ = hfst.regex(result)
    if not tr1_.compare(tr2_):
        # str() is required: `input` is often a list, tuple or dict, and
        # concatenating it directly to a str would raise TypeError and
        # hide the real failure.
        raise RuntimeError('test_fst failed with input: ' + str(input))
f.close()

# Create automaton: every case pairs an argument of hfst.fst() with the
# regular expression whose transducer it must be equal to.
_automaton_cases = [
    # unweighted
    ('foobar', '[f o o b a r]'),
    (['foobar'], '[f o o b a r]'),
    (['foobar', 'foobaz'], '[f o o b a [r|z]]'),
    # with weights
    (('foobar', 0.3), '[f o o b a r]::0.3'),
    ([('foobar', 0.5)], '[f o o b a r]::0.5'),
    (['foobar', ('foobaz', -2)], '[ f o o b a [r|[z::-2]] ]'),
    # Special inputs
    ('*** FOO ***', '{*** FOO ***}'),
]
for _fst_arg, _expected_regex in _automaton_cases:
    test_fst(_fst_arg, _expected_regex)

# The empty string must yield the epsilon transducer.
foo = hfst.fst('')
eps = hfst.epsilon_fst()
assert foo.compare(eps)

# Create transducer: dictionaries map an input string to its output(s).
_transducer_cases = [
    # unweighted
    ({'foobar': 'foobaz'}, '[f o o b a r:z]'),
    ({'foobar': ['foobar', 'foobaz']}, '[f o o b a [r|r:z]]'),
    ({'foobar': ('foobar', 'foobaz')}, '[f o o b a [r|r:z]]'),
    ({'foobar': 'foobaz',
      'FOOBAR': ('foobar', 'FOOBAR'),
      'Foobar': ['Foo', 'bar', 'Foobar']},
     '[f o o b a r:z] | [F O O B A R] | [F:f O:o O:o B:b A:a R:r] | [F o o b:0 a:0 r:0] | [F:b o:a o:r b:0 a:0 r:0] | [F o o b a r]'),
]
for _fst_arg, _expected_regex in _transducer_cases:
    test_fst(_fst_arg, _expected_regex)
# TransducerTypeMismatchException:
# Combining transducers of two different implementation types must raise.
if hfst.ImplementationType.FOMA_TYPE in types:
    hfst.set_default_fst_type(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
    tr1 = hfst.regex('foo')
    tr2 = hfst.regex('bar')
    tr2.convert(hfst.ImplementationType.FOMA_TYPE)
    try:
        tr1.disjunct(tr2)
    except hfst.exceptions.TransducerTypeMismatchException:
        print('The implementation types of transducers must be the same.')
    # Switch back to the type that is currently being tested.
    hfst.set_default_fst_type(type)

# fst
# Each entry pairs an argument of hfst.fst() with the regular expression
# whose transducer it must be equal to.
_fst_checks = (
    # One unweighted identity path:
    ('foo', '{foo}'),
    # Weighted path: a tuple of string and number, e.g.
    (('foo', 1.4), '{foo}::1.4'),
    (('bar', -3), '{bar}::-3'),
    (('baz', 0), '{baz}'),
    # Several paths: a list or a tuple of paths and/or weighted paths, e.g.
    (['foo', 'bar'], '{foo}|{bar}'),
    (('foo', ('bar', 5.0)), '{foo}|{bar}::5.0'),
    (('foo', ('bar', 5.0), 'baz', 'Foo', ('Bar', 2.4)),
     '{foo}|{bar}::5.0|{baz}|{Foo}|{Bar}::2.4'),
)
for _paths, _regex in _fst_checks:
    if not hfst.fst(_paths).compare(hfst.regex(_regex)):
        raise RuntimeError('')
for line in istr: line = line.rstrip() weight = None line_and_weight = line.split('\t') if len(line_and_weight) == 2: weight = float(line_and_weight[1]) line = line_and_weight[0] tr = None if not pairstrings: input_and_output = line.split(':') if len(input_and_output) == 2: input_and_output[0] = input_and_output[0].strip().rstrip() input_and_output[1] = input_and_output[1].strip().rstrip() if not has_spaces: tr = hfst.fst(input_and_output[0]) tr2 = hfst.fst(input_and_output[1]) tr.cross_product(tr2) else: inputstr = input_and_output[0].split(' ') outputstr = input_and_output[1].split(' ') tr = hfst.tokenized_fst(inputstr) tr2 = hfst.tokenized_fst(outputstr) tr.cross_product(tr2) else: if not has_spaces: tr = hfst.fst(line) else: line = line.split(' ') tr = hfst.tokenized_fst(line) elif has_spaces:
f.close() # Create automaton: # unweighted test_fst('foobar', '[f o o b a r]') test_fst(['foobar'], '[f o o b a r]') test_fst(['foobar', 'foobaz'], '[f o o b a [r|z]]') # with weights test_fst(('foobar', 0.3), '[f o o b a r]::0.3') test_fst([('foobar', 0.5)], '[f o o b a r]::0.5') test_fst(['foobar', ('foobaz', -2)], '[ f o o b a [r|[z::-2]] ]') # Special inputs test_fst('*** FOO ***', '{*** FOO ***}') try: foo = hfst.fst('') raise RuntimeError(get_linenumber()) except RuntimeError as e: if not e.__str__() == 'Empty word.': raise RuntimeError(get_linenumber()) # Create transducer: # unweighted test_fst({'foobar': 'foobaz'}, '[f o o b a r:z]') test_fst({'foobar': ['foobar', 'foobaz']}, '[f o o b a [r|r:z]]') test_fst({'foobar': ('foobar', 'foobaz')}, '[f o o b a [r|r:z]]') test_fst( { 'foobar': 'foobaz', 'FOOBAR': ('foobar', 'FOOBAR'), 'Foobar': ['Foo', 'bar', 'Foobar']
import sys
import hfst

# Read the alignment transducer from "chardist.fst" and close the stream
# right away; it is not needed after the transducer has been read.
algfile = hfst.HfstInputStream("chardist.fst")
align = algfile.read()
algfile.close()

# Each input line is either "word1:word2" or anything else, in which case
# the first colon-separated field is aligned with itself.
for line in sys.stdin:
    lst = line.strip().split(sep=":")
    if len(lst) == 2:
        f1, f2 = lst
    else:
        f1, f2 = lst[0], lst[0]
    # First word with the zero symbol Ø:Ø freely inserted.
    w1 = hfst.fst(f1)
    w1.insert_freely(("Ø", "Ø"))
    w1.minimize()
    # Second word with the zero symbol Ø:Ø freely inserted.
    w2 = hfst.fst(f2)
    w2.insert_freely(("Ø", "Ø"))
    w2.minimize()
    # word1 .o. align .o. word2, pruned with n_best(1).
    w3 = hfst.HfstTransducer(w1)
    w3.compose(align)
    w3.compose(w2)
    w3.n_best(1)
    w3.minimize()
    paths = w3.extract_paths(output='text')
    print(paths.strip())