def fsa(arg):
    """
    Build an automaton (identity transducer) accepting one or more paths.

    Parameters
    ----------
    * `arg` :
        A single path, a single weighted path, or a list/tuple mixing both.

    Accepted forms:

    One unweighted identity path:

    'foo'  ->  [f o o]

    One weighted path, given as a tuple of string and number:

    ('foo',1.4)
    ('bar',-3)
    ('baz',0)

    Several paths, given as a list or tuple of paths and/or weighted paths:

    ['foo', 'bar']
    ('foo', ('bar',5.0))
    ('foo', ('bar',5.0), 'baz', 'Foo', ('Bar',2.4))
    [('foo',-1), ('bar',0), ('baz',3.5)]

    Every string is passed through `_check_word` and split into symbols by a
    default-constructed HfstTokenizer. Unweighted paths get weight 0.

    Raises RuntimeError if *arg*, or an element of a list/tuple *arg*, is not
    one of the forms above.
    """
    tokenizer = HfstTokenizer()
    basic = HfstBasicTransducer()

    def add_path(text, path_weight):
        # Disjunct one tokenized string into the automaton under construction.
        basic.disjunct(tokenizer.tokenize(_check_word(text)), path_weight)

    if isinstance(arg, str):
        add_path(arg, 0)
    elif _is_weighted_word(arg):
        # Checked before the generic tuple/list branch so that a single
        # (string, weight) tuple is not treated as a collection of paths.
        add_path(arg[0], arg[1])
    elif isinstance(arg, (tuple, list)):
        for item in arg:
            if _is_weighted_word(item):
                add_path(item[0], item[1])
            elif isinstance(item, str):
                add_path(item, 0)
            else:
                raise RuntimeError('Tuple/list element not a string or tuple of string and weight.')
    else:
        raise RuntimeError('Not a string or tuple/list of strings.')

    return HfstTransducer(basic, get_default_fst_type())
def read_att_string(att):
    """
    Build a transducer from the AT&T-format description in the string *att*.

    *att* is split on newline characters and each piece is handed to
    `_parse_att_line`. If a piece is rejected, NotValidAttFormatException is
    raised with that piece and its 1-based line number.
    """
    basic = HfstBasicTransducer()
    for lineno, row in enumerate(att.split('\n'), start=1):
        if _parse_att_line(row, basic):
            continue
        raise hfst.exceptions.NotValidAttFormatException(row, "", lineno)
    return HfstTransducer(basic, get_default_fst_type())
def fsa(arg):
    """
    Build an automaton (identity transducer) accepting one or more paths.

    Parameters
    ----------
    * `arg` :
        A single path, a single weighted path, or a list/tuple mixing both.

    Accepted forms:

    One unweighted identity path:

    'foo'  ->  [f o o]

    One weighted path, given as a tuple of string and number:

    ('foo',1.4)
    ('bar',-3)
    ('baz',0)

    Several paths, given as a list or tuple of paths and/or weighted paths:

    ['foo', 'bar']
    ('foo', ('bar',5.0))
    ('foo', ('bar',5.0), 'baz', 'Foo', ('Bar',2.4))
    [('foo',-1), ('bar',0), ('baz',3.5)]

    Every string is passed through `_check_word` and split into symbols by a
    default-constructed HfstTokenizer. Unweighted paths get weight 0.

    Raises RuntimeError if *arg*, or an element of a list/tuple *arg*, is not
    one of the forms above.
    """
    tokenizer = HfstTokenizer()
    basic = HfstBasicTransducer()

    def add_path(text, path_weight):
        # Disjunct one tokenized string into the automaton under construction.
        basic.disjunct(tokenizer.tokenize(_check_word(text)), path_weight)

    if isinstance(arg, str):
        add_path(arg, 0)
    elif _is_weighted_word(arg):
        # Checked before the generic tuple/list branch so that a single
        # (string, weight) tuple is not treated as a collection of paths.
        add_path(arg[0], arg[1])
    elif isinstance(arg, (tuple, list)):
        for item in arg:
            if _is_weighted_word(item):
                add_path(item[0], item[1])
            elif isinstance(item, str):
                add_path(item, 0)
            else:
                raise RuntimeError('Tuple/list element not a string or tuple of string and weight.')
    else:
        raise RuntimeError('Not a string or tuple/list of strings.')

    return HfstTransducer(basic, get_default_fst_type())
def read_prolog_transducer(f, linecount=[0]):
    """
    Create a transducer as defined in prolog format in file *f*.

    Parameters
    ----------
    * `f` :
        An object whose `readline()` returns text lines and "" at end of
        file.
    * `linecount` :
        A one-element list holding the number of lines already consumed from
        *f*. It is used to report absolute line numbers in exceptions and is
        incremented in place by the number of lines this call consumed.
        NOTE: the default list is created once and shared by every call that
        omits this argument, so pass a fresh ``[0]`` for each new file.

    Returns the transducer that ends at the first empty line (prolog
    separator) or at end of file, with its name copied from the parsed
    network.

    Raises EndOfStreamException if the file ends before a network line is
    found, and NotValidPrologFormatException if a line cannot be parsed.
    """
    linecount_ = 0
    fsm = HfstBasicTransducer()

    # Find the network header line, skipping blank separators and comments.
    while True:
        line = f.readline()
        linecount_ = linecount_ + 1
        if line == "":
            raise hfst.exceptions.EndOfStreamException(
                "", "", linecount[0] + linecount_)
        line = line.rstrip()
        # Emptiness must be tested before indexing: previously a blank line
        # fell through to line[0] and raised IndexError instead of being
        # skipped as an extra prolog separator.
        if line == "" or line[0] == '#':
            continue
        break

    if not libhfst.parse_prolog_network_line(line, fsm):
        raise hfst.exceptions.NotValidPrologFormatException(
            line, "", linecount[0] + linecount_)

    # Read arc, final-state and symbol lines until a separator or end of file.
    while True:
        line = f.readline()
        if line == "":
            break  # end of file; not counted as a line
        line = line.rstrip()
        linecount_ = linecount_ + 1
        if line == "":
            break  # prolog separator ends this transducer
        recognized = (libhfst.parse_prolog_arc_line(line, fsm)
                      or libhfst.parse_prolog_final_line(line, fsm)
                      or libhfst.parse_prolog_symbol_line(line, fsm))
        if not recognized:
            raise hfst.exceptions.NotValidPrologFormatException(
                line, "", linecount[0] + linecount_)

    retval = HfstTransducer(fsm, get_default_fst_type())
    retval.set_name(fsm.name)
    linecount[0] = linecount[0] + linecount_
    return retval
def read_att_input():
    """
    Build a transducer from AT&T-format lines typed by the user.

    Lines are read with `input()` and stripped of trailing whitespace. The
    first line that is empty after stripping ends the input. Each non-empty
    line is handed to `_parse_att_line`. If one is rejected,
    NotValidAttFormatException is raised with that line and its 1-based
    position among the lines entered.
    """
    basic = HfstBasicTransducer()
    # iter(callable, sentinel) keeps calling input() until it yields "".
    entered = iter(lambda: input().rstrip(), "")
    for lineno, row in enumerate(entered, start=1):
        if not _parse_att_line(row, basic):
            raise hfst.exceptions.NotValidAttFormatException(row, "", lineno)
    return HfstTransducer(basic, get_default_fst_type())
def read_att_transducer(f, epsilonstr=EPSILON, linecount=[0]):
    """
    Build a transducer from the AT&T-format description in file *f*.

    Parameters
    ----------
    * `f` :
        An object whose `readline()` returns lines and "" at end of file.
    * `epsilonstr` :
        How epsilons are represented; forwarded to `_parse_att_line`.
    * `linecount` :
        A one-element list tracking the current line in the file. It is used
        to compute the line number reported in NotValidAttFormatException and
        is incremented in place by this call.

    Reading stops at a line whose first character is '-' or at end of file.
    EndOfStreamException is raised if the file is already at its end when
    the call starts.
    """
    consumed = 0
    basic = HfstBasicTransducer()

    # iter(f.readline, "") yields lines until readline() returns "".
    for row in iter(f.readline, ""):
        consumed += 1
        if row[0] == '-':
            break  # a line starting with '-' ends this transducer
        if not _parse_att_line(row, basic, epsilonstr):
            raise hfst.exceptions.NotValidAttFormatException(row, "", linecount[0] + consumed)
    else:
        # Reached only when the loop ran out of lines, i.e. end of file.
        if consumed == 0:
            raise hfst.exceptions.EndOfStreamException("","",0)
        consumed += 1

    linecount[0] = linecount[0] + consumed
    return HfstTransducer(basic, get_default_fst_type())
def tokenized_fst(arg, weight=0):
    """
    Get a transducer that recognizes the concatenation of symbols or symbol
    pairs in *arg*.

    Parameters
    ----------
    * `arg` :
        A list or tuple of tokens that form the path to be recognized. Each
        token is a symbol string, a two-element list/tuple
        (input symbol, output symbol), or a one-element list/tuple holding a
        single symbol that is used on both sides.
    * `weight` :
        The final weight of the last state of the path. Every transition is
        added with weight 0.

    Example

       import hfst
       tok = hfst.HfstTokenizer()
       tok.add_multichar_symbol('foo')
       tok.add_multichar_symbol('bar')
       tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))

    will create the transducer [foo:foo bar:b 0:a 0:z].

    Raises RuntimeError if *arg* is not a list or tuple, or if a token is not
    a symbol or symbol pair.
    """
    if not isinstance(arg, (list, tuple)):
        raise RuntimeError('Argument must be a list or a tuple')

    retval = HfstBasicTransducer()
    state = 0
    for token in arg:
        if isinstance(token, str):
            isymbol = osymbol = token
        elif isinstance(token, (list, tuple)) and len(token) == 2:
            isymbol, osymbol = token[0], token[1]
        elif isinstance(token, (list, tuple)) and len(token) == 1:
            # Unwrap the single symbol. The container itself used to be
            # passed as the transition symbol.
            isymbol = osymbol = token[0]
        else:
            # Unsupported token types are rejected explicitly rather than
            # being left out of the path.
            raise RuntimeError('Symbol or symbol pair must be given.')
        new_state = retval.add_state()
        retval.add_transition(state, new_state, isymbol, osymbol, 0)
        state = new_state

    retval.set_final_weight(state, weight)
    return HfstTransducer(retval, get_default_fst_type())
def tokenized_fst(arg, weight=0):
    """
    Get a transducer that recognizes the concatenation of symbols or symbol
    pairs in *arg*.

    Parameters
    ----------
    * `arg` :
        A list or tuple of tokens that form the path to be recognized. Each
        token is a symbol string, a two-element list/tuple
        (input symbol, output symbol), or a one-element list/tuple holding a
        single symbol that is used on both sides.
    * `weight` :
        The final weight of the last state of the path. Every transition is
        added with weight 0.

    Example

       import hfst
       tok = hfst.HfstTokenizer()
       tok.add_multichar_symbol('foo')
       tok.add_multichar_symbol('bar')
       tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz'))

    will create the transducer [foo:foo bar:b 0:a 0:z].

    Raises RuntimeError if *arg* is not a list or tuple, or if a token is not
    a symbol or symbol pair.
    """
    if not isinstance(arg, (list, tuple)):
        raise RuntimeError('Argument must be a list or a tuple')

    retval = HfstBasicTransducer()
    state = 0
    for token in arg:
        if isinstance(token, str):
            isymbol = osymbol = token
        elif isinstance(token, (list, tuple)) and len(token) == 2:
            isymbol, osymbol = token[0], token[1]
        elif isinstance(token, (list, tuple)) and len(token) == 1:
            # Unwrap the single symbol. The container itself used to be
            # passed as the transition symbol.
            isymbol = osymbol = token[0]
        else:
            # Unsupported token types are rejected explicitly rather than
            # being left out of the path.
            raise RuntimeError('Symbol or symbol pair must be given.')
        new_state = retval.add_state()
        retval.add_transition(state, new_state, isymbol, osymbol, 0)
        state = new_state

    retval.set_final_weight(state, weight)
    return HfstTransducer(retval, get_default_fst_type())