def seq_to_transducer(alignment, weight=0.0, type=None, alphabet=None):
    """Build a linear transducer that realises a sequence of symbol pairs.

    One new state is added per pair of ``alignment`` and the states are
    chained starting from state 0.  A pair ``(IDENTITY, IDENTITY)`` is
    expanded into "one or more arbitrary symbols": besides the identity
    transition itself, an identity transition is added for every alphabet
    symbol that does not start with ``'@_'``, both into the new state and
    as a self-loop on it.

    Args:
        alignment: Iterable of ``(input_symbol, output_symbol)`` pairs.
        weight: Final weight assigned to the last state of the chain.
        type: Implementation type passed to ``hfst.HfstTransducer``.  When
            ``None`` it is read from ``shared.config['FST']`` (option
            ``transducer_type``).
        alphabet: Optional extra symbols to register in the transducer's
            alphabet in addition to those occurring in ``alignment``.

    Returns:
        The resulting ``hfst.HfstTransducer``.
    """
    if type is None:
        type = shared.config['FST'].getint('transducer_type')
    tr = hfst.HfstBasicTransducer()
    if alphabet is None:
        alphabet = ()
    # Collect the symbols in one linear pass; flattening the pairs with
    # `sum(alignment, ())` copies the growing tuple once per pair.
    symbols = set(alphabet)
    symbols.update(sym for pair in alignment for sym in pair)
    alphabet = tuple(sorted(symbols))
    tr.add_symbols_to_alphabet(alphabet)
    last_state_id = 0
    for (x, y) in alignment:
        state_id = tr.add_state()
        if (x, y) == (hfst.IDENTITY, hfst.IDENTITY):
            tr.add_transition(
                last_state_id,
                hfst.HfstBasicTransition(
                    state_id, hfst.IDENTITY, hfst.IDENTITY, 0.0))
            tr.add_transition(
                state_id,
                hfst.HfstBasicTransition(
                    state_id, hfst.IDENTITY, hfst.IDENTITY, 0.0))
            # Symbols starting with '@_' are skipped, so only the remaining
            # alphabet symbols get explicit identity transitions here.
            for a in tr.get_alphabet():
                if not a.startswith('@_'):
                    tr.add_transition(
                        last_state_id,
                        hfst.HfstBasicTransition(state_id, a, a, 0.0))
                    tr.add_transition(
                        state_id,
                        hfst.HfstBasicTransition(state_id, a, a, 0.0))
        else:
            tr.add_transition(
                last_state_id,
                hfst.HfstBasicTransition(state_id, x, y, 0.0))
        last_state_id = state_id
    tr.set_final_weight(last_state_id, weight)
    return hfst.HfstTransducer(tr, type)
def add_identity_loop(tr, alphabet, state):
    """Attach a "one or more ordinary symbols" segment after ``state``.

    For every symbol of ``alphabet`` other than epsilon, identity and
    unknown, an identity transition (weight 0.0) is added from ``state`` to
    ``state + 1`` and as a self-loop on ``state + 1``.

    Args:
        tr: Basic transducer that receives the transitions.
        alphabet: Iterable of symbols.
        state: Number of the state the segment starts from.

    Returns:
        ``state + 1``, the number of the state the segment ends in.
    """
    target = state + 1
    for symbol in alphabet:
        if symbol in (hfst.EPSILON, hfst.IDENTITY, hfst.UNKNOWN):
            continue
        # First the entering arc, then the self-loop on the target state.
        for source in (state, target):
            tr.add_transition(
                source,
                hfst.HfstBasicTransition(target, symbol, symbol, 0.0))
    return target
def tag_absorber(alphabet):
    """Build a transducer that copies symbols and maps tags to epsilon.

    Alphabet entries matching ``shared.compiled_patterns['symbol']`` loop
    on state 0 unchanged.  Entries matching
    ``shared.compiled_patterns['tag']`` are rewritten to epsilon on arcs
    leading from state 0 to state 1 and looping on state 1.  Both states
    are final with weight 0.0.

    Args:
        alphabet: Iterable of symbol strings.

    Returns:
        The resulting ``hfst.HfstTransducer``.
    """
    basic = hfst.HfstBasicTransducer()
    for entry in alphabet:
        if shared.compiled_patterns['symbol'].match(entry):
            basic.add_transition(
                0, hfst.HfstBasicTransition(0, entry, entry, 0.0))
            continue
        if not shared.compiled_patterns['tag'].match(entry):
            # Neither a symbol nor a tag: no transition for it.
            continue
        for source in (0, 1):
            basic.add_transition(
                source,
                hfst.HfstBasicTransition(1, entry, hfst.EPSILON, 0.0))
    for final_state in (0, 1):
        basic.set_final_weight(final_state, 0.0)
    return hfst.HfstTransducer(basic)
def add_deletion_chain(tr, alphabet, state, length):
    """Append a chain that marks up to ``length`` symbols as deleted.

    The chain occupies states ``state`` .. ``state + length + 1``:

    * ``state`` -> ``state + 1`` maps epsilon to ``deletion_slot_symbol``;
    * each of the next ``length`` steps maps any alphabet symbol (except
      epsilon, identity and unknown) to ``deletion_symbol``;
    * every state of the chain except the last has an epsilon transition
      straight to the last state.

    ``deletion_slot_symbol`` and ``deletion_symbol`` are not parameters;
    they are looked up in the surrounding scope, which is not visible in
    this block.

    Args:
        tr: Basic transducer that receives the transitions.
        alphabet: Iterable of symbols.
        state: Number of the state the chain starts from.
        length: Number of symbol-deleting steps in the chain.

    Returns:
        The number of the last state of the chain.
    """
    tr.add_transition(
        state,
        hfst.HfstBasicTransition(
            state + 1, hfst.EPSILON, deletion_slot_symbol, 0.0))
    for step in range(1, length + 1):
        source = state + step
        for symbol in alphabet:
            if symbol in (hfst.EPSILON, hfst.IDENTITY, hfst.UNKNOWN):
                continue
            tr.add_transition(
                source,
                hfst.HfstBasicTransition(
                    source + 1, symbol, deletion_symbol, 0.0))
    chain_end = state + length + 1
    # Allow leaving the chain early from any of its states.
    for offset in range(length + 1):
        tr.add_transition(
            state + offset,
            hfst.HfstBasicTransition(
                chain_end, hfst.EPSILON, hfst.EPSILON, 0.0))
    return chain_end
def tag_acceptor(tag, alphabet):
    """Build a transducer accepting any symbols followed by ``tag``.

    The first part loops on a single final state over every alphabet entry
    matching ``shared.compiled_patterns['symbol']``.  It is then
    concatenated with the transducer that ``seq_to_transducer`` builds for
    the identity pairs of ``tag``.

    Args:
        tag: Sequence of tag symbols.
        alphabet: Iterable of symbol strings.

    Returns:
        The concatenated ``hfst.HfstTransducer``.
    """
    prefix = hfst.HfstBasicTransducer()
    for entry in alphabet:
        if not shared.compiled_patterns['symbol'].match(entry):
            continue
        prefix.add_transition(
            0, hfst.HfstBasicTransition(0, entry, entry, 0.0))
    prefix.set_final_weight(0, 0.0)
    acceptor = hfst.HfstTransducer(prefix)
    tag_pairs = tuple(zip(tag, tag))
    tag_part = seq_to_transducer(tag_pairs)
    acceptor.concatenate(tag_part)
    return acceptor
def rootgen_transducer(rootdist):
    """Create a one-state weighted automaton for word generation.

    The weights are taken from ``rootdist.features[0].log_probs``.  The
    entry keyed ``('#',)`` becomes the final weight of state 0; every other
    entry becomes an identity self-loop on state 0 labelled with the first
    element of its key and carrying the entry's weight.

    Args:
        rootdist: Object exposing ``features[0].log_probs`` as a mapping
            from 1-tuples of symbols to weights.

    Returns:
        The resulting ``hfst.HfstTransducer``.

    Raises:
        NotImplementedError: If the ``rootdist_n`` option in
            ``shared.config['Features']`` is not 1.
    """
    if shared.config['Features'].getint('rootdist_n') != 1:
        raise NotImplementedError('Not implemented for rootdist_n != 1')
    log_probs = rootdist.features[0].log_probs
    generator = hfst.HfstBasicTransducer()
    generator.set_final_weight(0, log_probs[('#',)])
    for key, cost in log_probs.items():
        if key == ('#',):
            # Already used as the final weight above.
            continue
        generator.add_transition(
            0, hfst.HfstBasicTransition(0, key[0], key[0], cost))
    return hfst.HfstTransducer(generator)
def to_hfst(self) -> hfst.HfstTransducer:
    """Convert the frequency-annotated states into a weighted transducer.

    Each transition becomes an identity arc whose weight is the negative
    natural logarithm of its frequency relative to the total frequency of
    its source state.  A state with a positive ``final_freq`` is made final
    with the analogous weight.

    Returns:
        The resulting ``hfst.HfstTransducer``.
    """
    basic = hfst.HfstBasicTransducer()
    for node in self.states.values():
        denominator = node.get_total_freq()
        for edge in node.transitions.values():
            cost = -math.log(edge.freq / denominator)
            arc = hfst.HfstBasicTransition(
                edge.target_state_id, edge.symbol, edge.symbol, cost)
            basic.add_transition(node.id, arc)
        if node.final_freq > 0:
            basic.set_final_weight(
                node.id, -math.log(node.final_freq / denominator))
    return hfst.HfstTransducer(basic)
def delfilter(alphabet, length, deletion_symbol='@_DEL_@',
              deletion_slot_symbol='@_DELSLOT_@'):
    """Build a transducer that removes deletion symbols while counting them.

    Printable symbols and ``deletion_slot_symbol`` are copied to the output;
    ``deletion_symbol`` is rewritten to epsilon.  The state number encodes
    the running balance "printable symbols read minus deletion symbols
    read":

    * states ``0 .. length`` stand for balances ``0 .. length`` and are
      all final;
    * states ``length + 1`` onwards stand for balances ``-1, -2, ...`` and
      are never made final.

    Args:
        alphabet: Iterable of symbols.
        length: Bound on the balance tracked in either direction.
        deletion_symbol: Symbol that is rewritten to epsilon.
        deletion_slot_symbol: Symbol that is copied by self-loops.

    Returns:
        The resulting ``hfst.HfstTransducer``.
    """
    tr = hfst.HfstBasicTransducer()
    tr.set_final_weight(0, 0.0)
    tr.add_transition(0,
                      hfst.HfstBasicTransition(
                          0, deletion_slot_symbol, deletion_slot_symbol, 0.0))
    # Everything in the alphabet except the special HFST symbols and the
    # deletion symbol.  deletion_slot_symbol is NOT removed here.
    # NOTE(review): if the slot symbol is part of `alphabet`, it also gets
    # the "printable" arcs below -- confirm that this is intended.
    printable_chars = set(alphabet) -\
                      { hfst.EPSILON, hfst.IDENTITY, hfst.UNKNOWN,
                        deletion_symbol }
    # Non-negative balances: a printable symbol moves one state up, a
    # deletion symbol moves one state down.
    for i in range(length):
        for c in printable_chars:
            tr.add_transition(i,
                              hfst.HfstBasicTransition(i+1, c, c, 0.0))
        tr.add_transition(i+1,
                          hfst.HfstBasicTransition(
                              i, deletion_symbol, hfst.EPSILON, 0.0))
        tr.add_transition(i+1,
                          hfst.HfstBasicTransition(
                              i+1, deletion_slot_symbol, deletion_slot_symbol, 0.0))
        tr.set_final_weight(i+1, 0.0)
    # Negative balances start right after the non-negative states.
    first_negative_state = length+1
    tr.add_transition(0, hfst.HfstBasicTransition(
                             first_negative_state,
                             deletion_symbol, hfst.EPSILON, 0.0))
    for c in printable_chars:
        tr.add_transition(first_negative_state,
                          hfst.HfstBasicTransition(0, c, c, 0.0))
    # NOTE(review): first_negative_state itself receives no
    # deletion_slot_symbol self-loop; only the deeper negative states added
    # in the loop below do -- confirm that this is intended.
    for i in range(length-1):
        tr.add_transition(first_negative_state+i,
                          hfst.HfstBasicTransition(
                              first_negative_state+i+1,
                              deletion_symbol, hfst.EPSILON, 0.0))
        tr.add_transition(first_negative_state+i+1,
                          hfst.HfstBasicTransition(
                              first_negative_state+i+1,
                              deletion_slot_symbol, deletion_slot_symbol, 0.0))
        for c in printable_chars:
            tr.add_transition(first_negative_state+i+1,
                              hfst.HfstBasicTransition(
                                  first_negative_state+i, c, c, 0.0))
    tr_c = hfst.HfstTransducer(tr)
    return tr_c
def accum_input_labels(fst, separator=""):
    """Encode, weight and prune a transducer

    fst -- transducer to be processed, input labels are strings of
    alphabet symbols and output labels are single alphabet symbols

    separator -- null string or a symbol not part of the alphabet

    Returns a transducer where input labels of transitions are
    concatenations of the input label and the output label of the
    original transition, the weights are according to the weights of
    the resulting morphophonemes and all transitions with invalid
    morphophoneme labels are discarded.  Final states and their weights
    are copied unchanged, and the result is minimized before it is
    returned.
    """
    if cfg.verbosity >= 10:
        print("to be accumulated:\n", fst)
    source = hfst.HfstBasicTransducer(fst)
    accumulated = hfst.HfstBasicTransducer()
    for state in source.states():
        accumulated.add_state(state)
        if source.is_final_state(state):
            accumulated.set_final_weight(
                state, source.get_final_weight(state))
        for arc in source.transitions(state):
            target = arc.get_target_state()
            in_label = arc.get_input_symbol()
            out_label = arc.get_output_symbol()
            arc_weight = arc.get_weight()
            mphon = in_label + separator + out_label
            if cfg.verbosity >= 25:
                print("arc", state, target, in_label, out_label, arc_weight)
            if alphabet.mphon_is_valid(mphon):
                # The new arc carries the morphophoneme on both sides and
                # is weighted by the alphabet, not by the original arc.
                accumulated.add_transition(
                    state,
                    hfst.HfstBasicTransition(
                        target, mphon, mphon, alphabet.mphon_weight(mphon)))
                if cfg.verbosity >= 25:
                    print("after addition of transition:\n", accumulated)
    result_fst = hfst.HfstTransducer(accumulated)
    result_fst.minimize()
    if cfg.verbosity >= 10:
        print("accumulated fst:\n", result_fst)
    return result_fst
assert tr.compare(hfst.empty_fst())

# Names given in `definitions` are expanded inside the regex; without the
# definitions the same names are taken as plain symbols.
defs = {'foo': hfst.regex('Foo'), 'bar': hfst.regex('Bar')}
tr = hfst.regex('foo bar', definitions=defs)
assert tr.compare(hfst.regex('Foo Bar'))
tr = hfst.regex('foo bar')
assert tr.compare(hfst.regex('foo bar'))

# print('\n--- Testing HfstBasicTransducer ---\n')

# Create basic transducer, write it to file, read it, and test equivalence
fsm = hfst.HfstBasicTransducer()
fsm.add_state(0)
fsm.add_state(1)
fsm.set_final_weight(1, 0.3)
tr = hfst.HfstBasicTransition(1, 'foo', 'bar', 0.5)
fsm.add_transition(0, tr)
fsm.add_transition(0, 0, 'baz', 'baz')
fsm.add_transition(0, 0, 'baz', 'BAZ', 0.1)

# Context managers close the file even if writing or reading raises.
with open('foo_basic', 'w') as f:
    fsm.write_att(f)

with open('foo_basic', 'r') as f:
    fsm2 = hfst.HfstBasicTransducer(hfst.read_att_transducer(f, hfst.EPSILON))

# Modify weights of a basic transducer
fsm = hfst.HfstBasicTransducer()
fsm.add_state(0)
tok.add_multichar_symbol('foo') tok.add_multichar_symbol('bar') tr = hfst.tokenized_fst(tok.tokenize('foobar', 'foobaz')) if not tr.compare(hfst.regex('[foo:foo bar:b 0:a 0:z]')): raise RuntimeError('') # HfstBasicTransducer # Create an empty transducer # The transducer has initially one start state (number zero) # that is not final fsm = hfst.HfstBasicTransducer() # Add two states to the transducer fsm.add_state(1) fsm.add_state(2) # Create a transition [foo:bar] leading to state 1 with weight 0.1 tr = hfst.HfstBasicTransition(1, 'foo', 'bar', 0.1) # and add it to state zero fsm.add_transition(0, tr) # Add a transition [baz:baz] with weight 0 from state 1 to state 2 fsm.add_transition(1, hfst.HfstBasicTransition(2, 'baz', 'baz', 0.0)) # Set state 2 as final with weight 0.3 fsm.set_final_weight(2, 0.3) # Go through all states for state, arcs in enumerate(fsm): for arc in arcs: print('%i ' % (state), end='') print(arc) if fsm.is_final_state(state): print('%i %f' % (state, fsm.get_final_weight(state)) ) for state in fsm.states():
def _generator_for_seq(seq):
    """Build a linear acceptor for exactly the symbols of ``seq``.

    State ``i`` is connected to state ``i + 1`` by an identity transition
    (weight 0.0) on the ``i``-th symbol; state ``len(seq)`` is final with
    weight 0.0.

    Args:
        seq: Sized sequence of symbols.

    Returns:
        The resulting ``hfst.HfstTransducer``.
    """
    chain = hfst.HfstBasicTransducer()
    for target, symbol in enumerate(seq, start=1):
        arc = hfst.HfstBasicTransition(target, symbol, symbol, 0.0)
        chain.add_transition(target - 1, arc)
    chain.set_final_weight(len(seq), 0.0)
    return hfst.HfstTransducer(chain)