def apply(self, line):
    # Tokenize the input line, then compose it with a weight-pushed copy
    # of the fallback transducer and minimize the result.
    tok = hfst.HfstTokenizer()
    transducer = hfst.HfstTransducer(self.fallbackTransducer)
    transducer.push_weights_to_end()
    words = hfst.tokenized_fst(tok.tokenize(line))
    words.compose(transducer)
    words.minimize()
    return words
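# A minimal usage sketch for apply() (hedged: the owning class and the
# construction of self.fallbackTransducer are not shown in this excerpt, so
# `analyzer` below is illustrative). extract_paths() is the stock
# HfstTransducer way to enumerate the resulting string pairs:
#
#     result = analyzer.apply('example line')
#     # dict: input string -> tuple of (output string, weight) pairs
#     for inp, outputs in result.extract_paths(max_number=10).items():
#         for out, weight in outputs:
#             print(inp, out, weight)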
def similar_words_with_block_composition(words, transducer_path):
    # `FST` and `shared` are project-level modules (their imports are not
    # part of this excerpt).

    def _compose_block(block, delenv, right_tr, tokenizer):
        # Disjoin all words of the block into one automaton, then compose
        # it with the deletion environment and the right-hand transducer.
        tr = hfst.empty_fst()
        for word in block:
            tr.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
        tr.minimize()
        tr.compose(delenv)
        tr.minimize()
        tr.compose(right_tr)
        tr.minimize()
        return tr

    def _extract_unique_io_pairs(transducer):
        # Breadth-first traversal collecting the unique (input, output)
        # string pairs accepted by the transducer. Note: this terminates
        # only for acyclic transducers.
        tr_b = hfst.HfstBasicTransducer(transducer)
        previous_io_pairs = []
        for s in tr_b.states():
            previous_io_pairs.append(set())
        previous_io_pairs[0].add(('', ''))
        results = set()
        empty = False
        while not empty:
            empty = True
            current_io_pairs = []
            for s in tr_b.states():
                current_io_pairs.append(set())
            for state, state_io_pairs in enumerate(previous_io_pairs):
                if state_io_pairs:
                    empty = False
                if tr_b.is_final_state(state):
                    results |= state_io_pairs
                for str_in, str_out in state_io_pairs:
                    for transition in tr_b.transitions(state):
                        target_state = transition.get_target_state()
                        sym_in = transition.get_input_symbol()
                        if sym_in == hfst.EPSILON:
                            sym_in = ''
                        elif sym_in in (hfst.IDENTITY, hfst.UNKNOWN):
                            raise RuntimeError('Illegal symbol!')
                        sym_out = transition.get_output_symbol()
                        if sym_out == hfst.EPSILON:
                            sym_out = ''
                        elif sym_out in (hfst.IDENTITY, hfst.UNKNOWN):
                            raise RuntimeError('Illegal symbol!')
                        current_io_pairs[target_state].add(
                            (str_in + sym_in, str_out + sym_out))
            previous_io_pairs = current_io_pairs
        # Convert the results to a dict: input word -> list of output words.
        results_dict = {}
        for word_1, word_2 in results:
            results_dict.setdefault(word_1, []).append(word_2)
        return results_dict

    delenv, right_tr = FST.load_cascade(transducer_path)
    tok = hfst.HfstTokenizer()
    for sym in shared.multichar_symbols:
        tok.add_multichar_symbol(sym)
    for sym in delenv.get_alphabet():
        if len(sym) > 1:
            tok.add_multichar_symbol(sym)
    block_size = shared.config['preprocess'].getint('block_size')
    # Process the word list block by block; yield each word together with
    # the list of similar words found for it (possibly empty).
    count = 0
    while count < len(words):
        block = words[count:count + block_size]
        tr = _compose_block(block, delenv, right_tr, tok)
        similar_words_for_word = _extract_unique_io_pairs(tr)
        for word in block:
            if word in similar_words_for_word:
                yield (word, similar_words_for_word[word])
            else:
                yield (word, [])
        count += block_size
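# A sketch of driving the generator above (hedged: 'wordlist.txt' and
# 'cascade.fst' are placeholder paths; FST.load_cascade and shared.config
# are this project's own helpers and must be set up as usual):
#
#     with open('wordlist.txt', encoding='utf-8') as fp:
#         words = [line.strip() for line in fp]
#     for word, similar in similar_words_with_block_composition(
#             words, 'cascade.fst'):
#         print(word, similar)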
import argparse

import hfst

from aligner import build_aligner

eps = hfst.EPSILON
# Padding symbol, quoted so it can be spliced into hfst regular expressions.
pad = '"<P>"'
eps_pair = (eps, eps)
pad_pair = (pad, pad)

tok = hfst.HfstTokenizer()
# Weighted edit-distance transducer: identity at cost 0; substitution,
# insertion and deletion at cost 1 each; epsilon at cost 0.
levenshtein = hfst.regex('[ ?::0 | ?:?::1 | 0:?::1 | ?:0::1 | 0:0::0 ]*')

# Characters to escape (backslash) or strip (stray control character).
cldict = {
    '\\': '\\\\',
    '\x84': '',
}


def clean(s):
    """Remove and escape certain characters."""
    for a, b in cldict.items():
        s = s.replace(a, b)
    return s
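# A sketch of what the `levenshtein` transducer is for (hedged: the call
# sequence below is an assumption about intended use, not code from this
# module). Composing a tokenized word with it yields every string reachable
# by weighted edits; n_best() then prunes to the cheapest candidates:
#
#     w = hfst.tokenized_fst(tok.tokenize(clean('kat')))
#     w.compose(levenshtein)
#     w.n_best(5)            # keep the 5 lowest-cost edit paths
#     print(w.extract_paths())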