def __construct_suff_phon(self):
    '''
    Suffix phonology: delete a suffix-initial "i" after stems ending in
    "i" or in a consonant + "y".
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
             "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>",
             "<NoDef>", "<NN>", "<ADJ>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
    ).closure()

    # tau: delete "i"
    Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
    # lambda: left context is "i" or a consonant + "y", then <Suff_Stems>
    Lambda = pynini.concat(
        pynini.union(
            pynini.acceptor("i", token_type=self.__syms.alphabet),
            pynini.concat(
                self.__syms.consonants.project(),
                pynini.acceptor("y", token_type=self.__syms.alphabet))),
        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

    return pynini.concat(
        pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
        self.__tail).optimize()
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems
    '''
    return pynini.concat(
        pynini.transducer("", "<Kompos_Stems>",
                          output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>",
                                            token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Sg>",
                                            token_type=self.__syms.alphabet))),
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>",
                                            token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Pl>",
                                            token_type=self.__syms.alphabet))))),
            tmp),
        pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
        pynini.transducer("", "<kompos> <nativ>",
                          output_token_type=self.__syms.alphabet)).optimize()
def __construct_r1(self):
    '''
    Umlaut

    Apfel$ ==> Äpfel
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
             "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
             "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # r1a: umlaut the stem vowel if an umlaut trigger <UL> follows
    tau = pynini.push(
        pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"),
             ("A", "Ä"), ("O", "Ö"), ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
        push_labels=True)
    lc = pynini.union(
        self.__syms.consonants,
        pynini.string_map(
            ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()
    r1a = pynini.cdrewrite(
        tau,
        lc,
        pynini.concat(
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure())

    # r1c: delete the second "a" of a double "a" after umlauting
    # (e.g. Saal ==> Säle)
    tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
    r1c = pynini.cdrewrite(
        tau,
        pynini.string_map(
            ["ä", "Ä"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        pynini.concat(
            self.__syms.consonants_lower,
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure()).optimize()

    # r1d: realize the umlaut trigger <UL> as <FB>
    r1d = pynini.cdrewrite(
        pynini.transducer("<UL>", "<FB>",
                          input_token_type=self.__syms.alphabet,
                          output_token_type=self.__syms.alphabet),
        "", "", alphabet.closure())

    return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
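# Illustrative sketch (not part of the original module): the same
# cdrewrite pattern as r1a, reduced to plain byte strings so that it runs
# without the project's symbol table. The names `sigma` and `toy_umlaut`
# are invented for this example, and the context conditioning on <UL> is
# omitted for brevity.
sigma = pynini.union(*"AÄpfel").closure().optimize()
toy_umlaut = pynini.cdrewrite(pynini.transducer("A", "Ä"), "", "", sigma)
print(pynini.shortestpath(pynini.acceptor("Apfel") @ toy_umlaut).stringify())
# -> "Äpfel"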
def __construct_r14(self):
    '''
    e-epenthesis 2
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # realize <DEL-S> as "e" after "d"/"t" (optionally followed by "m")
    # or after "tw"
    tau = pynini.transducer("<DEL-S>", "e",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet)
    return pynini.cdrewrite(
        tau,
        pynini.union(
            pynini.concat(
                pynini.string_map(
                    ["d", "t"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                pynini.acceptor("m",
                                token_type=self.__syms.alphabet).closure(0, 1)),
            pynini.acceptor("t w", token_type=self.__syms.alphabet)),
        "",
        alphabet.closure()).optimize()
def __construct_del_ge(self):
    '''
    Case-dependent deletion of the <ge> marker
    '''
    # delete <ge> after certain prefixes like "ver"
    return pynini.concat(
        pynini.transducer("<no-ge>", "",
                          input_token_type=self.__syms.alphabet),
        pynini.concat(
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.string_map(
                        ["<n>", "<e>", "<d>", "<~n>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project()).closure(),
                pynini.concat(
                    pynini.transducer("<V> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.acceptor("<NoDef>",
                                    token_type=self.__syms.alphabet).closure(0, 1),
                    pynini.transducer("<ge>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet))))).optimize()
def example0():
    # x and y are single FST objects: byte- and utf8-mode acceptors
    # over the same string
    s = u"Pont l'Evêque"
    x = pynini.acceptor(s)
    print(u"Byte string acceptor from %s" % s)
    print(x)
    y = pynini.acceptor(u"Pont l'Evêque", token_type="utf8")
    print(u"utf8 string acceptor from %s" % s)
    print(y)
def __construct_verbal_pref_stems(self):
    '''
    Verbal prefix stems
    '''
    return pynini.compose(
        self.__pref_stems,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            self.__sigma_star,
            pynini.acceptor("<V>", token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def load_lexicon(source, symbol_table):
    '''
    Load lexicon entries from source, interpreting them with the given
    symbol table.
    '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)
    # longest match, prefer complex symbols like <NN> over single characters
    tokenizer = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
    for line in source:
        line = line.strip()
        if line:
            tmp = pynini.Fst()
            tmp.set_input_symbols(symbol_table)
            tmp.set_output_symbols(symbol_table)
            start = tmp.add_state()
            tmp.set_start(start)
            tmp.set_final(start)
            for token in tokenizer.findall(line):
                if token[1]:
                    # "a:b" pair: transduce the first symbol to the second
                    tmp = pynini.concat(
                        tmp,
                        pynini.transducer(token[0], token[1],
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table))
                else:
                    tmp = pynini.concat(
                        tmp,
                        pynini.acceptor(token[0], token_type=symbol_table))
            lex = pynini.union(lex, tmp)
    return lex
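# Minimal usage sketch (hypothetical data; the real symbol table and
# lexicon files come from elsewhere in the project). Symbol 0 must be
# epsilon, and entries mix single characters with multi-character tags:
syms = pynini.SymbolTable()
syms.add_symbol("<epsilon>")
for ch in "Hund":
    syms.add_symbol(ch)
syms.add_symbol("<NN>")
lex = load_lexicon(["Hund<NN>"], syms)  # tokens: H, u, n, d, <NN>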
def process_window(input_str, window_fst, model,
                   pruning_weight=5, rejection_weight=1.5):
    '''
    Compose a window input automaton with the model.
    '''
    t1 = time.time()
    window_fst.relabel_tables(
        new_isymbols=model[0].output_symbols(),
        new_osymbols=model[0].output_symbols())
    for fst in model:
        window_fst = pynini.compose(window_fst, fst)
        window_fst.project(project_output=True)
        window_fst.prune(weight=pruning_weight)
        window_fst.optimize()
    t3 = time.time()
    logging.debug('- composition: {}s'.format(t3 - t1))
    # also allow the identity mapping for windows of length 1
    # (with weight `rejection_weight`)
    if ' ' not in input_str:
        # The formula:
        #   rejection_weight * (len(input_str) + 2)
        # means that rejection_weight*2 is the initial cost of having an
        # OOV word (which then gets more expensive with increasing length).
        # While discovered by accident, this turned out to work well as
        # a very naive OOV word model.
        window_fst.union(
            pynini.acceptor(
                escape_for_pynini(input_str),
                weight=rejection_weight * (len(input_str) + 2)))
    t2 = time.time()
    logging.debug('Total processing time: {}s'.format(t2 - t1))
    return window_fst
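# Worked example of the rejection weight above (window text assumed):
# a single-token, 6-character window with the default rejection_weight
# gets an identity fallback costing 1.5 * (6 + 2) = 12.0, so the fallback
# is only chosen when every hypothesis surviving composition and pruning
# is heavier than that.
assert 1.5 * (len("Brücke") + 2) == 12.0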
def create_window(tokens):
    '''
    Create a window for the given input tokens (supplied as a list of
    strings).
    '''
    result = pynini.acceptor(escape_for_pynini(' '.join(tokens)))
    return result
def rewrite(self, i: str) -> str:
    lattice = pynini.acceptor(i, token_type=self.token_type) @ self.rule
    if lattice.start() == pynini.NO_STATE_ID:
        logging.error("Composition failure: %s", i)
        return "<composition failure>"
    return pynini.shortestpath(lattice).stringify(
        token_type=self.token_type)
def rewrite(self, i: str) -> str:
    lattice = (
        pynini.acceptor(i, token_type=self.input_token_type) @ self.fst)
    if lattice.start() == pynini.NO_STATE_ID:
        logging.error("Composition failure: %s", i)
        return "<composition failure>"
    lattice.project(True).rmepsilon()
    return pynini.shortestpath(lattice).string(self.output_token_type)
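# Hedged sketch of the same rewrite pattern outside the class, using a
# toy byte-mode rule (`sigma` and `toy_rule` are invented for this
# example). It also shows what the NO_STATE_ID check above catches:
# inputs outside the rule's alphabet yield an empty lattice.
sigma = pynini.union("a", "b").closure().optimize()
toy_rule = pynini.cdrewrite(pynini.transducer("a", "b"), "", "", sigma)
ok = pynini.acceptor("ab") @ toy_rule       # non-empty lattice
bad = pynini.acceptor("xy") @ toy_rule      # empty lattice
print(pynini.shortestpath(ok).stringify())  # -> "bb"
print(bad.start() == pynini.NO_STATE_ID)    # -> True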
def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
             "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>",
             "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>",
             "<^Del>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    c2 = pynini.union(alphabet, self.__syms.stem_types).closure().optimize()

    # From deko.fst:
    # insert "zu" after verbal prefixes if followed by infinitive marker
    return pynini.union(
        c2,
        #pynini.concat(
        #    pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
        #    alphabet.closure(),
        #    pynini.transducer("<^zz>", "",
        #                      input_token_type=self.__syms.alphabet),
        #    alphabet.closure()
        #    ),
        pynini.concat(
            c2,
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("", "z u",
                              output_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<^zz>", "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure())).optimize()
def far_compile_string(self, string, lex_in, unknown_symbol):
    # map words missing from the symbol table to the unknown symbol
    new_string = ""
    for w in string.split(" "):
        if not lex_in.member(w):
            new_string += unknown_symbol + " "
        else:
            new_string += w + " "
    new_string = new_string.strip()
    return pynini.acceptor(new_string, token_type=lex_in)
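# Hedged usage sketch (hypothetical symbols; since `self` is unused in
# the body, the method is called unbound here purely for illustration):
lex = pynini.SymbolTable()
lex.add_symbol("<epsilon>")
for w in ("<unk>", "the", "cat"):
    lex.add_symbol(w)
a = far_compile_string(None, "the dog", lex, "<unk>")
# "dog" is not in lex, so the acceptor covers the symbols "the <unk>"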
def recombine_windows(window_fsts):
    '''
    Recombine processed window FSTs (containing hypotheses for a given
    window) to a lattice, which is also represented as an FST.
    '''

    def _label(pos, length):
        return 'WIN-{}-{}'.format(pos, length)

    t1 = time.time()
    space_tr = pynini.acceptor(' ')

    # determine the input string length and max. window size
    # (TODO without iterating!!!)
    num_tokens = max(i for (i, j) in window_fsts) + 1
    max_window_size = max(j for (i, j) in window_fsts)

    root = pynini.Fst()
    for i in range(num_tokens + 1):
        s = root.add_state()
    root.set_start(0)
    root.set_final(num_tokens, 0)

    # FIXME refactor the merging of symbol tables into a separate function
    symbol_table = pynini.SymbolTable()
    for window_fst in window_fsts.values():
        symbol_table = pynini.merge_symbol_table(
            symbol_table, window_fst.input_symbols())
        symbol_table = pynini.merge_symbol_table(
            symbol_table, window_fst.output_symbols())
    for (pos, length), window_fst in window_fsts.items():
        label = _label(pos, length)
        sym = symbol_table.add_symbol(label)
    root.set_input_symbols(symbol_table)
    root.set_output_symbols(symbol_table)

    replacements = []
    for (pos, length), window_fst in window_fsts.items():
        label = _label(pos, length)
        sym = root.output_symbols().find(label)
        if pos + length < num_tokens:
            # append a space if this is not the last token, so that the
            # final string consists of tokens separated by spaces
            window_fst.concat(space_tr)
        replacements.append((label, window_fst))
        root.add_arc(pos, pynini.Arc(0, sym, 0, pos + length))

    result = pynini.replace(root, replacements)
    result.optimize()

    t2 = time.time()
    logging.debug('Recombining time: {}s'.format(t2 - t1))
    return result
def __construct_base_stems(self):
    '''
    Base stems
    '''
    return pynini.compose(
        self.__bdk_stems,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def lexicon_to_window_fst(lexicon_fst, words_per_window=2):
    '''
    Concatenate the lexicon FST `words_per_window` times, inserting spaces
    in between. The resulting FST accepts up to `words_per_window` words
    from the lexicon.
    '''
    result = lexicon_fst.copy()
    if words_per_window == 1:
        return result
    result.concat(pynini.acceptor(' '))
    result.closure(0, words_per_window - 1)
    result.concat(lexicon_fst)
    return result
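# Hedged usage sketch with a toy lexicon (names invented for the example):
toy_lexicon = pynini.union("ab", "cd").optimize()
window = lexicon_to_window_fst(toy_lexicon, words_per_window=2)
# `window` now accepts "ab", "cd", "ab cd", "cd ab", "ab ab", and "cd cd".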
def __construct_pref_deriv_suff_stems(self):
    '''
    Derivation suffixes which combine with prefixed stems
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<prefderiv>", "",
                              input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def lookup(self, string):
    '''
    Analyse a string
    '''
    result = []
    if self.__verify():
        string_acceptor = pynini.acceptor(
            " ".join(c for c in string), token_type=self.__syms.alphabet)
        intermediate = pynini.compose(self.__timur, string_acceptor)
        paths = intermediate.paths(
            input_token_type=intermediate.input_symbols(),
            output_token_type=intermediate.output_symbols())
        result = list(paths.items())
    return result
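# Hedged sketch of the same paths() enumeration pattern with a toy
# byte-mode analyser (names and analyses invented for the example; the
# tuple layout mirrors the items() usage above):
toy = pynini.union(pynini.transducer("cat", "cat<NN>"),
                   pynini.transducer("cat", "cat<V>")).optimize()
inter = pynini.compose(pynini.acceptor("cat"), toy)
for istr, ostr, weight in inter.paths(input_token_type="byte",
                                      output_token_type="byte").items():
    print(ostr)  # "cat<NN>" and "cat<V>"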
def __suff_stems_filter(self, features):
    '''
    Return a union over filters for each feature given
    '''
    filtering = pynini.Fst()
    filtering.set_input_symbols(self.__syms.alphabet)
    filtering.set_output_symbols(self.__syms.alphabet)
    suff_stems = pynini.acceptor("<Suff_Stems>",
                                 token_type=self.__syms.alphabet)
    for feature in features:
        to_eps = pynini.transducer(feature, "",
                                   input_token_type=self.__syms.alphabet)
        filtering = pynini.union(
            filtering, pynini.concat(to_eps, suff_stems, to_eps))
    return filtering.optimize()
def __construct_quant_suff_stems(self):
    '''
    Derivation suffixes which combine with a number and a simplex stem
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            pynini.transducer("<QUANT>", "",
                              input_token_type=self.__syms.alphabet),
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<simplex>", "",
                              input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def __init__(self, syms, lexicon):

    # store alphabet
    self.__syms = syms

    # store lexicon
    self.__lex = lexicon

    # (private) helpers
    self.__sigma_star = pynini.union(
        syms.characters,
        syms.categories,
        syms.stem_types,
        syms.stem_type_features,
        syms.origin_features,
        syms.circumfix_features,
        syms.inflection_classes,
        syms.geo_inflection_classes,
        # for word-internal <ge> (ausgewertet)
        pynini.acceptor("<ge>", token_type=syms.alphabet)
    ).closure().optimize()

    # NoDef2NULL
    self.__nodef_to_null = pynini.union(
        self.__sigma_star,
        syms.origin_features,
        pynini.transducer("<NoDef>", "",
                          input_token_type=self.__syms.alphabet),
        syms.stem_types).closure().optimize()

    # sublexica
    self.__bdk_stems = self.__construct_bdk_stems()
    self.__base_stems = self.__construct_base_stems()
    self.__pref_stems = self.__construct_pref_stems()
    self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
    self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
    self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
    self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
    self.__quant_suff_stems = self.__construct_quant_suff_stems()
def __construct_participle_adj(self, tmp, sublexica):
    '''
    Stems for conversion of participles into adjectives
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<VPART>", "<VPREF>", "<PREF>", "<CONV>", "<SUFF>", "<NN>",
             "<ADJ>", "<V>", "<FT>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).closure().optimize()

    return pynini.concat(
        pynini.transducer("", "<Base_Stems>",
                          output_token_type=self.__syms.alphabet),
        pynini.union(
            # past participles ending in "t" (e.g. "gekauft"): <Adj+e>
            pynini.concat(
                pynini.compose(
                    pynini.concat(
                        alphabet,
                        pynini.transducer(
                            "<V>", "<+V>",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<zu>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.acceptor("<PPast>",
                                        token_type=self.__syms.alphabet)),
                    pynini.compose(
                        tmp,
                        pynini.concat(
                            sublexica.nodef_to_null,
                            pynini.acceptor(
                                "t", token_type=self.__syms.alphabet)))),
                pynini.transducer("", "<ADJ>",
                                  output_token_type=self.__syms.alphabet),
                pynini.transducer("<CONV>", "",
                                  input_token_type=self.__syms.alphabet),
                pynini.transducer("", "<base> <nativ> <Adj+e>",
                                  output_token_type=self.__syms.alphabet)),
            # participles ending in "en" or "nd" (e.g. "gelaufen",
            # "laufend"): <Adj+>
            pynini.concat(
                pynini.compose(
                    pynini.concat(
                        alphabet,
                        pynini.transducer(
                            "<V>", "<+V>",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<zu>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.string_map(
                            ["<PPast>", "<PPres>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project()),
                    pynini.compose(
                        tmp,
                        pynini.concat(
                            sublexica.nodef_to_null,
                            pynini.acceptor(
                                "e n", token_type=self.__syms.alphabet)
                            | pynini.acceptor(
                                "n d", token_type=self.__syms.alphabet)))),
                pynini.transducer("", "<ADJ>",
                                  output_token_type=self.__syms.alphabet),
                pynini.transducer("<CONV>", "",
                                  input_token_type=self.__syms.alphabet),
                pynini.transducer("", "<base> <nativ> <Adj+>",
                                  output_token_type=self.__syms.alphabet)))).optimize()
def get_paths(decode_graph, isymbs, osymbs, phoneme_list):
    phoneme_fst = pynini.acceptor(" ".join(phoneme_list), token_type=isymbs)
    return [
        path for path in pynini.compose(phoneme_fst, decode_graph).paths(
            input_token_type=isymbs, output_token_type=osymbs)
    ]
def __construct_ge_nom_stems_v(self, tmp):
    '''
    Stems for ge-nominalization of verbs ("Gejammer")
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        self.__syms.categories,
        pynini.string_map(
            ["<CONV>", "<SUFF>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # extract infinitives
    infinitives = pynini.compose(
        pynini.concat(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.acceptor(
                    "<PREF>", token_type=self.__syms.alphabet)).closure(),
            alphabet.closure(1),
            pynini.transducer("", "<+V> <Inf>",
                              output_token_type=self.__syms.alphabet)),
        tmp).optimize()

    insert_ge = pynini.concat(
        pynini.concat(
            self.__syms.characters.closure(1),
            pynini.acceptor(
                "<PREF>", token_type=self.__syms.alphabet)).closure(),
        pynini.transducer("g e <PREF> <Ge>", "",
                          input_token_type=self.__syms.alphabet),
        alphabet.closure(1)).optimize()

    inserted_ge = pynini.compose(
        pynini.compose(insert_ge, infinitives).project(),
        pynini.union(
            self.__syms.to_lower,
            self.__syms.categories,
            self.__syms.prefix_suffix_marker,
            pynini.acceptor(
                "<Ge>", token_type=self.__syms.alphabet)).closure()).optimize()

    return pynini.concat(
        pynini.transducer("", "<Deriv_Stems>",
                          output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.compose(
                pynini.compose(
                    pynini.union(
                        alphabet,
                        pynini.acceptor("<PREF>",
                                        token_type=self.__syms.alphabet),
                        pynini.transducer(
                            "", "<Ge>",
                            output_token_type=self.__syms.alphabet)).closure(),
                    inserted_ge),
                pynini.union(
                    self.__syms.characters,
                    pynini.acceptor("<Ge>", token_type=self.__syms.alphabet),
                    pynini.transducer(
                        pynini.union(self.__syms.categories,
                                     self.__syms.prefix_suffix_marker),
                        "")).closure()),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.acceptor("<Ge>", token_type=self.__syms.alphabet),
                ).closure(1),
                pynini.transducer("e n", "",
                                  input_token_type=self.__syms.alphabet))),
        pynini.acceptor("<V>", token_type=self.__syms.alphabet),
        pynini.transducer("", "<deriv> <nativ>",
                          output_token_type=self.__syms.alphabet)).optimize()
    ["t", "d"]])  # closes the string_map defining double_consonants_reduce

# Courtesy of
# http://www.lysator.liu.se/language/Languages/Finnish/Grammar.html and
# https://web.stanford.edu/~kiparsky/Papers/finnish.article.pdf
consonant_reduction = pynini.cdrewrite(
    double_consonants_reduce,
    "l" | vowels | "n",
    vowels + suffixes,
    closure).optimize()

# Vowel insertion to break consonant clusters caused by suffixes
insertion = pynini.cdrewrite(
    pynini.transducer("", "e"), consonants, suffixes, closure).optimize()

# Finnish seems to attempt to preserve the mora count with /s/ as a
# syllable end: a stop is generated that assimilates the 'highness' of
# the vowel and becomes /k/. If this generated stop occurs after VV, it
# instead assimilates /s/ and becomes /t/; gradation then occurs due to
# the /e/ insertion. A similar situation occurs with /s/ -> /a/ / a _ +
# suffix, so that case was added to the transducer as well.
final_stress_preservation = pynini.cdrewrite(
    pynini.transducer("s", "t"),
    vowels + (pynini.acceptor("y") | "u"),
    suffixes,
    closure) * pynini.cdrewrite(
        pynini.transducer("", "k"),
        pynini.acceptor("y") | "u",
        "s" + suffixes,
        closure) * pynini.cdrewrite(
            pynini.transducer("s", "a"), "a", suffixes, closure)
final_stress_preservation.optimize()

# Rule for /nt/ assimilation
nt_assimilation = pynini.cdrewrite(
    pynini.transducer("t", "n"), "n", vowels + suffixes, closure).optimize()

# Composition of the rules
transducer_adessive = (regularize * transducer_adessive_base *
                       nt_assimilation * final_stress_preservation *
                       insertion * consonant_reduction * rvregularize)
transducer_inessive = (regularize * transducer_inessive_base *
                       nt_assimilation * final_stress_preservation *
                       insertion * consonant_reduction * rvregularize)

######################### Generates FAR #########################
####
dir_path = os.path.dirname(os.path.realpath(__file__))
ST = pynini.SymbolTable.read_text(f"{dir_path}/syms.txt")


def draw(x, opt=True):
    if opt:
        x.optimize()
    x.draw(f"{dir_path}/obdd.dot", ST, ST, portrait=True, acceptor=True)


####
# constants
T = pynini.acceptor("T", token_type=ST)
F = pynini.acceptor("F", token_type=ST)
ANY = T | F
TAUT = (T | F).closure()  # sigma-star
ABSURD = T - T            # the empty language


# boolean operators
def AND(*args):
    return reduce(pynini.intersect, map(v, args), TAUT)


def OR(*args):
    return reduce(pynini.union, map(v, args), ABSURD)
def __construct_uplow(self):
    '''
    Upper/Lower case markers
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoDef>", "<FB>", "<UL>",
             "<SS>", "<DEL-S>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>",
             "<^imp>", "<ge>", "<^zz>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    s = pynini.concat(
        alphabet,
        pynini.union(
            alphabet,
            pynini.acceptor(
                "<CB>", token_type=self.__syms.alphabet)).closure()).optimize()

    s2 = pynini.concat(
        pynini.union(
            pynini.concat(
                pynini.transducer("<CB>", "",
                                  input_token_type=self.__syms.alphabet),
                self.__syms.characters_upper),
            pynini.concat(
                pynini.transducer(
                    "<CB>", "",
                    input_token_type=self.__syms.alphabet).closure(0, 1),
                self.__syms.characters_lower)),
        s).optimize()

    return pynini.union(
        pynini.concat(
            pynini.transducer("<^UC>", "",
                              input_token_type=self.__syms.alphabet),
            pynini.string_map(
                ["<NoDef>", "<NoHy>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project().closure(0, 1),
            pynini.transducer("", "<^UC>",
                              output_token_type=self.__syms.alphabet),
            s2,
            pynini.transducer("<Low#>", "",
                              input_token_type=self.__syms.alphabet)),
        pynini.concat(
            pynini.acceptor("<NoHy>",
                            token_type=self.__syms.alphabet).closure(0, 1),
            pynini.union(
                pynini.concat(
                    pynini.transducer("<CB>", "",
                                      input_token_type=self.__syms.alphabet),
                    s,
                    pynini.transducer("<Fix#>", "",
                                      input_token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer(
                        pynini.string_map(
                            ["<CB>", "<epsilon>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                        "<^UC>",
                        output_token_type=self.__syms.alphabet),
                    s,
                    pynini.transducer("<Up#>", "",
                                      input_token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer(
                        pynini.string_map(
                            ["<CB>", "<epsilon>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                        "<CB>",
                        output_token_type=self.__syms.alphabet),
                    s,
                    pynini.transducer(
                        "<Low#>", "",
                        input_token_type=self.__syms.alphabet))))).optimize()
def __construct_prefix_origin_filter(self):
    '''
    Match origin of prefix and stem
    '''
    return pynini.concat(
        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
        pynini.concat(
            pynini.union(
                self.__syms.characters,
                pynini.string_map(
                    ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project()).closure(),
            pynini.union(
                pynini.concat(
                    pynini.transducer("<ADJ> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<ADJ>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<ABK> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<ABK>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<NN> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<NN> <fremd>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<fremd>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<NE> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<NE>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<NE> <fremd>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<NE>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<fremd>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<ADJ> <fremd>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<ADJ>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<fremd>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<V> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<V>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>",
                                    token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.transducer("<V> <nativ>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<V>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    self.__syms.ns_features),
                pynini.concat(
                    pynini.transducer("<ADJ> <klassisch>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<ADJ>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.string_map(
                        ["<frei>", "<gebunden>", "<kurz>", "<lang>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project()),
                pynini.concat(
                    pynini.transducer("<NN> <klassisch>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.string_map(
                        ["<frei>", "<gebunden>", "<kurz>", "<lang>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project()),
                pynini.concat(
                    pynini.transducer("<V> <klassisch>", "",
                                      input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    pynini.acceptor("<V>", token_type=self.__syms.alphabet),
                    self.__syms.stem_type_features,
                    pynini.string_map(
                        ["<frei>", "<gebunden>", "<kurz>", "<lang>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project())))).optimize()
def __construct_insert_ge(self):
    '''
    Inserts the prefix "ge" controlled by the symbol "<ge>"
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
             "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>",
             "<Fix#>", "<^imp>", "<^zz>", "<^UC>", "<^Ax>", "<^pl>",
             "<^Gen>", "<^Del>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    c2 = pynini.union(alphabet, self.__syms.stem_types).closure().optimize()

    # From deko.fst:
    # replace <ge> with "ge" if followed by the perfect participle marker
    # or a ge-nominalisation; otherwise delete <ge>. In complex lexicon
    # entries, as for "haushalten", <ge> is not followed by <Base_Stems>.
    return pynini.union(
        c2,
        pynini.concat(
            c2,
            pynini.transducer("<ge>", "",
                              input_token_type=self.__syms.alphabet),
            pynini.acceptor("<Base_Stems>",
                            token_type=self.__syms.alphabet).closure(0, 1),
            pynini.transducer("", "g e",
                              output_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<^pp>", "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure()),
        pynini.concat(
            c2,
            pynini.acceptor("<Deriv_Stems>",
                            token_type=self.__syms.alphabet).closure(0, 1),
            alphabet.closure(),
            pynini.transducer("<Ge>", "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<Suff_Stems> <Ge-Nom>", "e",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet),
            alphabet.closure()),
        pynini.concat(
            c2,
            pynini.transducer("<ge>", "",
                              input_token_type=self.__syms.alphabet),
            pynini.acceptor("<Base_Stems>",
                            token_type=self.__syms.alphabet).closure(0, 1),
            alphabet.closure()),
        pynini.concat(
            c2,
            pynini.acceptor("<Base_Stems>",
                            token_type=self.__syms.alphabet).closure(0, 1),
            alphabet.closure(),
            pynini.transducer("<^pp>", "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure())).optimize()
def __construct_compound_filter(self):
    '''
    Construct the compound filter
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
             "<ge>", "<Ge>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
        pynini.transducer(self.__syms.categories, ""),
        pynini.transducer(self.__syms.origin_features, ""),
        pynini.transducer("<NoPref>", "",
                          input_token_type=self.__syms.alphabet))

    return pynini.concat(
        pynini.union(
            pynini.transducer("<Initial>", "",
                              input_token_type=self.__syms.alphabet),
            pynini.acceptor("<NoHy>", token_type=self.__syms.alphabet),
            pynini.acceptor("<NoDef>",
                            token_type=self.__syms.alphabet)).closure(0, 1),
        pynini.concat(
            pynini.union(
                pynini.concat(
                    alphabet.closure(),
                    pynini.transducer(
                        pynini.string_map(
                            ["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>",
                             "<V>", "<ORD>", "<OTHER>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                        "")),
                pynini.concat(
                    pynini.transducer("", "<VADJ>",
                                      output_token_type=self.__syms.alphabet),
                    pynini.union(
                        alphabet,
                        pynini.transducer(
                            "<kompos>", "",
                            input_token_type=self.__syms.alphabet)).closure(),
                    pynini.transducer("<kompos>", "",
                                      input_token_type=self.__syms.alphabet),
                    alphabet.closure(),
                    pynini.transducer("<V>", "",
                                      input_token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.union(
                        alphabet,
                        pynini.transducer(
                            "<kompos>", "",
                            input_token_type=self.__syms.alphabet)).closure(),
                    pynini.transducer(
                        pynini.string_map(
                            ["<ADJ>", "<NN>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                        ""))),
            pynini.concat(
                pynini.transducer("<base>", "",
                                  input_token_type=self.__syms.alphabet),
                pynini.transducer(self.__syms.origin_features, ""),
                self.__syms.inflection_classes))).optimize()