def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems
    '''
    return pynini.concat(
        pynini.transducer("", "<Kompos_Stems>", output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Sg>", token_type=self.__syms.alphabet))),
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Pl>", token_type=self.__syms.alphabet))))),
            tmp),
        pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
        pynini.transducer("", "<kompos> <nativ>", output_token_type=self.__syms.alphabet)).optimize()

def __construct_r1(self):
    '''
    Umlaut

    Apfel$ ==> Äpfel
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>", "<e>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>", "<UL>", "<FB>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # r1a
    tau = pynini.push(
        pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"), ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
        push_labels=True)
    lc = pynini.union(
        self.__syms.consonants,
        pynini.string_map(
            ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()
    r1a = pynini.cdrewrite(
        tau,
        lc,
        pynini.concat(
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure())

    # r1c
    tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
    r1c = pynini.cdrewrite(
        tau,
        pynini.string_map(
            ["ä", "Ä"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        pynini.concat(
            self.__syms.consonants_lower,
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure()).optimize()

    # r1d
    r1d = pynini.cdrewrite(
        pynini.transducer("<UL>", "<FB>",
                          input_token_type=self.__syms.alphabet,
                          output_token_type=self.__syms.alphabet),
        "", "", alphabet.closure())

    return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()

def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        kompos_stems = pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Sg>")))),
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Pl>")))))),
            tmp)
        return (pynini.cross("", "<Kompos_Stems>")
                + kompos_stems
                + pynini.accep("<NN>")
                + pynini.cross("", "<kompos> <nativ>")).optimize()

def __construct_category_filter(self):
    '''
    Filter-out non-matching category sequences
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>",
             "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
        self.__syms.categories,
        ).closure()

    filtering = self.__suff_stems_filter(
        ["<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>", "<PRO>", "<V>", "<ORD>"])

    return pynini.concat(
        pynini.concat(alphabet, filtering).closure(),
        self.__tail).optimize()

def generate_fst_digit():
    fst_dict = {}
    fst_single_digit = generate_fst_for_factor_digit(0, True)
    for factor in range(0, 10):
        fst_dict[factor] = generate_fst_for_factor_digit(factor)

    # union over numbers with 1 up to 9 places
    fst = pn.a("")
    for num_places in range(1, 10):
        fst_for_x_digit_num = pn.a("")
        for num_place in range(num_places, 0, -1):
            if num_places == 1:
                fst_for_x_digit_num = pn.concat(fst_for_x_digit_num, fst_single_digit)
            else:
                fst_for_x_digit_num = pn.concat(fst_for_x_digit_num, fst_dict[num_place - 1])
        fst = pn.union(fst, fst_for_x_digit_num)

    # decimal part: "." or "," followed by digits; made optional below
    comma_numbers = pn.u(".", ",") + pn.u(*"0123456789").star
    fst = fst + pn.u("", comma_numbers)
    fst = fst.optimize()
    return fst

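# Usage sketch for generate_fst_digit (illustrative only): it assumes that
# generate_fst_for_factor_digit and the module-level alias `pn` for pynini are
# available as in the snippet above, and merely checks whether a digit string
# is covered by the resulting grammar.
def covers_digit_string(text):
    digit_fst = generate_fst_digit()
    return pn.compose(pn.a(text), digit_fst).num_states() > 0

# covers_digit_string("1234,56") should be True if the factor FSTs accept plain digits.
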
def __construct_suff_phon(self):
    '''
    Phonological adjustment at suffix boundaries:
    delete a suffix-initial "i" after a stem ending in "i" or in a consonant plus "y"
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                               "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                               "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]).project("input"),
            self.__syms.stem_types,
            ).closure()

        Tau = pynini.cross("i", "")
        Lambda = pynini.concat(
            pynini.union(
                pynini.accep("i"),
                pynini.concat(
                    self.__syms.consonants.project("input"),
                    pynini.accep("y")
                    )
                ),
            pynini.accep("<Suff_Stems>")
            )

        return pynini.concat(
            pynini.cdrewrite(
                Tau,
                Lambda,
                "",
                alphabet.project("input")
                ),
            self.__tail
            ).optimize()

def load_lexicon(source, symbol_table):
    '''
    Load lexica entries from source interpreting them using a given symbol table.
    '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)

    # longest match, prefer complex over simple symbols
    tokenizer = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)

    for line in source:
        line = line.strip()
        if line:
            tmp = pynini.Fst()
            tmp.set_input_symbols(symbol_table)
            tmp.set_output_symbols(symbol_table)
            start = tmp.add_state()
            tmp.set_start(start)
            tmp.set_final(start)
            for token in tokenizer.findall(line):
                if token[1]:
                    tmp = pynini.concat(
                        tmp,
                        pynini.transducer(token[0], token[1],
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table))
                else:
                    tmp = pynini.concat(
                        tmp,
                        pynini.acceptor(token[0], token_type=symbol_table))
            lex = pynini.union(lex, tmp)
    return lex

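# Usage sketch for load_lexicon (illustrative only): the symbol table and the
# single entry below are demonstration assumptions, not the project's real
# lexicon format. It relies on the legacy pynini API
# (pynini.transducer/pynini.acceptor with explicit token types) that the
# function itself uses.
import io
import pynini

demo_syms = pynini.SymbolTable()
demo_syms.add_symbol("<epsilon>")  # label 0 is conventionally epsilon
for char in "hofHOF":
    demo_syms.add_symbol(char)
demo_syms.add_symbol("<NN>")

# one entry: surface "hof" analysed as "Hof" and tagged <NN>
demo_lex = load_lexicon(io.StringIO("h:Ho:Of:f<NN>\n"), demo_syms)
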
def __construct_suff_phon(self):
    '''
    Phonological adjustment at suffix boundaries:
    delete a suffix-initial "i" after a stem ending in "i" or in a consonant plus "y"
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>",
             "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
        ).closure()

    Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
    Lambda = pynini.concat(
        pynini.union(
            pynini.acceptor("i", token_type=self.__syms.alphabet),
            pynini.concat(
                self.__syms.consonants.project(),
                pynini.acceptor("y", token_type=self.__syms.alphabet))),
        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

    return pynini.concat(
        pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
        self.__tail).optimize()

def __construct_del_ge(self):
    '''
    Case-dependent deletion of the ge marker
    '''
    # delete <ge> after certain prefixes like 'ver'
    return pynini.concat(
        pynini.transducer("<no-ge>", "", input_token_type=self.__syms.alphabet),
        pynini.concat(
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.string_map(["<n>", "<e>", "<d>", "<~n>"]).project()).closure(),
                pynini.concat(
                    pynini.transducer("<V> <nativ>", "", input_token_type=self.__syms.alphabet),
                    pynini.acceptor("<NoDef>", token_type=self.__syms.alphabet).closure(0, 1),
                    pynini.transducer("<ge>", "", input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>", token_type=self.__syms.alphabet))))).optimize()

def __construct_compound_filter(self):
    '''
    Construct the compound filter
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                               "<ge>", "<Ge>"]).project("input"),
            self.__syms.stem_types,
            pynini.cross(self.__syms.categories, ""),
            pynini.cross(self.__syms.origin_features, ""),
            pynini.cross("<NoPref>", "")
            )

        return pynini.concat(
            pynini.union(
                pynini.cross("<Initial>", ""),
                pynini.accep("<NoHy>"),
                pynini.accep("<NoDef>")
                ).closure(0, 1),
            pynini.concat(
                pynini.union(
                    pynini.concat(
                        alphabet.closure(),
                        pynini.cross(
                            pynini.string_map(["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>",
                                               "<V>", "<ORD>", "<OTHER>"]).project("input"),
                            "")
                        ),
                    pynini.concat(
                        pynini.cross("", "<VADJ>"),
                        pynini.concat(
                            pynini.union(
                                alphabet,
                                pynini.cross("<kompos>", "")
                                ).closure(),
                            pynini.concat(
                                pynini.cross("<kompos>", ""),
                                pynini.concat(
                                    alphabet.closure(),
                                    pynini.cross("<V>", "")
                                    )
                                )
                            )
                        ),
                    pynini.concat(
                        pynini.union(
                            alphabet,
                            pynini.cross("<kompos>", "")
                            ).closure(),
                        pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
                        )
                    ),
                pynini.concat(
                    pynini.cross("<base>", ""),
                    pynini.concat(
                        pynini.cross(self.__syms.origin_features, ""),
                        self.__syms.inflection_classes
                        )
                    )
                )
            ).optimize()

def __construct_tail(self):
    '''
    Define possible final sequences of a derivation
    '''
    # C1
    initial_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>",
             "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<Pref_Stems>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).closure()

    # C2
    intermediate_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>",
             "<Suff_Stems>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).closure()

    # C3
    final_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.categories,
        self.__syms.stem_type_features,
        self.__syms.origin_features,
        pynini.string_map(
            ["<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>", "<NSMasc_es_e>",
             "<NSMasc_es_$e>", "<NSMasc-s/$sse>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).closure()

    # TAIL
    return pynini.concat(
        pynini.concat(initial_stuff, self.__syms.base_stem_types, intermediate_stuff).closure(0, 1),
        final_stuff,
        self.__syms.inflection_classes.closure(0, 1)).optimize()

def __construct_r20(self):
    '''
    Up to low
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<^UC>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    #
    # SFST uses a rewrite rule here
    return pynini.push(
        pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer("<CB>", "", input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<^UC>", "<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_lower))).closure(),
        push_labels=True).optimize()

def __construct_r21(self):
    '''
    Low to up
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    self.__syms.to_upper.draw("to_upper.dot")  # debug output

    # Construction in SFST involves negation (which is expensive).
    # It looks like we can do better:
    return pynini.push(
        pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer("<^UC>", "", input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_upper))).closure(),
        push_labels=True).optimize()

def __construct_r14(self):
    '''
    e-epenthesis 2
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>", "<^pl>",
             "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    tau = pynini.transducer("<DEL-S>", "e",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet)

    return pynini.cdrewrite(
        tau,
        pynini.union(
            pynini.concat(
                pynini.string_map(
                    ["d", "t"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                pynini.acceptor("m", token_type=self.__syms.alphabet).closure(0, 1)),
            pynini.acceptor("t w", token_type=self.__syms.alphabet)),
        "",
        alphabet.closure()).optimize()

def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", respectively,
    if the umlaut marker "<UL>" is present.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                               "<ge>", "<Ge>", "<no-ge>", "<Ge>", "<Initial>", "<NoHy>",
                               "<NoPref>", "<NoDef>"]).project("input"),
            self.__syms.stem_types,
            self.__syms.categories,
            ).closure()

        return pynini.concat(
            pynini.concat(
                alphabet,
                pynini.concat(
                    self.__syms.consonants,
                    pynini.concat(
                        pynini.union(
                            pynini.union(
                                pynini.cross("a", "ä"),
                                pynini.cross("o", "ö"),
                                pynini.cross("u", "ü")),
                            pynini.concat(
                                pynini.cross("a", "ä"),
                                pynini.union(
                                    pynini.cross("a", ""),
                                    pynini.accep("u")))),
                        pynini.concat(
                            self.__syms.consonants.closure(),
                            pynini.concat(
                                pynini.concat(
                                    pynini.accep("e"),
                                    pynini.string_map(["l", "r"]).project("input")).closure(0, 1),
                                pynini.concat(
                                    pynini.accep("<Suff_Stems>"),
                                    pynini.cross("<UL>", "")))))).closure(0, 1)),
            self.__tail).optimize()

def __construct_del_ge(self):
    '''
    Case-dependent deletion of the ge marker
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # delete <ge> after certain prefixes like 'ver'
        return pynini.concat(
            pynini.cross("<no-ge>", ""),
            pynini.concat(
                pynini.accep("<Pref_Stems>"),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.string_map(["<n>", "<e>", "<d>", "<~n>"]).project("input")
                        ).closure(),
                    pynini.cross("<V> <nativ>", "")
                    + pynini.accep("<NoDef>").closure(0, 1)
                    + pynini.cross("<ge>", "")
                    + self.__prefix_filter_helper
                    + self.__syms.stem_type_features
                    + pynini.accep("<nativ>")
                    )
                )
            ).optimize()

def __construct_pref_stems(self):
    '''
    Prefix stems
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()

def __construct_base_stems(self):
    '''
    Base stems
    '''
    return pynini.compose(
        self.__bdk_stems,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()

def __construct_origin_filter(self):
    '''
    Filter-out non-matching origin feature sequences
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>",
             "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types,
        self.__syms.categories,
        self.__syms.stem_type_features).closure().optimize()

    filtering = self.__suff_stems_filter(
        ["<nativ>", "<prefnativ>", "<frei>", "<gebunden>", "<kurz>", "<lang>", "<fremd>",
         "<klassisch>", "<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>", "<NSMasc_es_e>",
         "<NSMasc_es_$e>", "<NSMasc-s/$sse>", "<NGeo-$er-NMasc_s_0>", "<NGeo-$er-Adj0-Up>",
         "<NGeo-$isch-Adj+>", "<NGeo-0-Name-Fem_0>", "<NGeo-0-Name-Masc_s>",
         "<NGeo-0-Name-Neut_s>", "<NGeo-a-Name-Fem_s>", "<NGeo-a-Name-Neut_s>",
         "<NGeo-aner-NMasc_s_0>", "<NGeo-aner-Adj0-Up>", "<NGeo-anisch-Adj+>",
         "<NGeo-e-NMasc_n_n>", "<NGeo-e-Name-Fem_0>", "<NGeo-e-Name-Neut_s>",
         "<NGeo-ei-Name-Fem_0>", "<NGeo-en-Name-Neut_s>", "<NGeo-er-NMasc_s_0>",
         "<NGeo-er-Adj0-Up>", "<NGeo-0-NMasc_s_0>", "<NGeo-0-Adj0-Up>",
         "<NGeo-erisch-Adj+>", "<NGeo-ese-NMasc_n_n>", "<NGeo-esisch-Adj+>",
         "<NGeo-ianer-NMasc_s_0>", "<NGeo-ianisch-Adj+>", "<NGeo-ien-Name-Neut_s>",
         "<NGeo-ier-NMasc_s_0>", "<NGeo-isch-Adj+>", "<NGeo-istan-Name-Neut_s>",
         "<NGeo-land-Name-Neut_s>", "<NGeo-ner-NMasc_s_0>", "<NGeo-ner-Adj0-Up>",
         "<NGeo-nisch-Adj+>"])

    return pynini.concat(
        pynini.concat(alphabet, filtering).closure(),
        self.__tail).optimize()

def join(expr: pynini.FstLike, sep: pynini.FstLike) -> pynini.Fst:
    """Creates the automaton expr (sep expr)^*.

    Args:
      expr: an acceptor or string.
      sep: a separator acceptor or string.

    Returns:
      An FST.
    """
    cdr = pynini.concat(sep, expr).closure()
    return expr + cdr

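# Usage sketch for join (illustrative only; assumes pynini's default byte
# token type): a comma-separated list of one or more "ab" fields.
import pynini

csv_field = join(pynini.accep("ab"), ",")
# The composition is non-empty iff the string is in the language of csv_field.
print(pynini.compose("ab,ab,ab", csv_field).num_states() > 0)  # True
print(pynini.compose("ab,", csv_field).num_states() > 0)       # False
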
def __construct_bdk_stems(self):
    '''
    Base, derivation and compound stems (without derivation suffixes)
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.string_map(
                ["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__sigma_star)).optimize()

def __construct_stem_type_filter(self):
    '''
    Filter-out non-matching stem type sequences
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                               "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                               "<NoPref>", "<NoDef>"]).project("input"),
            self.__syms.stem_types,
            self.__syms.categories,
            ).closure()

        filtering = self.__suff_stems_filter(["<deriv>", "<kompos>"])

        return pynini.concat(
            pynini.concat(
                alphabet,
                filtering
                ).closure(),
            self.__tail
            ).optimize()

def __construct_pref_deriv_suff_stems(self):
    '''
    Derivation suffixes which combine with prefixed stems
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<prefderiv>", "", input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()

def __init__(self, syms, sublexica, deko_filter, inflection, phon):

    #
    # store alphabet
    self.__syms = syms

    #
    # run parts of morphology building (cf. timur_fst)
    tmp = (sublexica.verbal_pref_stems + sublexica.base_stems) * sublexica.nodef_to_null * deko_filter.pref_filter
    tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

    # ANY
    # TODO: Move to symbols!
    alphabet = pynini.union(
        syms.characters,
        syms.stem_types,
        pynini.string_map(
            ["<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>", "<UL>", "<NoHy>",
             "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"],
            input_token_type=syms.alphabet,
            output_token_type=syms.alphabet).project()
        ).closure().optimize()

    tmp = (tmp + inflection.inflection) * (alphabet + inflection.inflection_filter) * deko_filter.infix_filter * deko_filter.uplow

    tmp = pynini.compose(
        pynini.concat(
            pynini.transducer("", "<WB>", output_token_type=self.__syms.alphabet),
            tmp,
            pynini.transducer("", "<WB>", output_token_type=self.__syms.alphabet),
            ),
        phon.phon).optimize()

    #
    # default stems

    # create a default composition stem for nouns
    self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

    # create a deriv stem for Ge nominalization (Gelerne)
    self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

    # create an adjective base stem from participles
    self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
    self.__participle_adj.draw("participle_adj.dot", portrait=True)  # debug output

def __construct_ge_nom_stems_v(self, tmp):
    '''
    Stems for ge nominalization of verbs ("Gejammer")
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            self.__syms.categories,
            pynini.string_map(["<CONV>", "<SUFF>"]).project("input"))

        # extract infinitives
        infinitives = pynini.compose(
            pynini.concat(
                pynini.concat(
                    self.__syms.characters.closure(1),
                    pynini.accep("<PREF>")).closure(),
                pynini.concat(
                    alphabet.closure(1),
                    pynini.cross("", "<+V> <Inf>"))),
            tmp).optimize()

        insert_ge = pynini.concat(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.accep("<PREF>")).closure(),
            pynini.concat(
                pynini.cross("g e <PREF> <Ge>", ""),
                alphabet.closure(1))).optimize()

        inserted_ge = pynini.compose(
            pynini.compose(insert_ge, infinitives).project("input"),
            pynini.union(
                self.__syms.to_lower,
                self.__syms.categories,
                self.__syms.prefix_suffix_marker,
                pynini.accep("<Ge>")).closure()).optimize()

        deriv_stem_filter_ge = pynini.compose(
            pynini.compose(
                pynini.compose(
                    pynini.union(
                        alphabet,
                        pynini.accep("<PREF>"),
                        pynini.cross("", "<Ge>")).closure(),
                    inserted_ge),
                pynini.union(
                    self.__syms.characters,
                    pynini.accep("<Ge>"),
                    pynini.cross(
                        pynini.union(
                            self.__syms.categories,
                            self.__syms.prefix_suffix_marker),
                        "")).closure()),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.accep("<Ge>"),
                    ).closure(1),
                pynini.cross("e n", ""))).optimize()

        return (pynini.cross("", "<Deriv_Stems>")
                + deriv_stem_filter_ge
                + pynini.accep("<V>")
                + pynini.cross("", "<deriv> <nativ>")).optimize()

def __suff_stems_filter(self, features):
    '''
    Return a union over filters for each feature given
    '''
    filtering = pynini.Fst()
    filtering.set_input_symbols(self.__syms.alphabet)
    filtering.set_output_symbols(self.__syms.alphabet)
    suff_stems = pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet)
    for feature in features:
        to_eps = pynini.transducer(feature, "", input_token_type=self.__syms.alphabet)
        filtering = pynini.union(
            filtering,
            pynini.concat(to_eps, suff_stems, to_eps))
    return filtering.optimize()

def __construct_quant_suff_stems(self):
    '''
    Derivation suffixes which combine with a number and a simplex stem
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            pynini.transducer("<QUANT>", "", input_token_type=self.__syms.alphabet),
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()

def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>",
             "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>", "<^UC>",
             "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    c2 = pynini.union(alphabet, self.__syms.stem_types).closure().optimize()

    # From deko.fst:
    # insert "zu" after verbal prefixes if followed by infinitive marker
    return pynini.union(
        c2,
        #pynini.concat(
        #    pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
        #    alphabet.closure(),
        #    pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
        #    alphabet.closure()
        #    ),
        pynini.concat(
            c2,
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("", "z u", output_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
            alphabet.closure())).optimize()

def __construct_r14(self):
    '''
    e-epenthesis 2
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>", "<^pl>",
                 "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"]).project("input"))

        tau = pynini.cross("<DEL-S>", "e")

        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(["d", "t"]).project("input"),
                    pynini.accep("m").closure(0, 1)),
                pynini.accep("t w")),
            "",
            alphabet.closure()).optimize()

def __construct_r20(self):
    '''
    Up to low
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<^UC>", "<NoHy>", "<NoDef>"]).project("input"))

        #
        # SFST uses a rewrite rule here
        return pynini.push(
            pynini.union(
                alphabet.closure(),
                pynini.concat(
                    pynini.cross("<CB>", "").closure(1),
                    pynini.union(
                        pynini.string_map(["<^UC>", "<NoHy>", "<NoDef>"]).project("input"),
                        self.__syms.to_lower))).closure(),
            push_labels=True).optimize()