def __construct_del_ge(self):
    '''
    Case-dependent deletion of the ge marker
    '''
    # delete <ge> at certain suffixes like 'ver'
    return pynini.concat(
        pynini.transducer("<no-ge>", "", input_token_type=self.__syms.alphabet),
        pynini.concat(
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.string_map(
                        ["<n>", "<e>", "<d>", "<~n>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project()).closure(),
                pynini.concat(
                    pynini.transducer("<V> <nativ>", "", input_token_type=self.__syms.alphabet),
                    pynini.acceptor("<NoDef>", token_type=self.__syms.alphabet).closure(0, 1),
                    pynini.transducer("<ge>", "", input_token_type=self.__syms.alphabet),
                    self.__prefix_filter_helper,
                    self.__syms.stem_type_features,
                    pynini.acceptor("<nativ>", token_type=self.__syms.alphabet))))).optimize()
def __init__(self,
             alphabet,
             insert_cost=DEFAULT_INSERT_COST,
             delete_cost=DEFAULT_DELETE_COST,
             substitute_cost=DEFAULT_SUBSTITUTE_COST):
    """Constructor.

    Args:
      alphabet: edit alphabet (an iterable of strings).
      insert_cost: the cost for the insertion operation.
      delete_cost: the cost for the deletion operation.
      substitute_cost: the cost for the substitution operation.
    """
    # Left factor; note that we divide the edit costs by two because they also
    # will be incurred when traversing the right factor.
    match = union(*alphabet).optimize(True)
    i_insert = transducer("", "[{}]".format(self.INSERT),
                          weight=insert_cost / 2).optimize(True)
    i_delete = transducer(match, "[{}]".format(self.DELETE),
                          weight=delete_cost / 2).optimize(True)
    i_substitute = transducer(match, "[{}]".format(self.SUBSTITUTE),
                              weight=substitute_cost / 2).optimize(True)
    i_ops = union(match, i_insert, i_delete, i_substitute).optimize(True)
    # Right factor; this is constructed by inverting the left factor (i.e.,
    # swapping the input and output labels), then swapping the insert and
    # delete labels on what is now the input side.
    o_ops = invert(i_ops)
    syms = o_ops.input_symbols()
    insert_label = syms.find(self.INSERT)
    delete_label = syms.find(self.DELETE)
    o_ops.relabel_pairs(ipairs=((insert_label, delete_label),
                                (delete_label, insert_label)))
    # Computes the closure for both sets of ops.
    self._e_i = i_ops.closure().optimize(True)
    self._e_o = o_ops.closure().optimize(True)
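# A sketch (not part of the original class) of how the two factors are
# typically used: compose the query with the left factor and the reference
# with the right factor; the shortest distance through the resulting lattice
# is the edit distance, since each edit pays half its cost in each factor.
# Assumes `compose` and `shortestdistance` are imported from pynini like
# `union`/`transducer`/`invert` above; the method name `distance` is ours.
def distance(self, query, reference):
    lattice = compose(compose(query, self._e_i),
                      compose(self._e_o, reference))
    # Shortest distance from the start state into the final states.
    return float(shortestdistance(lattice, reverse=True)[lattice.start()])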
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems
    '''
    return pynini.concat(
        pynini.transducer("", "<Kompos_Stems>", output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Sg>", token_type=self.__syms.alphabet))),
                    pynini.transducer(
                        "",
                        pynini.concat(
                            pynini.acceptor("<+NN>", token_type=self.__syms.alphabet),
                            self.__syms.gender,
                            pynini.acceptor("<Nom> <Pl>", token_type=self.__syms.alphabet))))),
            tmp),
        pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
        pynini.transducer("", "<kompos> <nativ>", output_token_type=self.__syms.alphabet)).optimize()
def __construct_r1(self):
    '''
    Umlaut

    Apfel$ ==> Äpfel
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
             "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # r1a: umlaut the vowel preceding the <UL> trigger
    tau = pynini.push(
        pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"), ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
        push_labels=True)
    lc = pynini.union(
        self.__syms.consonants,
        pynini.string_map(
            ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()
    r1a = pynini.cdrewrite(
        tau,
        lc,
        pynini.concat(
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure())

    # r1c: delete "a" directly after a freshly umlauted "ä"/"Ä"
    # (handles double vowels, e.g. Saal ==> Säle)
    tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
    r1c = pynini.cdrewrite(
        tau,
        pynini.string_map(
            ["ä", "Ä"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        pynini.concat(
            self.__syms.consonants_lower,
            alphabet.closure(),
            pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
        alphabet.closure()).optimize()

    # r1d: rewrite the <UL> trigger to an ordinary <FB> boundary
    r1d = pynini.cdrewrite(
        pynini.transducer("<UL>", "<FB>",
                          input_token_type=self.__syms.alphabet,
                          output_token_type=self.__syms.alphabet),
        "", "", alphabet.closure())

    return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
def __init__(self, syms, sublexica, deko_filter, inflection, phon):
    #
    # store alphabet
    self.__syms = syms

    #
    # run parts of morphology building (cf. timur_fst)
    tmp = (sublexica.verbal_pref_stems + sublexica.base_stems) * sublexica.nodef_to_null * deko_filter.pref_filter
    tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

    # ANY TODO: Move to symbols!
    alphabet = pynini.union(
        syms.characters,
        syms.stem_types,
        pynini.string_map(
            ["<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>", "<UL>",
             "<NoHy>", "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"],
            input_token_type=syms.alphabet,
            output_token_type=syms.alphabet).project()).closure().optimize()

    tmp = (tmp + inflection.inflection) * (alphabet + inflection.inflection_filter) * deko_filter.infix_filter * deko_filter.uplow

    tmp = pynini.compose(
        pynini.concat(
            pynini.transducer("", "<WB>", output_token_type=self.__syms.alphabet),
            tmp,
            pynini.transducer("", "<WB>", output_token_type=self.__syms.alphabet)),
        phon.phon).optimize()

    #
    # default stems

    # create a default composition stem for nouns
    self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

    # create a deriv stem for Ge nominalization (Gelerne)
    self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

    # create an adjective base stem from participles
    self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
    self.__participle_adj.draw("participle_adj.dot", portrait=True)
def transducer(cls, fsm1, fsm2):
    if not isinstance(fsm1, cls):
        fsm1 = PyniniWrapper.fromItem(fsm1)
    if not isinstance(fsm2, PyniniWrapper):
        fsm2 = PyniniWrapper.fromItem(fsm2)
    fsm = pynini.transducer(fsm1.fsm, fsm2.fsm)
    return cls(fsm)
def __construct_r21(self):
    '''
    Low to up
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    self.__syms.to_upper.draw("to_upper.dot")

    # Construction in SFST involves negation (which is expensive).
    # It looks like we can do better:
    return pynini.push(
        pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer("<^UC>", "", input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_upper))).closure(),
        push_labels=True).optimize()
def __construct_r20(self):
    '''
    Up to low
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<^UC>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # SFST uses a rewrite rule here
    return pynini.push(
        pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer("<CB>", "", input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<^UC>", "<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_lower))).closure(),
        push_labels=True).optimize()
def __construct_r14(self):
    '''
    e-epenthesis 2
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # realize <DEL-S> as "e" after d/t (optionally + m) or after "tw"
    tau = pynini.transducer("<DEL-S>", "e",
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet)
    return pynini.cdrewrite(
        tau,
        pynini.union(
            pynini.concat(
                pynini.string_map(
                    ["d", "t"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                pynini.acceptor("m", token_type=self.__syms.alphabet).closure(0, 1)),
            pynini.acceptor("t w", token_type=self.__syms.alphabet)),
        "",
        alphabet.closure()).optimize()
def load_lexicon(source, symbol_table):
    '''
    Load lexicon entries from source, interpreting them using the given
    symbol table.
    '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)
    # longest match, prefer complex over simple symbols
    tokenizer = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
    for line in source:
        line = line.strip()
        if line:
            tmp = pynini.Fst()
            tmp.set_input_symbols(symbol_table)
            tmp.set_output_symbols(symbol_table)
            start = tmp.add_state()
            tmp.set_start(start)
            tmp.set_final(start)
            for token in tokenizer.findall(line):
                if token[1]:
                    # input:output pair
                    tmp = pynini.concat(
                        tmp,
                        pynini.transducer(token[0], token[1],
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table))
                else:
                    # plain symbol
                    tmp = pynini.concat(
                        tmp,
                        pynini.acceptor(token[0], token_type=symbol_table))
            lex = pynini.union(lex, tmp)
    return lex
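# Hedged usage sketch (the file names are hypothetical): load_lexicon only
# assumes an iterable of lines and a pynini SymbolTable.
syms = pynini.SymbolTable.read_text("symbols.txt")  # assumed symbol table file
with open("lexicon.txt", encoding="utf-8") as source:
    lex = load_lexicon(source, syms).optimize()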
def __construct_suff_phon(self):
    '''
    Delete suffix-initial "i" after stems ending in "i" or in a
    consonant + "y"
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>",
             "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>",
             "<NN>", "<ADJ>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project(),
        self.__syms.stem_types).closure()

    Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
    Lambda = pynini.concat(
        pynini.union(
            pynini.acceptor("i", token_type=self.__syms.alphabet),
            pynini.concat(
                self.__syms.consonants.project(),
                pynini.acceptor("y", token_type=self.__syms.alphabet))),
        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

    return pynini.concat(
        pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
        self.__tail).optimize()
def main(args: argparse.Namespace) -> None:
    # Sets of labels for the covering grammar.
    g_labels: Set[int] = set()
    p_labels: Set[int] = set()
    # Curries compiler and compactor functions for the FARs.
    compiler = functools.partial(pynini.acceptor,
                                 token_type=args.token_type,
                                 attach_symbols=False)
    compactor = functools.partial(pywrapfst.convert, fst_type="compact_string")
    logging.info("Constructing grapheme and phoneme FARs")
    g_writer = pywrapfst.FarWriter.create(args.g_far_path)
    p_writer = pywrapfst.FarWriter.create(args.p_far_path)
    with open(args.input_path, "r") as source:
        for (linenum, line) in enumerate(source, 1):
            key = f"{linenum:08x}"
            (g, p) = line.rstrip().split("\t", 1)
            # For both G and P, we compile a FSA, store the labels, and then
            # write the compact version to the FAR.
            g_fst = compiler(g)
            g_labels.update(g_fst.paths().ilabels())
            g_writer[key] = compactor(g_fst)
            p_fst = compiler(p)
            p_labels.update(p_fst.paths().ilabels())
            p_writer[key] = compactor(p_fst)
    logging.info("Processed %d examples", linenum)
    logging.info("Constructing covering grammar")
    logging.info("%d unique graphemes", len(g_labels))
    g_side = _label_union(g_labels, args.input_epsilon)
    logging.info("%d unique phonemes", len(p_labels))
    p_side = _label_union(p_labels, args.output_epsilon)
    # The covering grammar is given by (G x P)^*, a zeroth-order Markov model.
    covering = pynini.transducer(g_side, p_side).closure().optimize()
    assert covering.num_states() == 1, "Covering grammar FST is ill-formed"
    logging.info("Covering grammar has %d arcs", _narcs(covering))
    covering.write(args.covering_path)
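# `_label_union` and `_narcs` are helpers not shown in this fragment.
# Plausible sketches of both, under the assumption that the labels are FST
# arc labels and that epsilon is label 0 (included when requested so the
# covering grammar can also insert or delete symbols):
def _label_union(labels: Set[int], epsilon: bool) -> pynini.Fst:
    """Builds a two-state FSA accepting exactly one of the given labels."""
    side = pynini.Fst()
    src = side.add_state()
    side.set_start(src)
    dst = side.add_state()
    side.set_final(dst)
    if epsilon:
        labels = labels | {0}
    one = pynini.Weight.One(side.weight_type())
    for label in labels:
        side.add_arc(src, pynini.Arc(label, label, one, dst))
    return side

def _narcs(fst: pynini.Fst) -> int:
    """Total number of arcs in an FST."""
    return sum(fst.num_arcs(state) for state in fst.states())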
def __construct_rep_pref(self):
    '''
    Replace the marker of manually prefixed stems
    '''
    return pynini.cdrewrite(
        pynini.transducer("<prefnativ>", "<nativ>",
                          input_token_type=self.__syms.alphabet,
                          output_token_type=self.__syms.alphabet),
        "", "",
        self.__prefix_filter_helper).optimize()
def __construct_r13(self):
    '''
    e-epenthesis 1
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>", "<^Ax>",
             "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    return pynini.union(
        alphabet,
        pynini.transducer(
            pynini.string_map(
                ["<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                 "<^Ax>", "<WB>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            "")).closure().optimize()
def __split_disjunctive_feats(self, disjunctive_feat_list):
    single_splits = []
    for disjunctive_feat in disjunctive_feat_list:
        splitted = []
        # strip the angle brackets, split at the commas, and rewrap each part
        for cat in disjunctive_feat[1:-1].split(","):
            splitted.append("<" + cat + ">")
        single_splits.append(
            pynini.transducer(
                disjunctive_feat,
                pynini.string_map(splitted,
                                  input_token_type=self.__syms.alphabet,
                                  output_token_type=self.__syms.alphabet),
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet))
    return pynini.union(*single_splits).optimize()
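# Illustration (the feature name is hypothetical): a disjunctive feature
# like "<Dat,Akk>" is rewritten to the union of its parts, i.e. the result
# is a transducer mapping "<Dat,Akk>" to either "<Dat>" or "<Akk>":
#
#     splitter = self.__split_disjunctive_feats(["<Dat,Akk>"])  # inside the class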
def __construct_quant_suff_stems(self):
    '''
    Derivation suffixes which combine with a number and a simplex stem
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            pynini.transducer("<QUANT>", "", input_token_type=self.__syms.alphabet),
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
             "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>",
             "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    c2 = pynini.union(alphabet, self.__syms.stem_types).closure().optimize()

    # From deko.fst:
    # insert "zu" after verbal prefixes if followed by infinitive marker
    return pynini.union(
        c2,
        # pynini.concat(
        #     pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
        #     alphabet.closure(),
        #     pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
        #     alphabet.closure()
        #     ),
        pynini.concat(
            c2,
            pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("", "z u", output_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
            alphabet.closure())).optimize()
def __init__(self):
    super().__init__()
    attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
    attr_map_1 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
    attr_map_2 = pynini.transducer(self.cable_digits, 'Диаметр')
    attr_map_3 = pynini.transducer(self.cable_digits, '#соединительных_проводов')
    attr_map_4 = pynini.transducer(' . ', ' Диаметр ')

    attr_map_0_lc = self.ngram_comb
    attr_map_0_rc = self.cable_splitters
    attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()

    attr_map_3_lc = self.cable_splitters
    attr_map_3_rc = self.cable_splitters
    attr_map_3_s = pynini.cdrewrite(attr_map_3, attr_map_3_lc, attr_map_3_rc, self.alphabet).optimize()

    attr_map_2_lc = self.cable_floats
    attr_map_2_rc = self.cable_length_0
    attr_map_2_s = pynini.cdrewrite(attr_map_2, attr_map_2_lc, attr_map_2_rc, self.alphabet).optimize()

    attr_map_4_lc = self.cable_splitters
    attr_map_4_rc = self.cable_floats
    attr_map_4_s = pynini.cdrewrite(attr_map_2, attr_map_4_lc, attr_map_4_rc, self.alphabet).optimize()

    attr_map_5_lc = self.cable_digits
    attr_map_5_rc = self.cable_digits
    attr_map_5_s = pynini.cdrewrite(attr_map_4, attr_map_5_lc, attr_map_5_rc, self.alphabet).optimize()

    attr_map_6_rc = self.cable_length_0
    attr_map_6_lc = self.cable_length_1
    attr_map_6_s = pynini.cdrewrite(attr_map_1, attr_map_6_rc, attr_map_6_lc, self.alphabet).optimize()

    attr_map_comp_0 = pynini.compose(
        pynini.compose(attr_map_2_s, attr_map_4_s).optimize(),
        attr_map_5_s).optimize()

    self.rules = {
        'жилы': attr_map_0_s,
        'соединительные_провода': attr_map_3_s,
        'диаметр': attr_map_comp_0,
        'длина_кабеля': attr_map_6_s
    }
def __init__(self):
    super().__init__()
    attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
    attr_map_1 = pynini.transducer(self.cable_digits, 'Сечение_кабеля')
    attr_map_2 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
    attr_map_3 = pynini.transducer('.', 'Сечение_кабеля')

    attr_map_0_rc = self.cable_splitters
    attr_map_0_lc = pynini.union(" ")
    attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()

    attr_map_1_rc = self.cable_splitters
    attr_map_1_lc = self.cable_floats
    attr_map_1_s = pynini.cdrewrite(attr_map_1, attr_map_1_rc, attr_map_1_lc, self.alphabet).optimize()

    attr_map_2_rc = self.cable_floats
    attr_map_2_lc = self.cable_length_0
    attr_map_2_s = pynini.cdrewrite(attr_map_1, attr_map_2_rc, attr_map_2_lc, self.alphabet).optimize()

    attr_map_3_rc = self.cable_length_0
    attr_map_3_lc = self.cable_length_1
    attr_map_3_s = pynini.cdrewrite(attr_map_2, attr_map_3_rc, attr_map_3_lc, self.alphabet).optimize()

    attr_map_4_rc = self.cable_digits
    attr_map_4_lc = self.cable_digits
    attr_map_4_s = pynini.cdrewrite(attr_map_3, attr_map_4_rc, attr_map_4_lc, self.alphabet).optimize()

    attr_map_5_rc = self.cable_splitters
    attr_map_5_lc = self.cable_length_0
    attr_map_5_s = pynini.cdrewrite(attr_map_1, attr_map_5_rc, attr_map_5_lc, self.alphabet).optimize()

    attr_map_comp_0 = pynini.compose(
        pynini.compose(attr_map_1_s, attr_map_2_s).optimize(),
        attr_map_4_s).optimize()

    self.rules = {
        'жилы': attr_map_0_s,
        'сечение_кабеля_0': attr_map_comp_0,
        'длина_кабеля': attr_map_3_s,
        'сечение_кабеля_1': attr_map_5_s
    }
def __construct_pref_deriv_suff_stems(self):
    '''
    Derivation suffixes which combine with prefixed stems
    '''
    return pynini.compose(
        self.__lex,
        pynini.concat(
            self.__syms.initial_features.closure(),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
            pynini.transducer("<prefderiv>", "", input_token_type=self.__syms.alphabet),
            self.__sigma_star)).optimize()
def __construct_imperative_filter(self):
    '''
    Imperatives have no separable prefixes
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
             "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>",
             "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).optimize()

    c2 = pynini.union(
        alphabet,
        pynini.transducer(
            self.__syms.stem_types, "<CB>",
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet)).closure().optimize()

    return pynini.union(
        c2,
        pynini.concat(
            pynini.transducer("<Base_Stems>", "<CB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet),
            alphabet.closure(),
            pynini.transducer("<^imp>", "", input_token_type=self.__syms.alphabet),
            alphabet.closure())).optimize()
def _lexicon_covering(
    self,
    tsv_path: str,
    input_token_type: TokenType,
    input_epsilon: bool,
    output_token_type: TokenType,
    output_epsilon: bool,
) -> None:
    """Builds covering grammar and lexicon FARs."""
    # Sets of labels for the covering grammar.
    g_labels: Set[int] = set()
    p_labels: Set[int] = set()
    # Curries compiler functions for the FARs.
    icompiler = functools.partial(pynini.acceptor, token_type=input_token_type)
    ocompiler = functools.partial(pynini.acceptor, token_type=output_token_type)
    logging.info("Constructing grapheme and phoneme FARs")
    g_writer = pywrapfst.FarWriter.create(self.g_path)
    p_writer = pywrapfst.FarWriter.create(self.p_path)
    with open(tsv_path, "r") as source:
        for (linenum, line) in enumerate(source, 1):
            key = f"{linenum:08x}"
            (g, p) = line.rstrip().split("\t", 1)
            # For both G and P, we compile a FSA, store the labels, and
            # then write the compact version to the FAR.
            g_fst = icompiler(g)
            g_labels.update(g_fst.paths().ilabels())
            g_writer[key] = self._compactor(g_fst)
            p_fst = ocompiler(p)
            p_labels.update(p_fst.paths().ilabels())
            p_writer[key] = self._compactor(p_fst)
    logging.info("Processed %s examples", f"{linenum:,d}")
    logging.info("Constructing covering grammar")
    logging.info("%d unique graphemes", len(g_labels))
    g_side = self._label_union(g_labels, input_epsilon)
    logging.info("%d unique phones", len(p_labels))
    p_side = self._label_union(p_labels, output_epsilon)
    # The covering grammar is given by (G x P)^*.
    covering = pynini.transducer(g_side, p_side).closure().optimize()
    assert covering.num_states() == 1, "Covering grammar FST is ill-formed"
    logging.info(
        "Covering grammar has %s arcs",
        f"{PairNGramAligner._narcs(covering):,d}",
    )
    covering.write(self.c_path)
def __suff_stems_filter(self, features):
    '''
    Return a union over filters for each feature given
    '''
    filtering = pynini.Fst()
    filtering.set_input_symbols(self.__syms.alphabet)
    filtering.set_output_symbols(self.__syms.alphabet)
    suff_stems = pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet)
    for feature in features:
        # delete the feature on both sides of the <Suff_Stems> marker
        to_eps = pynini.transducer(feature, "", input_token_type=self.__syms.alphabet)
        filtering = pynini.union(filtering, pynini.concat(to_eps, suff_stems, to_eps))
    return filtering.optimize()
def __init__(self, syms, lexicon):
    #
    # store alphabet
    self.__syms = syms

    #
    # store lexicon
    self.__lex = lexicon

    #
    # (private) helpers
    self.__sigma_star = pynini.union(
        syms.characters,
        syms.categories,
        syms.stem_types,
        syms.stem_type_features,
        syms.origin_features,
        syms.circumfix_features,
        syms.inflection_classes,
        syms.geo_inflection_classes,
        # for word-internal <ge> (ausgewertet)
        pynini.acceptor("<ge>", token_type=syms.alphabet)).closure().optimize()

    #
    # NoDef2NULL
    self.__nodef_to_null = pynini.union(
        self.__sigma_star,
        syms.origin_features,
        pynini.transducer("<NoDef>", "", input_token_type=self.__syms.alphabet),
        syms.stem_types).closure().optimize()

    #
    # sublexica
    self.__bdk_stems = self.__construct_bdk_stems()
    self.__base_stems = self.__construct_base_stems()
    self.__pref_stems = self.__construct_pref_stems()
    self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
    self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
    self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
    self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
    self.__quant_suff_stems = self.__construct_quant_suff_stems()
def __construct_ge_nom_stems_v(self, tmp):
    '''
    Stems for ge nominalization of verbs ("Gejammer")
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        self.__syms.categories,
        pynini.string_map(
            ["<CONV>", "<SUFF>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project())

    # extract infinitives
    infinitives = pynini.compose(
        pynini.concat(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.acceptor("<PREF>", token_type=self.__syms.alphabet)).closure(),
            alphabet.closure(1),
            pynini.transducer("", "<+V> <Inf>", output_token_type=self.__syms.alphabet)),
        tmp).optimize()

    insert_ge = pynini.concat(
        pynini.concat(
            self.__syms.characters.closure(1),
            pynini.acceptor("<PREF>", token_type=self.__syms.alphabet)).closure(),
        pynini.transducer("g e <PREF> <Ge>", "", input_token_type=self.__syms.alphabet),
        alphabet.closure(1)).optimize()

    inserted_ge = pynini.compose(
        pynini.compose(insert_ge, infinitives).project(),
        pynini.union(
            self.__syms.to_lower,
            self.__syms.categories,
            self.__syms.prefix_suffix_marker,
            pynini.acceptor("<Ge>", token_type=self.__syms.alphabet)).closure()).optimize()

    return pynini.concat(
        pynini.transducer("", "<Deriv_Stems>", output_token_type=self.__syms.alphabet),
        pynini.compose(
            pynini.compose(
                pynini.compose(
                    pynini.union(
                        alphabet,
                        pynini.acceptor("<PREF>", token_type=self.__syms.alphabet),
                        pynini.transducer("", "<Ge>", output_token_type=self.__syms.alphabet)).closure(),
                    inserted_ge),
                pynini.union(
                    self.__syms.characters,
                    pynini.acceptor("<Ge>", token_type=self.__syms.alphabet),
                    pynini.transducer(
                        pynini.union(self.__syms.categories,
                                     self.__syms.prefix_suffix_marker),
                        "")).closure()),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.acceptor("<Ge>", token_type=self.__syms.alphabet)).closure(1),
                pynini.transducer("e n", "", input_token_type=self.__syms.alphabet))),
        pynini.acceptor("<V>", token_type=self.__syms.alphabet),
        pynini.transducer("", "<deriv> <nativ>", output_token_type=self.__syms.alphabet)).optimize()
import pynini

# Sigma*: all byte values, with the FST metacharacters [, ], \ escaped.
chars = ([chr(i) for i in range(1, 91)] +
         ["\\[", "\\]", "\\\\"] +
         [chr(i) for i in range(94, 256)])
sigma_star = pynini.union(*chars).closure()
sigma_star.optimize()

input_string = "Do you have Camembert or Edam?"
# Expected result:
# Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?
cheeses = ("Boursin", "Camembert", "Cheddar", "Edam", "Gruyere",
           "Ilchester", "Jarlsberg", "Red Leicester", "Stilton")
output_string = "Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?"

fst_target = pynini.string_map(cheeses)
ltag = pynini.transducer("", "<cheese>")
rtag = pynini.transducer("", "</cheese>")
substitution = ltag + fst_target + rtag
rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)
output = pynini.compose(input_string, rewrite).stringify()

#######################################################################

singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis a higher
    # priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),
    # Any sequence of bytes ending in "s" strips the "s".
    sigma_star + pynini.transducer("s", ""))
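# Hedged usage sketch: apply the map and keep the lowest-weight analysis
# (the helper name `singularize` is ours, not from the snippet above).
def singularize(noun):
    return pynini.shortestpath(pynini.compose(noun, singular_map)).stringify()

singularize("watches")  # -> "watch", via the weighted "ches" -> "ch" rule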
# measure
back_vowel = pynini.union("u", "o", "a")
neutral_vowel = pynini.union("i", "e")
front_vowel = pynini.union("y", "ö", "ä")
vowel = pynini.union(back_vowel, neutral_vowel, front_vowel)
archiphoneme = pynini.union("A", "I", "E", "O", "U")
consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m",
                         "n", "p", "q", "r", "s", "t", "v", "w", "x", "z")
sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize()

adessive = "llA"
intervener = pynini.union(consonant, neutral_vowel).closure()
# Realize the archiphoneme A as back "a" after a back vowel (with only
# consonants and neutral vowels intervening), and as front "ä" otherwise.
adessive_harmony = (
    pynini.cdrewrite(pynini.transducer("A", "a"),
                     back_vowel + intervener, "", sigma_star) *
    pynini.cdrewrite(pynini.transducer("A", "ä"), "", "", sigma_star)).optimize()

def make_adessive(stem):
    return ((stem + adessive) * adessive_harmony).stringify()

make_adessive("training")
("ä", "A"), ("ö", "O"), ("š", "S") ]) | pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z", "u", "o", "a", "y", "i", "e", "-")).closure().optimize() rvregularize = (pynini.string_map([ ("A", "ä"), ("O", "ö"), ("S", "š") ]) | pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z", "u", "o", "a", "y", "i", "e", "-")).closure().optimize() ######################FST for harmony in suffix#################################################### regular_state = closure_regular.optimize() harmony_state = closure_harmony.optimize() adessive_regular_transduce = pynini.transducer( "", adessive_regular) #, output_token_type="utf8") adessive_harmony_transduce = pynini.transducer( "", adessive_harmony) #, output_token_type="utf8") inessive_regular_transduce = pynini.transducer( "", inessive_regular) #, output_token_type="utf8") inessive_harmony_transduce = pynini.transducer( "", inessive_harmony) #, output_token_type="utf8") transducer_adessive_harmony = harmony_state + adessive_harmony_transduce transducer_adessive_regular = regular_state + adessive_regular_transduce transducer_inessive_harmony = harmony_state + inessive_harmony_transduce transducer_inessive_regular = regular_state + inessive_regular_transduce transducer_adessive_base = transducer_adessive_regular | transducer_adessive_harmony transducer_inessive_base = transducer_inessive_regular | transducer_inessive_harmony
def __construct_participle_adj(self, tmp, sublexica):
    '''
    Stems for conversion of participles into adjectives
    '''
    alphabet = pynini.union(
        self.__syms.characters,
        pynini.string_map(
            ["<VPART>", "<VPREF>", "<PREF>", "<CONV>", "<SUFF>", "<NN>",
             "<ADJ>", "<V>", "<FT>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()).closure().optimize()

    return pynini.concat(
        pynini.transducer("", "<Base_Stems>", output_token_type=self.__syms.alphabet),
        pynini.union(
            # past participles ending in "t" ==> <Adj+e>
            pynini.concat(
                pynini.compose(
                    pynini.concat(
                        alphabet,
                        pynini.transducer("<V>", "<+V>",
                                          input_token_type=self.__syms.alphabet,
                                          output_token_type=self.__syms.alphabet),
                        pynini.acceptor("<zu>", token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.acceptor("<PPast>", token_type=self.__syms.alphabet)),
                    pynini.compose(
                        tmp,
                        pynini.concat(
                            sublexica.nodef_to_null,
                            pynini.acceptor("t", token_type=self.__syms.alphabet)))),
                pynini.transducer("", "<ADJ>", output_token_type=self.__syms.alphabet),
                pynini.transducer("<CONV>", "", input_token_type=self.__syms.alphabet),
                pynini.transducer("", "<base> <nativ> <Adj+e>",
                                  output_token_type=self.__syms.alphabet)),
            # participles ending in "en" or "nd" ==> <Adj+>
            pynini.concat(
                pynini.compose(
                    pynini.concat(
                        alphabet,
                        pynini.transducer("<V>", "<+V>",
                                          input_token_type=self.__syms.alphabet,
                                          output_token_type=self.__syms.alphabet),
                        pynini.acceptor("<zu>", token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.string_map(
                            ["<PPast>", "<PPres>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project()),
                    pynini.compose(
                        tmp,
                        pynini.concat(
                            sublexica.nodef_to_null,
                            pynini.acceptor("e n", token_type=self.__syms.alphabet) |
                            pynini.acceptor("n d", token_type=self.__syms.alphabet)))),
                pynini.transducer("", "<ADJ>", output_token_type=self.__syms.alphabet),
                pynini.transducer("<CONV>", "", input_token_type=self.__syms.alphabet),
                pynini.transducer("", "<base> <nativ> <Adj+>",
                                  output_token_type=self.__syms.alphabet)))).optimize()
#!/usr/bin/python
### Improved T9 decoder that is biased towards word strings due to intersection with a word transducer ###
### To execute, run the executable with either arguments (str) or (str, str).
### Feeding only a single argument runs the decoder, treating the string as T9-encoded.
### Feeding two strings, with the second str = "e", runs the T9 encoder on the first string and returns the encoded string.

import pynini
import string
import sys

## Vocabulary
lm_char = pynini.Fst.read("t9.char.lm.4")
lm_word = pynini.Fst.read("t9.word.lm")

t9 = pynini.transducer("0", "[32]")
t9_relations = [
    "0", "1", "2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"
]

## Reading vocabulary into alphabet.
for i in range(10):
    for k in t9_relations[i]:
        t9 = pynini.union(pynini.transducer(str(i), str(k)), t9)

## Adding punctuation to vocabulary
for i in string.punctuation:
    t9 = t9 | pynini.transducer("1", "[" + str(ord(i)) + "]")

## Closure and optimization
t9.closure().optimize()

## Inversion for decoding
encoder = pynini.invert(t9).optimize()
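## The decoding step itself is not part of this fragment. A plausible sketch
## under the file's own names (the function `decode` and the rescoring chain
## are assumptions, not the original code): expand the digit string into its
## letter candidates with t9, rescore with the word LM, and keep the best path.
def decode(t9_digits):
    lattice = pynini.compose(t9_digits, t9)
    return pynini.shortestpath(pynini.compose(lattice, lm_word)).stringify()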