def __init__(self, changes_path, inventory):
    """Loads sound changes and precomputes the sigma-star acceptor.

    Args:
      changes_path: Path to the sound-change specification consumed by
        `load_sound_changes`.
      inventory: Symbol inventory; its `syms` collection defines the alphabet.
    """
    self.inventory = inventory
    self.insertion_count = 0
    # NOTE(review): `pynini.default_token_type` is a context manager; invoking
    # it without `with` may not actually change the active token type — confirm
    # against the pynini version in use.
    pynini.default_token_type("utf-8")
    # Closure over all inventory symbols plus the boundary marker "#".
    self.sigma_star = pynini.closure(
        pynini.union(*self.inventory.syms.union("#"))).optimize()
    self.formula = self.load_sound_changes(changes_path)
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FSTs for reading normalization of Brahmic scripts."""
  for token_type in ('byte', 'utf8'):
    norm_fsts = {}
    with pynini.default_token_type(token_type):
      # Script-level normalizers; every script named by the language map is
      # included even when it has no standalone entry.
      sigmas = {}
      all_scripts = set(u.READING_NORM_SCRIPTS) | set(u.READING_NORM_LANG_SCRIPT_MAP)
      for script in all_scripts:
        sigmas[script] = u.OpenSigma(script, token_type)
        # Script rules live directly under SCRIPT_DIR.
        norm_fsts[script] = _reading_norm_fst(u.SCRIPT_DIR, script, sigmas[script])
      # Language-level normalizers compose the script normalizer with
      # language-specific rules found under SCRIPT_DIR/<script>.
      for script, langs in u.READING_NORM_LANG_SCRIPT_MAP.items():
        for lang in langs:
          lang_fst = _reading_norm_fst(u.SCRIPT_DIR / script, lang, sigmas[script])
          norm_fsts[lang] = rewrite.ComposeFsts([norm_fsts[script], lang_fst])
      exporter = exporter_map[token_type]
      for name, fst in norm_fsts.items():
        exporter[name.upper()] = fst
def __construct_suff_phon(self):
    '''
    Suffix phonology: a context-dependent rewrite that deletes an "i" which
    immediately follows a stem-final "i" (or a consonant plus "y") across the
    "<Suff_Stems>" boundary, then continues with the derivation tail.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Symbols allowed around the rewrite site.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>",
                               "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>",
                               "<NoHy>", "<NoPref>", "<NoDef>", "<NN>",
                               "<ADJ>"]).project("input"),
            self.__syms.stem_types,
        ).closure()
        # Tau: the rewrite itself — delete "i".
        Tau = pynini.cross("i", "")
        # Lambda: left context — "i", or consonant + "y", directly before the
        # suffix-stem marker.
        Lambda = pynini.concat(
            pynini.union(
                pynini.accep("i"),
                pynini.concat(
                    self.__syms.consonants.project("input"),
                    pynini.accep("y")
                )
            ),
            pynini.accep("<Suff_Stems>")
        )
        # Apply the rewrite everywhere (empty right context), then the tail.
        return pynini.concat(
            pynini.cdrewrite(
                Tau,
                Lambda,
                "",
                alphabet.project("input")
            ),
            self.__tail
        ).optimize()
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """FSTs for ISO conversion of fixed rule romanization of Brahmic."""
  for token_type in ('byte', 'utf8'):
    with pynini.default_token_type(token_type):
      exporter = exporter_map[token_type]
      # One converter per fixed-rule script, keyed by its upper-cased name.
      for script in u.FIXED_RULE_SCRIPTS:
        exporter[script.upper()] = _fixed_rule_fst(script)
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for multilingual phonological operations."""
  # Export-name → FST table; the constants are module-level, already-built
  # FSTs, so the table can be assembled once outside the token-type loop.
  fsts = {
      'VOICING': INTERSONORANT_VOICING,
      'ANUSVARA_ASSIMILATION_LABIAL': ANUSVARA_ASSIMILATION_LABIAL,
      'ANUSVARA_ASSIMILATION_DENTAL': ANUSVARA_ASSIMILATION_DENTAL,
      'ANUSVARA_ASSIMILATION_ALVEOLAR': ANUSVARA_ASSIMILATION_ALVEOLAR,
      'ANUSVARA_ASSIMILATION_PALATAL': ANUSVARA_ASSIMILATION_PALATAL,
      'ANUSVARA_ASSIMILATION_RETROFLEX': ANUSVARA_ASSIMILATION_RETROFLEX,
      'ANUSVARA_ASSIMILATION_VELAR': ANUSVARA_ASSIMILATION_VELAR,
      'ANUSVARA_ASSIMILATION': ANUSVARA_ASSIMILATION,
      'DEFAULT_ANUSVARA_DENTAL': DEFAULT_ANUSVARA_DENTAL,
      'DEFAULT_ANUSVARA_LABIAL': DEFAULT_ANUSVARA_LABIAL,
      'FINAL_ANUSVARA_NASALIZATION': FINAL_ANUSVARA_NASALIZATION,
      'JNY_TO_GY': JNY_TO_GY,
      'JNY_TO_NY': JNY_TO_NY,
  }
  for token_type in ('byte', 'utf8'):
    with p.default_token_type(token_type):
      exporter = exporter_map[token_type]
      for name, fst in fsts.items():
        exporter[name] = fst
def __construct_compound_filter(self):
    '''
    Construct the compound filter: passes through characters and stem-type
    markers while deleting category/origin features, and constrains which
    category sequences may end a compound.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Pass-through symbols; category, origin and "<NoPref>" markers are
        # deleted on the output side.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>",
                               "<FB>", "<ge>", "<Ge>"]).project("input"),
            self.__syms.stem_types,
            pynini.cross(self.__syms.categories, ""),
            pynini.cross(self.__syms.origin_features, ""),
            pynini.cross("<NoPref>", "")
        )
        return pynini.concat(
            # Optional initial markers: "<Initial>" is deleted, "<NoHy>" and
            # "<NoDef>" are kept.
            pynini.union(
                pynini.cross("<Initial>", ""),
                pynini.accep("<NoHy>"),
                pynini.accep("<NoDef>")
            ).closure(0,1),
            pynini.concat(
                pynini.union(
                    # Alternative 1: material ending in one of the listed
                    # category markers, which is deleted.
                    pynini.concat(
                        alphabet.closure(),
                        pynini.cross(pynini.string_map(["<ABK>", "<ADV>", "<CARD>",
                                                        "<NE>", "<PRO>", "<V>",
                                                        "<ORD>", "<OTHER>"]).project("input"), "")
                    ),
                    # Alternative 2: inserts "<VADJ>" and requires two
                    # "<kompos>" markers (deleted) ending in a deleted "<V>".
                    pynini.concat(
                        pynini.cross("", "<VADJ>"),
                        pynini.concat(
                            pynini.union(
                                alphabet,
                                pynini.cross("<kompos>", "")
                            ).closure(),
                            pynini.concat(
                                pynini.cross("<kompos>", ""),
                                pynini.concat(
                                    alphabet.closure(),
                                    pynini.cross("<V>", "")
                                )
                            )
                        )
                    ),
                    # Alternative 3: material (with deleted "<kompos>"
                    # markers) ending in a deleted "<ADJ>" or "<NN>".
                    pynini.concat(
                        pynini.union(
                            alphabet,
                            pynini.cross("<kompos>", "")
                        ).closure(),
                        pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
                    )
                ),
                # Tail: deleted "<base>" and origin features, then the
                # inflection classes.
                pynini.concat(
                    pynini.cross("<base>", ""),
                    pynini.concat(
                        pynini.cross(self.__syms.origin_features, ""),
                        self.__syms.inflection_classes
                    )
                )
            )
        ).optimize()
def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Characters plus the feature markers that may appear in the input.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>",
                               "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>",
                               "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>",
                               "<^UC>", "<^Ax>", "<^pl>", "<^Gen>",
                               "<^Del>"]).project("input")
        ).optimize()
        # c2: arbitrary material, including stem-type markers.
        c2 = pynini.union(
            alphabet,
            self.__syms.stem_types
        ).closure().optimize()
        # From deko.fst:
        # insert "zu" after verbal prefixes if followed by infinitive marker
        return pynini.union(
            # Identity branch: strings without the "<^zz>" marker pass through.
            c2,
            #pynini.concat(
            #  pynini.accep("<Base_Stems>"),
            #  alphabet.closure(),
            #  pynini.cross("<^zz>", ""),
            #  alphabet.closure()
            #  ),
            # Insertion branch: prefix stem + base stem, "z u" inserted after
            # "<Base_Stems>", and the "<^zz>" marker consumed.
            c2
            + pynini.accep("<Pref_Stems>")
            + alphabet.closure()
            + pynini.accep("<Base_Stems>")
            + pynini.cross("", "z u")
            + alphabet.closure()
            + pynini.cross("<^zz>", "")
            + alphabet.closure()
        ).optimize()
def rewrite_lattice(
    string: pynini.FstLike,
    rule: pynini.Fst,
    token_type: Optional[pynini.TokenType] = None) -> pynini.Fst:
  """Constructs a weighted lattice of output strings.

  Constructs a weighted, epsilon-free lattice of output strings given an
  input FST (or string) and a rule FST.

  Args:
    string: Input string or FST.
    rule: Input rule WFST.
    token_type: Optional input token type, or symbol table.

  Returns:
    An epsilon-free WFSA.

  Raises:
    Error: Composition failure.
  """
  # TODO(kbg): Consider adding support for PDT and MPDT composition.
  # Local import keeps the file's visible import block untouched.
  import contextlib
  # A null context stands in when no token type is given, so the composition
  # call is written exactly once (resolves the former nullcontext TODO).
  if token_type is None:
    ctx = contextlib.nullcontext()
  else:
    ctx = pynini.default_token_type(token_type)
  with ctx:
    lattice = pynini.compose(string, rule, compose_filter="alt_sequence")
  if lattice.start() == pynini.NO_STATE_ID:
    raise Error("Composition failure")
  return lattice.project("output").rmepsilon()
def matches(self,
            istring: pynini.FstLike,
            ostring: pynini.FstLike,
            input_token_type: Optional[pynini.TokenType] = None,
            output_token_type: Optional[pynini.TokenType] = None) -> bool:
  """Returns whether or not the rule cascade allows an input/output pair.

  Args:
    istring: Input string or FST.
    ostring: Output string or FST.
    input_token_type: Optional input token type, or symbol table.
    output_token_type: Optional output token type, or symbol table.

  Returns:
    Whether the input-output pair is generated by the rule.
  """
  lattice = self._rewrite_lattice(istring, input_token_type)
  # Local import keeps the file's visible import block untouched.
  import contextlib
  # A null context stands in when no output token type is given, so the
  # intersection is written exactly once (resolves the former TODO).
  if output_token_type is None:
    ctx = contextlib.nullcontext()
  else:
    ctx = pynini.default_token_type(output_token_type)
  with ctx:
    lattice = pynini.intersect(lattice, ostring, compose_filter="sequence")
  # A non-empty intersection means the pair is in the rule's relation.
  return lattice.start() != pynini.NO_STATE_ID
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for ISO char to PSA phoneme assignment."""
  for token_type in ('byte', 'utf8'):
    with p.default_token_type(token_type):
      # Single export per token type.
      exporter_map[token_type]['TYP_TO_TXN'] = TYP_TO_TXN
def __construct_inflection(self):
    '''
    Build the inflection cross: each inflection-class FST is prefixed with an
    inserted class marker on the output side.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # (inserted marker, inflection FST) pairs.
        tagged_classes = [
            ("<Adj0>", self.__adj0),
            ("<Adj0-Up>", self.__adj0_up),
            ("<Adj+>", self.__adj_plus),
            ("<Adj+e>", self.__adj_plus_e),
            ("<NMasc_es_e>", self.__nmasc_es_e),
            ("<NMasc_es_$e>", self.__nmasc_es_e_ul),
            ("<NMasc_es_en>", self.__nmasc_es_en),
            ("<NFem-Deriv>", self.__nfem_deriv),
            ("<NFem_0_n>", self.__nfem_0_n),
            ("<NNeut-Dimin>", self.__nneut_dimin),
            ("<NNeut/Sg_s>", self.__nneut_sg_s),
            ("<VVReg>", self.__vv_reg),
        ]
        return pynini.union(
            *(pynini.cross("", marker) + fst for marker, fst in tagged_classes)
        ).optimize()
def __construct_tail(self):
    '''
    Define possible final sequences of a derivation
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # C1: symbols allowed before the base stem type.
        initial_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>",
                               "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>",
                               "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>",
                               "<Pref_Stems>"]).project("input")
        ).closure()
        # C2: symbols allowed between the base stem type and the final part.
        intermediate_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>",
                               "<SS>", "<FB>", "<ge>", "<Suff_Stems>"]).project("input")
        ).closure()
        # C3: symbols allowed at the very end, including category, stem-type
        # and origin features plus special inflection-class markers.
        final_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>",
                               "<SS>", "<FB>"]).project("input"),
            self.__syms.categories,
            self.__syms.stem_type_features,
            self.__syms.origin_features,
            pynini.string_map(["<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>",
                               "<NSMasc_es_e>", "<NSMasc_es_$e>",
                               "<NSMasc-s/$sse>"]).project("input")
        ).closure()
        # TAIL: optional (C1 + base stem type + C2), then C3, then an
        # optional inflection class.
        tail1 = initial_stuff + self.__syms.base_stem_types + intermediate_stuff
        return pynini.concat(tail1.closure(0,1) + final_stuff,
                             self.__syms.inflection_classes.closure(0,1)).optimize()
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems

    Args:
      tmp: FST the candidate stems are composed with (filters the lexicon).
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Candidate stems: characters followed by an inserted "<+NN>" plus
        # gender and either nominative singular or plural features, filtered
        # through `tmp`.
        kompos_stems = pynini.compose(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.union(
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Sg>")))),
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Pl>")))))),
            tmp)
        # Wrap with inserted stem-type and origin markers.
        return (pynini.cross("", "<Kompos_Stems>")
                + kompos_stems
                + pynini.accep("<NN>")
                + pynini.cross("", "<kompos> <nativ>")).optimize()
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for language agnostic ISO to typeable string conversion."""
  builders = (
      ('ISO_TO_TYP_DECOMPOSED', _iso_to_decomposed_typ),
      ('ISO_TO_TYP', iso_to_typ),
  )
  for token_type in ('byte', 'utf8'):
    with p.default_token_type(token_type):
      exporter = exporter_map[token_type]
      # Builders are invoked inside the token-type context.
      for name, build in builders:
        exporter[name] = build()
def generator_main(exporter: grm.Exporter):
  """Generate FSAs accepting the alphabet of each Brahmic script."""
  # NOTE: It isn't useful for us to create a byte-mode sigma, so only export
  # utf8-mode sigma.
  with pynini.default_token_type('utf8'):
    for script in u.SCRIPTS:
      exporter[script.upper()] = uc.derive_sigma(cu.script_chars(script))
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """FSTs for language-agnostic NFC normalization of abjad / alphabet script text."""
  for token_type in ('byte', 'utf8'):
    with pynini.default_token_type(token_type):
      sigma = u.sigma_from_common_data_files()
      # The NFC rules are shared across languages.
      nfc_fst = rule.fst_from_rule_file(u.LANG_DIR / 'nfc.tsv', sigma)
      exporter_map[token_type][u.SCRIPT_NAME.upper()] = nfc_fst
def __construct_bdk_stems(self):
    '''
    Base, derivation and compound stems (without derivation suffixes)
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Accept any of the three stem-type markers.
        stem_markers = pynini.string_map(
            ["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"]).project("input")
        # Lexicon entries: initial features, a stem marker, then anything.
        selector = (self.__syms.initial_features.closure()
                    + stem_markers
                    + self.__sigma_star)
        return pynini.compose(self.__lex, selector).optimize()
def __construct_base_stems(self):
    '''
    Base stems
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Restrict bdk stems to those marked "<Base_Stems>".
        selector = (self.__syms.initial_features.closure()
                    + pynini.accep("<Base_Stems>")
                    + self.__sigma_star)
        return pynini.compose(self.__bdk_stems, selector).optimize()
def __construct_pref_stems(self):
    '''
    Prefix stems
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Restrict lexicon entries to those marked "<Pref_Stems>".
        selector = (self.__syms.initial_features.closure()
                    + pynini.accep("<Pref_Stems>")
                    + self.__sigma_star)
        return pynini.compose(self.__lex, selector).optimize()
def __split_disjunctive_feats(self, disjunctive_feat_list):
    '''
    Map each disjunctive feature (e.g. "<a,b>") onto the union of its
    individual features ("<a>" | "<b>").
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        crossings = [
            pynini.cross(
                feat,
                pynini.string_map(
                    [f"<{cat}>" for cat in feat[1:-1].split(",")]))
            for feat in disjunctive_feat_list
        ]
        return pynini.union(*crossings).optimize()
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for natural transliteration for Malayalam."""
  for token_type in ('byte', 'utf8'):
    with p.default_token_type(token_type):
      # Pipeline stages composed left to right.
      stages = [
          iso.iso_to_typ(),
          typ.TYP_TO_TXN,
          ops.DEFAULT_ANUSVARA_LABIAL,
          ops.INTERSONORANT_VOICING,
          ops.JNY_TO_NY,
      ]
      iso_to_txn = stages[0]
      for stage in stages[1:]:
        iso_to_txn = iso_to_txn @ stage
      exporter = exporter_map[token_type]
      exporter['ISO_TO_PSAF'] = (iso_to_txn @ txn.TXN_TO_PSAF).optimize()
      exporter['ISO_TO_PSAC'] = (iso_to_txn @ txn.TXN_TO_PSAC).optimize()
def __construct_pref_deriv_suff_stems(self):
    '''
    Derivation suffixes which combine with prefixed stems
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Select suffix-stem entries marked "<prefderiv>"; the marker itself
        # is deleted on the output side.
        selector = (self.__syms.initial_features.closure()
                    + pynini.accep("<Suff_Stems>")
                    + pynini.cross("<prefderiv>", "")
                    + self.__sigma_star)
        return pynini.compose(self.__lex, selector).optimize()
def generator_main(exporter: grm.Exporter, token_type: pynini.TokenType):
  """FSTs for reading normalization of abjad / alphabet script languages."""
  with pynini.default_token_type(token_type):
    sigma = u.sigma_from_common_data_files()
    for lang in u.LANGS:
      # Reading normalization is applied on top of visual normalization.
      visual_fst = _open_visual(lang, token_type)
      reading_fst = rule.fst_from_rule_file(
          u.LANG_DIR / lang / 'reading_norm.tsv', sigma)
      exporter[lang.upper()] = pynini.optimize(visual_fst @ reading_fst)
def __construct_rep_pref(self):
    '''
    Replace the marker of manually prefixed stems
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # "<prefnativ>" becomes "<nativ>" in any context.
        relabel = pynini.cross("<prefnativ>", "<nativ>")
        return pynini.cdrewrite(
            relabel, "", "", self.__prefix_filter_helper).optimize()
def __construct_prefix_filter_helper(self):
    '''
    Alphabet for the prefix filter
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        markers = pynini.string_map(
            ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>",
             "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>",
             "<NoDef>"]).project("input")
        symbols = pynini.union(
            self.__syms.characters,
            markers,
            self.__syms.stem_types,
            self.__syms.categories,
        )
        return symbols.closure().optimize()
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for natural transliteration."""
  for token_type in ('byte', 'utf8'):
    with p.default_token_type(token_type):
      # Pipeline stages composed left to right.
      stages = [
          iso.iso_to_typ(),
          typ.TYP_TO_TXN,
          ops.ANUSVARA_ASSIMILATION,
          ops.DEFAULT_ANUSVARA_DENTAL,
          ops.FINAL_ANUSVARA_NASALIZATION,
          ops.JNY_TO_GY,
      ]
      iso_to_txn = stages[0]
      for stage in stages[1:]:
        iso_to_txn = iso_to_txn @ stage
      exporter = exporter_map[token_type]
      exporter['ISO_TO_PSAF'] = (iso_to_txn @ txn.TXN_TO_PSAF).optimize()
      exporter['ISO_TO_PSAC'] = (iso_to_txn @ txn.TXN_TO_PSAC).optimize()
def __construct_quant_suff_stems(self):
    '''
    Derivation suffixes which combine with a number and a simplex stem
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Select "<QUANT>"-marked suffix-stem entries tagged "<simplex>";
        # both markers are deleted on the output side.
        selector = (pynini.cross("<QUANT>", "")
                    + self.__syms.initial_features.closure()
                    + pynini.accep("<Suff_Stems>")
                    + pynini.cross("<simplex>", "")
                    + self.__sigma_star)
        return pynini.compose(self.__lex, selector).optimize()
def __construct_ge_nom_stems_v(self, tmp):
    '''
    Stems for ge nominalization of verbs ("Gejammer")

    Args:
      tmp: FST the infinitive candidates are composed with.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Characters plus category and "<CONV>"/"<SUFF>" markers.
        alphabet = pynini.union(
            self.__syms.characters,
            self.__syms.categories,
            pynini.string_map(["<CONV>", "<SUFF>"]).project("input"))
        # extract infinitives
        infinitives = pynini.compose(
            pynini.concat(
                pynini.concat(self.__syms.characters.closure(1),
                              pynini.accep("<PREF>")).closure(),
                pynini.concat(alphabet.closure(1),
                              pynini.cross("", "<+V> <Inf>"))),
            tmp).optimize()
        # Delete "g e <PREF> <Ge>" after optional prefixes.
        insert_ge = pynini.concat(
            pynini.concat(self.__syms.characters.closure(1),
                          pynini.accep("<PREF>")).closure(),
            pynini.concat(pynini.cross("g e <PREF> <Ge>", ""),
                          alphabet.closure(1))).optimize()
        # Infinitives reachable through the ge-insertion pattern, lower-cased
        # and restricted to the marker set.
        inserted_ge = pynini.compose(
            pynini.compose(insert_ge, infinitives).project("input"),
            pynini.union(self.__syms.to_lower,
                         self.__syms.categories,
                         self.__syms.prefix_suffix_marker,
                         pynini.accep("<Ge>")).closure()).optimize()
        # Keep the "<Ge>" marker, drop category/prefix-suffix markers, and
        # strip the final "e n" of the infinitive.
        deriv_stem_filter_ge = pynini.compose(
            pynini.compose(
                pynini.compose(
                    pynini.union(alphabet,
                                 pynini.accep("<PREF>"),
                                 pynini.cross("", "<Ge>")).closure(),
                    inserted_ge),
                pynini.union(
                    self.__syms.characters,
                    pynini.accep("<Ge>"),
                    pynini.cross(
                        pynini.union(self.__syms.categories,
                                     self.__syms.prefix_suffix_marker),
                        "")).closure()),
            pynini.concat(
                pynini.union(
                    self.__syms.characters,
                    pynini.accep("<Ge>"),
                ).closure(1),
                pynini.cross("e n", ""))).optimize()
        # Wrap as a derivation stem with inserted origin features.
        return (pynini.cross("", "<Deriv_Stems>")
                + deriv_stem_filter_ge
                + pynini.accep("<V>")
                + pynini.cross("", "<deriv> <nativ>")).optimize()
def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", corresp., if the umlaut
    marker "<UL>" is present.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Symbols that may precede the umlauted vowel.
        # Fix: the original list contained "<Ge>" twice; the duplicate is
        # removed (string_map/union semantics are unchanged).
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>",
                               "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>",
                               "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
            self.__syms.stem_types,
            self.__syms.categories,
        ).closure()
        return pynini.concat(
            pynini.concat(
                alphabet,
                pynini.concat(
                    self.__syms.consonants,
                    pynini.concat(
                        # The umlauted nucleus: a/o/u → ä/ö/ü, or "aa"/"au"
                        # with the first "a" umlauted.
                        pynini.union(
                            pynini.union(
                                pynini.cross("a", "ä"),
                                pynini.cross("o", "ö"),
                                pynini.cross("u", "ü")
                            ),
                            pynini.concat(
                                pynini.cross("a", "ä"),
                                pynini.union(
                                    pynini.cross("a", ""),
                                    pynini.accep("u")
                                )
                            )
                        ),
                        pynini.concat(
                            self.__syms.consonants.closure(),
                            pynini.concat(
                                # Optional unstressed "el"/"er" syllable.
                                pynini.concat(
                                    pynini.accep("e"),
                                    pynini.string_map(["l", "r"]).project("input")
                                ).closure(0, 1),
                                # The triggering "<UL>" marker is consumed
                                # after the suffix-stem boundary.
                                pynini.concat(
                                    pynini.accep("<Suff_Stems>"),
                                    pynini.cross("<UL>", "")
                                )
                            )
                        )
                    ).closure(0, 1)
                )
            ),
            self.__tail
        ).optimize()
def assert_fst_functional(fst: pynini.Fst,
                          token_type: pynini.TokenType,
                          string_fsa: pynini.Fst) -> None:
    """Assert that an FST is functional for the given string FSA.

    Args:
      fst: An FST to verify if is functional.
      token_type: The token_type used to derive the Fst.
      string_fsa: The string FSA to verify functional behavior.

    Raises:
      AssertionError: If the FST is found to be non-functional.
    """
    with pynini.default_token_type(token_type):
        # Composing the FSA with the FST must yield a single path.
        verify_if_single_path(string_fsa, string_fsa @ fst)