def generate_fst_for_factor_digit(factor, include_zero=False):
    """Build an FST mapping each digit 0-9 to itself followed by its
    power-of-ten marker: ``factor`` carets plus one trailing space
    (no marker at all for factor 0, i.e. the units digit).

    ``include_zero`` is currently unused; it is kept for interface
    compatibility with callers.
    """
    marker = f"{'^' * factor} " if factor > 0 else ''
    result = pn.Fst()
    for digit in "0123456789":
        result = pn.union(result, pn.t(digit, digit + marker))
    return result.optimize()
# --- Finnish adessive / vowel-harmony demo ---
# Vowel classes used for Finnish vowel harmony.
back_vowel = pynini.union("u", "o", "a")
neutral_vowel = pynini.union("i", "e")
front_vowel = pynini.union("y", "ö", "ä")
vowel = pynini.union(back_vowel, neutral_vowel, front_vowel)
# Abstract (underspecified) vowels as used in suffix representations.
archiphoneme = pynini.union("A", "I", "E", "O", "U")
consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m",
                         "n", "p", "q", "r", "s", "t", "v", "w", "x", "z")
# Sigma*: closure over the full working alphabet.
sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize()
# Adessive case suffix with the underspecified harmony vowel "A".
adessive = "llA"
# Material that may intervene between a back vowel and the suffix without
# blocking harmony: consonants and neutral vowels.
intervener = pynini.union(consonant, neutral_vowel).closure()
# Realize "A" as back "a" after a back vowel (possibly across interveners);
# the second rule then defaults any remaining "A" to front "ä".
# "*" is FST composition, so the rules apply in sequence.
adessive_harmony = (
    pynini.cdrewrite(pynini.transducer("A", "a"),
                     back_vowel + intervener, "", sigma_star) *
    pynini.cdrewrite(pynini.t("A", "ä"), "", "", sigma_star)).optimize()


def make_adessive(stem):
    # Attach the adessive suffix to *stem*, apply harmony, and return the
    # single surface string.
    return ((stem + adessive) * adessive_harmony).stringify()


make_adessive("training")

# English singular-mapping demo.
# NOTE(review): this union is unterminated here — it continues outside the
# visible chunk.
singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis
    # a higher priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),
def n2w_fst():
    """Build and return a French number-to-words transducer.

    The pipeline first factorizes digits (powers of ten marked with "^"),
    then verbalizes each factored digit via context-dependent rewrite
    rules, and finally patches French-specific irregularities (70-99,
    "et un", mille/millions agreement) with fix-up maps.
    """
    # NOTE(review): generate_fst_digit is defined outside this chunk;
    # presumably it emits the factorized form (e.g. 123 -> "1^^ 2^ 3").
    factor_fst = generate_fst_digit()
    # full french alphabet - https://en.wikiversity.org/wiki/French/Alphabet
    alphabet_full = pn.u(
        *".0123456789^ _-abcdefghijklmnopqrstuvwxyzàèùéâêîôûëïüÿæœç").star
    fsa_0_9 = pn.u(*"0123456789").star
    # single_zero = pn.t("0", "zéro")
    single_zero = pn.t("0", "zero")
    # Units digit (factor 0); "0" verbalizes as empty inside larger numbers.
    single_digits = pn.string_map({
        "0": "",  # zéro
        "1": "un",
        "2": "deux",
        "3": "trois",
        "4": "quatre",
        "5": "cinq",
        "6": "six",
        "7": "sept",
        "8": "huit",
        "9": "neuf",
    })
    # Delete zero-valued factors; keep "mille" when the thousands group is
    # followed by three zeros.
    zeros = pn.string_map({
        # "0^^ 0^ 0": "",
        "0^ ": "",
        "0^^ ": "",
        "mille_0^^ 0^ 0": "mille",
    })
    teens_10_19 = pn.string_map({
        "1^ 0": "dix",
        "1^ 1": "onze",
        "1^ 2": "douze",
        "1^ 3": "treize",
        "1^ 4": "quatorze",
        "1^ 5": "quinze",
        "1^ 6": "seize",
        "1^ 7": "dix-sept",
        "1^ 8": "dix-huit",
        "1^ 9": "dix-neuf",
    })
    # Exact tens 20-60 plus the irregular "et un" forms (21, 31, ...).
    mult_20_60 = pn.string_map({
        "2^ 0": "vingt",
        "2^ 1": "vingt_et_un",
        "3^ 0": "trente",
        "3^ 1": "trente_et_un",
        "4^ 0": "quarante",
        "4^ 1": "quarante_et_un",
        "5^ 0": "cinquante",
        "5^ 1": "cinquante_et_un",
        "6^ 0": "soixante",
        "6^ 1": "soixante_et_un",
    })
    # Tens prefix 20-60 when followed by a remaining units digit.
    mult_2x_6x = pn.string_map({
        "2^ ": "vingt-",
        "3^ ": "trente-",
        "4^ ": "quarante-",
        "5^ ": "cinquante-",
        "6^ ": "soixante-",
    })
    # Irregular 70-79 and 90-99, plus 80 itself ("quatre-vingts").
    mult_70_90 = pn.string_map({
        "7^ 0": "soixante-dix",
        "7^ 1": "soixante_et_onze",
        "7^ 2": "soixante-douze",
        "7^ 3": "soixante-treize",
        "7^ 4": "soixante-quatorze",
        "7^ 5": "soixante-quinze",
        "7^ 6": "soixante-seize",
        "7^ 7": "soixante-dix-sept",
        "7^ 8": "soixante-dix-huit",
        "7^ 9": "soixante-dix-neuf",
        "8^ 0": "quatre-vingts",
        "9^ 0": "quatre-vingt-dix",
        "9^ 1": "quatre-vingt-onze",
        "9^ 2": "quatre-vingt-douze",
        "9^ 3": "quatre-vingt-treize",
        "9^ 4": "quatre-vingt-quatorze",
        "9^ 5": "quatre-vingt-quinze",
        "9^ 6": "quatre-vingt-seize",
        "9^ 7": "quatre-vingt-dix-sept",
        "9^ 8": "quatre-vingt-dix-huit",
        "9^ 9": "quatre-vingt-dix-neuf",
    })
    # 81-89: "quatre-vingt-" + units digit.
    mult_8x = pn.string_map({
        "8^ ": "quatre-vingt-",
    })
    # Round hundreds take plural "cents" (except 100 itself).
    hundreds_alone = pn.string_map({
        "1^^ 0^ 0": "cent",
        "2^^ 0^ 0": "deux_cents",
        "3^^ 0^ 0": "trois_cents",
        "4^^ 0^ 0": "quatre_cents",
        "5^^ 0^ 0": "cinq_cents",
        "6^^ 0^ 0": "six_cents",
        "7^^ 0^ 0": "sept_cents",
        "8^^ 0^ 0": "huit_cents",
        "9^^ 0^ 0": "neuf_cents",
    })
    # Hundreds followed by more digits: singular "cent".
    hundreds = pn.string_map({
        "1^^ ": "cent_",
        "2^^ ": "deux_cent_",
        "3^^ ": "trois_cent_",
        "4^^ ": "quatre_cent_",
        "5^^ ": "cinq_cent_",
        "6^^ ": "six_cent_",
        "7^^ ": "sept_cent_",
        "8^^ ": "huit_cent_",
        "9^^ ": "neuf_cent_",
    })
    # Insert the word "mille" after the thousands digit (factor 3).
    mille = pn.string_map({
        "0^^^ ": "0^^^_mille_",
        "1^^^ ": "1^^^_mille_",
        "2^^^ ": "2^^^_mille_",
        "3^^^ ": "3^^^_mille_",
        "4^^^ ": "4^^^_mille_",
        "5^^^ ": "5^^^_mille_",
        "6^^^ ": "6^^^_mille_",
        "7^^^ ": "7^^^_mille_",
        "8^^^ ": "8^^^_mille_",
        "9^^^ ": "9^^^_mille_",
    })
    # Insert the word "millions" after the millions digit (factor 6).
    million = pn.string_map({
        "0^^^^^^ ": "0^^^^^^_millions_",
        "1^^^^^^ ": "1^^^^^^_millions_",
        "2^^^^^^ ": "2^^^^^^_millions_",
        "3^^^^^^ ": "3^^^^^^_millions_",
        "4^^^^^^ ": "4^^^^^^_millions_",
        "5^^^^^^ ": "5^^^^^^_millions_",
        "6^^^^^^ ": "6^^^^^^_millions_",
        "7^^^^^^ ": "7^^^^^^_millions_",
        "8^^^^^^ ": "8^^^^^^_millions_",
        "9^^^^^^ ": "9^^^^^^_millions_",
    })
    # Reduce factor markers modulo 3 so each three-digit group can reuse
    # the same hundreds/tens/units rules (8->2, 7->1, 6->0, 5->2, ...).
    strip_triple_factor = pn.string_map({
        "^^^^^^^^": "^^",
        "^^^^^^^": "^",
        "^^^^^^": "",
        "^^^^^": "^^",
        "^^^^": "^",
        "^^^": "",
    })
    # "un mille" -> "mille"; "un millions" -> "un million".
    un_mille_million = pn.string_map({
        "un_mille": "mille",
        "un_millions": "un_million",
    })
    # Ad-hoc clean-ups of artifacts produced by the rules above.
    fixmeup = pn.string_map({
        # "zzzzz" : "xxxxxx",
        "_cent__millions__mille": "_cents_millions",
        "millions_un_mille": "millions_mille",
        # "million--mille": "million",
        "millions__mille": "millions",
        "vingts_mille": "vingt_mille",
        "cent__mille": "cent_mille",
    })
    # Second clean-up pass; also collapses doubled separators.
    fixmeup2 = pn.string_map({
        "million__mille": "million",
        "_cent__millions": "_cents_millions",
        "million_un_mille": "million_mille",
        "__": "_",
    })
    # Digits after the decimal separator are read out one by one.
    decimals = pn.string_map({
        # "0": "zéro ",  # zéro
        "0": "zero ",  # zéro
        "1": "un ",
        "2": "deux ",
        "3": "trois ",
        "4": "quatre ",
        "5": "cinq ",
        "6": "six ",
        "7": "sept ",
        "8": "huit ",
        "9": "neuf ",
        "_": " ",
    })
    # Context acceptors: end/beginning of string and decimal separators.
    fsa_eos = pn.a("[EOS]")
    fsa_bos = pn.a("[BOS]")
    fsa_dot_comma = pn.u(".", ",")
    # Read "." or "," as " virgule ".
    fst_dot_comma = pn.cdrewrite(
        pn.u(pn.t(".", " virgule "), pn.t(",", " virgule ")), "", "",
        alphabet_full)
    # Wrap each mapping table in a context-dependent rewrite over the full
    # alphabet; the third argument restricts the right context.
    fst_decimals = pn.cdrewrite(decimals, "", "", alphabet_full)
    fst_zeros = pn.cdrewrite(zeros, "", fsa_0_9 | fsa_eos | fsa_dot_comma,
                             alphabet_full)
    fst_single_zero = pn.cdrewrite(single_zero, "", fsa_eos | fsa_dot_comma,
                                   alphabet_full)
    fst_single_digits = pn.cdrewrite(single_digits, "",
                                     pn.u(fsa_eos, "-", "_", fsa_dot_comma),
                                     alphabet_full)
    fst_teens = pn.cdrewrite(teens_10_19, "", "", alphabet_full)
    fst_mult_20_60 = pn.cdrewrite(mult_20_60, "", "", alphabet_full)
    fst_mult_2x_6x = pn.cdrewrite(mult_2x_6x, "", fsa_0_9, alphabet_full)
    fst_mult_70_90 = pn.cdrewrite(mult_70_90, "", "", alphabet_full)
    fst_mult_8x = pn.cdrewrite(mult_8x, "", fsa_0_9, alphabet_full)
    fst_hundreds_alone = pn.cdrewrite(hundreds_alone, "", fsa_eos,
                                      alphabet_full)
    fst_hundreds = pn.cdrewrite(hundreds, "", fsa_0_9, alphabet_full)
    fst_mille = pn.cdrewrite(mille, "", fsa_0_9, alphabet_full)
    fst_million = pn.cdrewrite(million, "", fsa_0_9, alphabet_full)
    fst_strip_triple_factor = pn.cdrewrite(strip_triple_factor, fsa_0_9,
                                           pn.u(" ", "-", "_"),
                                           alphabet_full)
    fst_un_mille_million = pn.cdrewrite(un_mille_million, fsa_bos, "",
                                        alphabet_full)
    fst_fixmeup = pn.cdrewrite(fixmeup, "", "", alphabet_full)
    fst_fixmeup2 = pn.cdrewrite(fixmeup2, "", "", alphabet_full)
    # Compose the cascade. Order matters: largest units first, zero and
    # single-digit handling next, fix-ups and decimals last.
    fst = factor_fst * fst_million * fst_mille * fst_strip_triple_factor * \
        fst_hundreds_alone * fst_hundreds * \
        fst_mult_70_90 * fst_mult_8x * fst_mult_20_60 * fst_mult_2x_6x * \
        fst_teens * fst_zeros * fst_single_zero * fst_single_digits * \
        fst_un_mille_million * fst_fixmeup * fst_fixmeup2 * \
        fst_dot_comma * fst_decimals
    transformer = fst.optimize()
    ## ---------- YOUR PART ENDS------------
    return transformer
def future_perfective(stem):
    """Transducer for first-person singular future tense of perfective verbs

    Args:
        stem: the infinitive form of a Russian perfective verb (a string,
            composed implicitly into the rule cascade).

    Returns:
        The 1SG future-tense surface form as a string.
    """
    # Russian vowels, the soft/hard signs, and consonants.
    vowels = pynini.union("а", "е", "ё", "и", "о", "у", "ы", "э", "ю", "я")
    yer = pynini.union("ь", "ъ")
    consonants = pynini.union("б", "в", "г", "д", "ж", "з", "й", "к", "л",
                              "м", "н", "п", "р", "с", "т", "ф", "х", "ц",
                              "ч", "ш", "щ")
    sigma_star = pynini.union(vowels, consonants, yer).closure()
    # Define rules for a 1SG future tense inflection.
    # NOTE(review): the rules are chained with composition ("*"), so they
    # apply in sequence — earlier, more specific rewrites feed later,
    # more general ones; order is significant.
    future_tense_map = pynini.union(
        # Consonant mutation cases as mentioned in Wade, 2010
        # т : ч
        pynini.cdrewrite(pynini.t("тать", "чу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("тить", "чу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("теть", "чу"), "", "", sigma_star) *
        # д : ж
        pynini.cdrewrite(pynini.t("деться", "жусь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("дить", "жу"), "", "", sigma_star) *
        # в : вл
        pynini.cdrewrite(pynini.t("вить", "влю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("виться", "влюсь"), "", "", sigma_star) *
        # c : ш
        pynini.cdrewrite(pynini.t("саться", "шусь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("сить", "шу"), "", "", sigma_star) *
        # м : мл
        pynini.cdrewrite(pynini.t("мить", "млю"), "", "", sigma_star) *
        # б : бл
        pynini.cdrewrite(pynini.t("бить", "блю"), "", "", sigma_star) *
        # п : пл
        pynini.cdrewrite(pynini.t("пать", "плю"), "", "", sigma_star) *
        # Consonant mutation cases not mentioned in Wade, 2010
        # ч : к (Wade, 2010 к : ч)
        pynini.cdrewrite(pynini.t("речь", "реку"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("чься", "кусь"), "", "", sigma_star) *
        # ч : г
        pynini.cdrewrite(pynini.t("ечь", "ягу"), "", "", sigma_star) *
        # х : д
        pynini.cdrewrite(pynini.t("хать", "ду"), "", "", sigma_star) *
        # c : д
        pynini.cdrewrite(pynini.t("сть", "ду"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("стить", "щу"), "", "", sigma_star) *
        # First singular form of future with ю
        pynini.cdrewrite(pynini.t("ить", "ю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("тать", "таю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("ртеть", "ртею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("мыть", "мою"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("еть", "ею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("ртеть", "ртею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("лать", "лаю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("питать", "питаю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("меть", "мею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("лоть", "лю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("отлить", "отолью"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("ли", "лю"), "", "", sigma_star) *
        # Mutation with a soft sign
        pynini.cdrewrite(pynini.t("шить", "шью"), "", "", sigma_star) *
        # Spelling rule: у instead of ю after sibilants ж, ч, ш, щ
        pynini.cdrewrite(pynini.t("щи", "щу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("жить", "жу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("зить", "жу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("чить", "чу"), "", "", sigma_star) *
        # Future with reflexive suffix
        pynini.cdrewrite(pynini.t("ся", "сь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("иться", "юсь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("аться", "усь"), "", "", sigma_star) *
        # Stem change (vowel deletion)
        pynini.cdrewrite(pynini.t("тереть", "тру"), "", "", sigma_star) *
        # Verbs ending in -дать form future with -м
        pynini.cdrewrite(pynini.t("дать", "дам"), "", "", sigma_star) *
        # Deletion of two last letters in the infinitive stem
        pynini.cdrewrite(pynini.t("ть", ""), "", "", sigma_star),
    ).optimize()
    return (stem * future_tense_map).stringify()
# Full working alphabet: Coptic and Latin letters, punctuation/whitespace,
# vowels, IPA symbols, and the word-boundary marker.
# NOTE(review): coptic_sigma, latin_sigma, punct_whitespace_sigma, vowels,
# ipa_sigma and wb are defined outside the visible chunk.
sigma = pynini.union(coptic_sigma, latin_sigma, punct_whitespace_sigma,
                     vowels, ipa_sigma, wb)
sigma_star = pynini.closure(sigma)

# rules
# Transducer that inserts a "[WB]" word-boundary marker.
insert_wb = pynini.transducer("", "[WB]")
# pynini.t("", "[WB]") + sigma_star + pynini.t("", "[WB]")
# Add WB when coptic letters are on the left and whitespace or punctuation
# are on the right
rule_addwb_1 = pynini.cdrewrite(insert_wb, coptic_sigma,
                                punct_whitespace_sigma, sigma_star)
# Add WB when whitespace or punctuation are on the left and coptic letters
# are on the right
rule_addwb_2 = pynini.cdrewrite(insert_wb, punct_whitespace_sigma,
                                coptic_sigma, sigma_star)
# Delete every word-boundary marker (cleanup after boundary-sensitive rules).
rule_removewb = pynini.cdrewrite(pynini.t("[WB]", ""), "", "", sigma_star)

# alpha: context-dependent transcriptions of Coptic alpha (ⲁ).
# ⲁ -> æ before word-final ⲥ.
alphatoa_1 = pynini.transducer("ⲁ", "æ")
rule_1 = pynini.cdrewrite(alphatoa_1, "", "ⲥ[WB]", sigma_star)
# ⲁ -> ə at a word boundary.
alphatoa_2 = pynini.transducer("ⲁ", "ə")
rule_2 = pynini.cdrewrite(alphatoa_2, "", wb, sigma_star)
# ⲁ -> ɛ (context rule not defined in this chunk).
alphatoa_3 = pynini.transducer("ⲁ", "ɛ")
###rule_3###
# ⲁ -> ɑː as the default (elsewhere) case.
alphatoa_4 = pynini.transducer("ⲁ", "ɑː")
rule_4 = pynini.cdrewrite(alphatoa_4, "", "", sigma_star)
# map_10_to_19 # map_20_to_90 # Now, define a FST that uses the mapper FSTs to transform factorized form to # verbalized form: # 0 -> zero # 1^ -> ten # 1^ 1 -> eleven # 9^ 1 -> ninety one # 1^^ 9^ 1 -> ['one hundred ninety one', 'hundred ninety one'] # TODO: currently only works for single digits (and doesn't work for zero) a1_9 = pn.u(*"123456789").optimize() a0_9 = (a1_9 | pn.a("0")).optimize() f1 = (((a1_9 + pn.t("", "^ ")) | "") + a0_9).optimize() f2 = ((a1_9 + pn.t("", "^^ ")) + ((a0_9 + pn.t("", "^ "))) + a0_9).optimize() f = (f2 | f1).optimize() f = pn.u(f, f + "." + a0_9.plus) map1_9 = { "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine" }
import pynini as pn
import random

# compose - *
# concat - +
# union - |

# A small demo FST: ("a"|"e") then "a"->zero-to-five "0"s, unioned with
# any run of "a"s -> "0" followed by the literal "xxx".
fst = (pn.a("a") | pn.a("e")) + pn.t("a", pn.a("0").closure(0, 5)) | pn.t(
    pn.a("a").star, "0") + pn.a("xxx")
fst = fst.optimize()

# Sample many random paths and collect the distinct output strings.
output_strings = {
    pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    for _ in range(10000)
}
print(len(output_strings))
for output_string in output_strings:
    print(output_string)


def top_paths(fst, count=100):
    """Return the sorted, de-duplicated outputs of the *count* shortest
    paths through *fst*."""
    best = pn.shortestpath(fst, nshortest=count)
    return sorted({p[1] for p in best.paths()})


print("INPUTS")
print("\t")
a_1_to_9 = pn.u(*"123456789").optimize() # Create an acceptor for digits 0..9 a_0_to_9 = (a_1_to_9 | pn.a("0")).optimize() # First, let's define the factorizer. # Factorizer converts numbers to their factorized form, using ^ characters # to denote powers of ten: # # 0 -> 0 # 1 -> 1 # 10 -> 1^ # 23 -> 2^ 3 # 203 -> 2^^ 3 # TODO: currently only works for 0..99 factorizer = (((a_1_to_9 + pn.t("", "^ ")) | "") + a_0_to_9).optimize() # You can debug the factorizer by generating random paths through it # print(list(pn.randgen(factorizer, 5).paths())) # Now, let's define number-to-string mappings map_1_to_9 = { "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight",