Example #1
0
  def dedup_rule(letter: str) -> pynini.Fst:
    """Builds a context-dependent rewrite that collapses a repeated letter.

    The rule rewrites `_plus(letter)` into `_ques(letter)`. It only fires when
    the match is preceded by `letter` itself (which in turn follows either the
    start of the string or a lowercase byte other than `letter`) and is
    followed by the end of the string or a lowercase byte other than `letter`.

    Args:
      letter: the letter whose repetitions are rewritten.

    Returns:
      The compiled rewrite rule over `_sigma_star`.
    """
    # Every lowercase byte except the target letter.
    other_lower = byte.LOWER - letter
    # NOTE(review): `_plus` / `_ques` are defined elsewhere; presumably the
    # one-or-more and zero-or-one closures -- confirm.
    tau = pynini.cross(_plus(letter), _ques(letter))
    left_context = ("[BOS]" | other_lower) + letter
    right_context = "[EOS]" | other_lower
    return pynini.cdrewrite(tau, left_context, right_context, _sigma_star)
Example #2
0
    def __init__(self, ordinal: GraphFst, deterministic: bool = True):
        """Builds the "date" verbalizer and stores the result in `self.fst`.

        Accepted token layouts are "day month [year]", "month year" and a
        bare "year"; the union of these is also kept in `self.graph`.

        Args:
            ordinal: graph exposing `ordinal_stem`, the rewrite applied at the
                end of a value before "ter" is appended.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def unquote(opening):
            # Drops `opening` and the closing quote, keeping the non-empty value between them.
            return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        def as_ordinal(graph):
            # Applies the ordinal stem rewrite at the end of the string, then appends "ter".
            stem_rule = pynini.cdrewrite(ordinal.ordinal_stem, "", "[EOS]", NEMO_SIGMA)
            return (graph @ stem_rule) + pynutil.insert("ter")

        day = as_ordinal(unquote("day: \""))

        month_rows = load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
        # Second column of every row is the month name.
        months_names = pynini.union(*(row[1] for row in month_rows))
        month = unquote("month: \"")
        # A month that is one of the known names passes through unchanged ...
        named_month = month @ months_names
        # ... anything else is verbalized like an ordinal.
        other_month = as_ordinal(month @ pynini.difference(NEMO_SIGMA, months_names))
        final_month = named_month | other_month

        year = unquote("year: \"")

        # day month year
        space = pynini.accep(" ")
        day_month_year = day + space + final_month + pynini.closure(space + year, 0, 1)
        month_year = final_month + space + year
        graph_dmy = day_month_year | month_year

        self.graph = graph_dmy | year
        unwrapped = self.delete_tokens(self.graph + delete_preserve_order)
        self.fst = unwrapped.optimize()
Example #3
0
    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Strips the "<name> { ... }" wrapper around the output of the given fst.

        The class name, both braces and whatever `delete_space` matches around
        them are deleted. Afterwards every non-breaking space (U+00A0) in the
        result is rewritten to a plain space.

        Args:
            fst: input fst matching the content between the braces

        Returns:
            Fst: the unwrapping fst composed with the U+00A0 -> " " rewrite
        """
        opening = (pynutil.delete(f"{self.name}") + delete_space +
                   pynutil.delete("{") + delete_space)
        unwrapped = opening + fst + delete_space + pynutil.delete("}")
        nbsp_to_space = pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "",
                                         NEMO_SIGMA)
        return unwrapped @ nbsp_to_space
Example #4
0
def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Undoes apocope at the end of a cardinal string, e.g.
        "un" -> "uno"
        "veintiún" -> "veintiuno"

    An "un" or "ún" is rewritten to "uno" only when it is immediately followed
    by the end of the string or by a double quote.

    Args:
        fst: Any fst. The conversion is composed onto fst's output strings

    Returns:
        fst composed with the rewrite rule
    """
    # Large values keep their apocope by default (e.g. "millón"), so only the
    # occurrence at the very end of the string is targeted.
    restore_final_o = pynini.union(pynini.cross("un", "uno"),
                                   pynini.cross("ún", "uno"))
    string_end = pynini.union("[EOS]", "\"")
    rule = pynini.cdrewrite(restore_final_o, "", string_end, NEMO_SIGMA)
    return fst @ rule
Example #5
0
def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Applies apocope to feminine cardinal strings, in line with stressing rules.
    In formal speech "una" is shortened before a stressed "a" sound. That cannot
    be predicted from the text alone, so this is meant for non-deterministic cases.
    The rewrites compiled here are:
        "una" -> "un"
        "veintiuna" -> "veintiún"

    They apply only when the match is immediately followed by the end of the
    string or by a double quote.

    Args:
        fst: Any fst. The conversion is composed onto fst's output strings

    Returns:
        fst composed with the rewrite rule
    """
    # The stress trigger follows the cardinal and only affects the sound right
    # before it, so only the occurrence at the end of the string is targeted.
    shorten_final_una = pynini.union(pynini.cross("una", "un"),
                                     pynini.cross("veintiuna", "veintiún"))
    string_end = pynini.union("[EOS]", "\"")
    rule = pynini.cdrewrite(shorten_final_una, "", string_end, NEMO_SIGMA)
    return fst @ rule
Example #6
0
    def __init__(self, tag_label: str, matcher: pynini.FstLike,
                 sigma_star: pynini.FstLike) -> None:
        """Constructor.

        Compiles a rewrite rule that inserts a left tag before and a right tag
        after each match of `matcher`, and stores the optimized rule in
        `self._tagger`.

        Args:
          tag_label: String used as a tag; it is substituted into
            `LTAG_TEMPLATE` and `RTAG_TEMPLATE`. It must be in-alphabet when
            processed by the specified token type.
          matcher: an acceptor matching the strings to be tagged.
          sigma_star: an unweighted cyclic acceptor over the vocabulary.

        NOTE(review): nothing in this body raises explicitly; an
        out-of-alphabet tag presumably surfaces as an error from the pynini
        calls below -- confirm.
        """
        # Tag insertions surrounding the matched span.
        left_insert = pynutil.insert(self.LTAG_TEMPLATE.format(tag_label))
        right_insert = pynutil.insert(self.RTAG_TEMPLATE.format(tag_label))
        tau = left_insert + matcher + right_insert
        # No left or right context restriction: the rule applies anywhere.
        rule = pynini.cdrewrite(tau, "", "", sigma_star)
        self._tagger = rule.optimize()
Example #7
0
File: date.py Project: NVIDIA/NeMo
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Inputs with a trailing "s" are also covered; their verbalization is
    pluralized at the end of the string ("y" -> "ies", otherwise "s" is added).

    Args:
        deterministic: passed to `get_ties_graph`; when True the "thousand"
            readings are combined via `plurals._priority_union`, otherwise
            they are simply unioned in.

    Returns:
        The optimized year graph.
    """
    graph_ties = get_ties_graph(deterministic)

    # Building blocks shared by the plain and the pluralized readings.
    ties_pair = graph_ties + insert_space + graph_ties
    teen_or_ties = graph_teen | graph_ties
    hundreds = teen_or_ties + insert_space + pynini.cross("00", "hundred")

    # Plural inputs: "...0s" decades and "...00s" hundreds.
    # NOTE(review): `ties_graph` is not the local `graph_ties`; presumably a module-level graph -- confirm.
    teen_decade = graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten"))
    decades = (ties_pair | teen_decade) + pynutil.delete("0s")
    plural_input = decades | (hundreds + pynutil.delete("s"))
    pluralize = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    graph_with_s = plural_input @ pluralize

    # "<digit> thousand [<digit>]" readings, plus the "<digit>000s" form which keeps its "s".
    thousand_plain = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | (insert_space + graph_digit))
    )
    thousand_with_s = (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )
    thousand_graph = thousand_plain | thousand_with_s

    graph = ties_pair | hundreds | graph_with_s
    if deterministic:
        graph = plurals._priority_union(thousand_graph, graph, NEMO_SIGMA)
    else:
        graph = graph | thousand_graph

    return graph.optimize()
Example #8
0
 def __init__(self, deterministic: bool = True):
     """Builds the "time" verbalizer and stores the result in `self.fst`.

     Supported layouts: hour + minute, hour + "o'clock", hour + suffix, and
     hour/minute/second spelled out with unit words. A zone may follow each
     of them, and a suffix may follow the first and the last.

     Args:
         deterministic: forwarded unchanged to the parent constructor.
     """
     super().__init__(name="time",
                      kind="verbalize",
                      deterministic=deterministic)

     def quoted(tag):
         # Drops `tag`, optional spacing and the surrounding quotes, keeping the non-empty value.
         return (pynutil.delete(tag) + delete_space + pynutil.delete("\"") +
                 pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

     hour = quoted("hours:")
     minute = quoted("minutes:")
     second = quoted("seconds:")
     suffix = quoted("suffix:")
     zone = quoted("zone:")
     optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
     optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)

     # "<h> hours <m> minutes and <s> seconds", then tidy up the wording.
     spelled_out = (hour + pynutil.insert(" hours ") + delete_space + minute +
                    pynutil.insert(" minutes and ") + delete_space + second +
                    pynutil.insert(" seconds") + optional_suffix +
                    optional_zone)
     # Singular units after "one"; also drops an "o " at the start of a word.
     # NOTE(review): the "o " presumably comes from zero-padded values -- confirm.
     wording_fix = (pynutil.delete("o ")
                    | pynini.cross("one minutes", "one minute")
                    | pynini.cross("one seconds", "one second")
                    | pynini.cross("one hours", "one hour"))
     word_start = pynini.union(" ", "[BOS]")
     graph_hms = spelled_out @ pynini.cdrewrite(wording_fix, word_start, "", NEMO_SIGMA)

     hour_minute = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
     hour_oclock = hour + insert_space + pynutil.insert("o'clock") + optional_zone
     hour_suffix = hour + delete_space + insert_space + suffix + optional_zone
     graph = hour_minute | hour_oclock | hour_suffix | graph_hms

     self.fst = self.delete_tokens(graph).optimize()
Example #9
0
    def __init__(self, deterministic: bool = True):
        """Builds the "date" verbalizer and stores the result in `self.fst`.

        Two token orders are accepted: day-month-[year] (joined with " de ")
        and month-day-[year]. Their union is also kept in `self.graph`.

        Args:
            deterministic: when True, a day of "uno" is always rendered as
                "primero" and the month-day order additionally requires an
                explicit " preserve_order: true"; when False both day
                renderings are allowed.
        """
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        value = pynini.closure(NEMO_NOT_QUOTE, 1)
        closing_quote = pynutil.delete("\"")

        day = strip_cardinal_apocope(pynutil.delete("day: \"") + value + closing_quote)

        # Primero for first day is traditional, but will vary depending on region
        primero = pynini.cdrewrite(pynini.cross("uno", "primero"), "[BOS]", "[EOS]", NEMO_SIGMA)
        if deterministic:
            day = day @ primero
        else:
            day = pynini.union(day, day @ primero)

        month = pynutil.delete("month: \"") + value + closing_quote

        # Year that already carries an article followed by a space: kept as is and slightly preferred.
        year_open = pynutil.delete("year: \"")
        year_with_article = pynutil.add_weight(
            year_open + articles + NEMO_SPACE + value + closing_quote, -0.001
        )
        # Otherwise the preposition was not part of the year, so insert it.
        year_bare = year_open + pynutil.insert("de ") + value + closing_quote
        year = year_with_article | year_bare

        # day month year
        graph_dmy = day + pynini.cross(NEMO_SPACE, " de ") + month + pynini.closure(pynini.accep(" ") + year, 0, 1)

        # month day year
        graph_mdy = month + NEMO_SPACE + day + pynini.closure(NEMO_SPACE + year, 0, 1)
        if deterministic:
            # Only accepts this if was explicitly passed
            graph_mdy = graph_mdy + pynutil.delete(" preserve_order: true")

        self.graph = graph_dmy | graph_mdy
        unwrapped = self.delete_tokens(self.graph + delete_preserve_order)
        self.fst = unwrapped.optimize()
Example #10
0
def get_hundreds_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    The result is the union of four alternatives: ties + ties, teen + "hundred",
    a pluralized teen decade ("...0s"), and a slightly favoured
    "<digit> thousand [<digit>]" reading.

    Args:
        deterministic: passed to `get_ties_graph`.

    Returns:
        The (non-optimized) union graph.
    """
    graph_ties = get_ties_graph(deterministic)

    ties_pair = graph_ties + insert_space + graph_ties
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # Decades written with a trailing "0s"; the verbalization is pluralized at the end of the string.
    # NOTE(review): `ties_graph` is not the local `graph_ties`; presumably a module-level graph -- confirm.
    teen_decade = (graph_teen + insert_space +
                   (ties_graph | pynini.cross("1", "ten")) +
                   pynutil.delete("0s"))
    pluralize = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]",
        NEMO_SIGMA)
    plural_decade = teen_decade @ pluralize

    # Negative weight makes the "thousand" reading preferred where it applies.
    thousands = pynutil.add_weight(
        graph_digit + insert_space + pynini.cross("00", "thousand") +
        (pynutil.delete("0") | (insert_space + graph_digit)),
        weight=-0.001,
    )

    graph = ties_pair | teen_hundred | plural_decade | thousands
    return graph
Example #11
0
    def __construct_r14(self):
        '''
        e-epenthesis 2

        Rewrites "<DEL-S>" to "e" when it directly follows "d" or "t"
        (optionally followed by "m"), or follows "t w". Returns the
        optimized rewrite rule.
        '''
        with pynini.default_token_type(self.__syms.alphabet):

            # Marker symbols that may occur alongside ordinary characters.
            markers = pynini.string_map([
                "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
            ]).project("input")
            alphabet = pynini.union(self.__syms.characters, markers)

            tau = pynini.cross("<DEL-S>", "e")

            # Left context: (d|t) m?  or  "t w".
            dental = pynini.string_map(["d", "t"]).project("input")
            optional_m = pynini.accep("m").closure(0, 1)
            left_context = pynini.union(pynini.concat(dental, optional_m),
                                        pynini.accep("t w"))

            rule = pynini.cdrewrite(tau, left_context, "", alphabet.closure())
            return rule.optimize()
Example #12
0
    def __init__(self):
        """Builds the "telephone" classifier and stores the result in `self.fst`.

        The number part is a dash-separated 3-3-4 digit group layout related
        to space-separated digit words; the relation is inverted, so the
        words are on the input side.
        """
        super().__init__(name="telephone", kind="classify")
        # country code, number_part, extension
        digit_to_str = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))

        # For each digit, relate "<word> <word>" to "double <word>", then invert the relation in place.
        # NOTE(review): presumably this lets "double X" be read as "X X" -- confirm.
        repeats = []
        for i in range(10):
            spoken = pynini.project(str(i) @ digit_to_str, "output")
            repeats.append(
                pynini.cross(
                    spoken + pynini.accep(" ") + spoken,
                    pynutil.insert("double ") + spoken,
                )
            )
        double_digit = pynini.union(*repeats)
        double_digit.invert()

        # Digit groups: all but the last digit of a group are followed by an inserted space.
        dash = pynutil.delete("-")
        two_spaced = pynini.closure(digit_to_str + insert_space, 2, 2)
        three_spaced = pynini.closure(digit_to_str + insert_space, 3, 3)
        digits_to_words = (
            two_spaced
            + digit_to_str
            + dash
            + insert_space
            + two_spaced
            + digit_to_str
            + dash
            + insert_space
            + three_spaced
            + digit_to_str
        )

        expand_doubles = pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA)
        number_part = (
            pynutil.insert("number_part: \"")
            + (expand_doubles @ pynini.invert(digits_to_words))
            + pynutil.insert("\"")
        )

        self.fst = self.add_tokens(number_part).optimize()
Example #13
0
def _rw_ph_context(gr,
                   in_ph,
                   out_ph,
                   post_str=c.EPSILON,
                   pre_str=c.EPSILON) -> p.Fst:
    """Rewrites the phoneme assigned to a grapheme in context.

    In an alignment (gr=in_ph), changes the assigned phoneme, resulting in
    (gr=out_ph), based on the context such as post-nasal or word final.

    Following call

    ```
    _rw_ph_context('ans', 'nsl', 'm', pre_str=labial)
    ```
    is equivalent to:
    ```
    p.cdrewrite(p.cross(c.L_BOUND + 'ans' + c.ASSIGN + 'nsl' + c.R_BOUND,
                        c.L_BOUND + 'ans' + c.ASSIGN + 'm' + c.R_BOUND),
                c.EPSILON,
                labial,
                c.SIGMA_STAR).optimize()
    ```

    Args:
      gr: grapheme, ans in the example
      in_ph: input phoneme, nsl in the example
      out_ph: output phoneme, m in the example
      post_str: preceding (left) context, passed to the rule as is; e.g. if
        nasal, the rule is post-nasal
      pre_str: following (right) context, passed to the rule as is; e.g. if
        labial, the rule is pre-labial

    Returns:
      Rewrite rule FST.
    """

    def _alignment(ph):
        # One aligned unit: boundary, grapheme, assignment mark, phoneme, boundary.
        return c.L_BOUND + gr + c.ASSIGN + ph + c.R_BOUND

    tau = p.cross(_alignment(in_ph), _alignment(out_ph))
    rule = p.cdrewrite(tau, post_str, pre_str, c.SIGMA_STAR)
    return rule.optimize()
Example #14
0
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two entries:
            'one_thousand_alternative': rewrite applied at the start of the
                string, mapping the second word of each alternative spelling
                to the full alternative.
            'separators': weighted union of the 'dot_thousands',
                'no_delimiter' and 'space_thousands' entries of the
                thousands-punctuation FAR (no delimiter is the cheapest).
    """
    rows = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Each row is (default, alternative); only the alternative is used.
    pairs = [(alternative.split()[1], alternative)
             for _default, alternative in rows]
    one_thousand_alternative = pynini.cdrewrite(pynini.string_map(pairs),
                                                "[BOS]", "", NEMO_SIGMA)

    far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    dot_separated = pynutil.add_weight(far['dot_thousands'], 0.1)
    undelimited = pynutil.add_weight(far['no_delimiter'], -0.1)
    space_separated = pynutil.add_weight(far['space_thousands'], 0.1)
    separators = dot_separated | undelimited | space_separated

    return {
        'one_thousand_alternative': one_thousand_alternative,
        'separators': separators,
    }
Example #15
0
def _get_year_graph():
    """
    Transducer for year. The input is restricted to "1" or "2" followed by
    three digits and an optional trailing "s", i.e. only 1000 - 2999.
    Inputs of the form <digit>00<digit> (e.g. 2000 - 2009) are preferably
    verbalized with "thousand".
    """

    graph_ties = _get_ties_graph()

    ties_pair = graph_ties + insert_space + graph_ties
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # Decades written with a trailing "0s"; pluralized at the end of the string.
    # NOTE(review): `ties_graph` is not the local `graph_ties`; presumably a module-level graph -- confirm.
    teen_decade = (
        graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")) + pynutil.delete("0s")
    )
    pluralize = pynini.cdrewrite(pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
    plural_decade = teen_decade @ pluralize

    # Negative weight makes the "thousand" reading preferred where it applies.
    thousands = pynutil.add_weight(
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | (insert_space + graph_digit)),
        weight=-0.001,
    )

    verbalizer = ties_pair | teen_hundred | plural_decade | thousands

    # Accept only inputs shaped like 1xxx / 2xxx with an optional "s".
    year_shape = pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + pynini.closure("s", 0, 1)
    return year_shape @ verbalizer
Example #16
0
def get_quantity(decimal_graph: 'pynini.FstLike',
                 cardinal_graph: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. 2 millones -> integer_part: "dos" quantity: "millones"
    e.g. 2,4 millones -> integer_part: "dos" fractional_part: "cuatro" quantity: "millones"

    Cardinals may contain `cardinal_separator`, which is removed before verbalization,
    e.g. 2.400 millones -> integer_part: "dos mil cuatrocientos" quantity: "millones"
    (assuming the separator is "." -- TODO confirm).

    Args:
        decimal_graph: DecimalFST
        cardinal_graph: CardinalFST
    """
    # One to six digits, i.e. limit to 10^6 power.
    up_to_six_digits = pynini.closure(NEMO_DIGIT, 1, 6) @ cardinal_graph
    drop_separator = pynini.cdrewrite(pynutil.delete(cardinal_separator), "", "",
                                      NEMO_SIGMA)
    numbers = drop_separator @ up_to_six_digits

    # The cardinal we're passing only produces 'un' for one, so gender agreement is safe (all quantities are masculine).
    cardinal_quantity = (
        pynutil.insert("integer_part: \"") + numbers + pynutil.insert("\"") +
        NEMO_SPACE + pynutil.insert("quantity: \"") + quantities +
        pynutil.insert("\""))
    decimal_quantity = (decimal_graph + NEMO_SPACE +
                        pynutil.insert("quantity: \"") + quantities +
                        pynutil.insert("\""))
    return cardinal_quantity | decimal_quantity
Example #17
0
    def __init__(self):
        """Compiles the cable attribute rewrite rules into `self.rules`.

        Each rule replaces a match (digits, or a spaced dot) with an attribute
        label when it occurs between a given left and right context.
        """

        super().__init__()

        def in_context(tau, left, right):
            # Compiles and optimizes `tau` restricted to the given contexts over self.alphabet.
            return pynini.cdrewrite(tau, left, right, self.alphabet).optimize()

        # Replacements: what is matched -> the label it is replaced with.
        digits_to_cores = pynini.transducer(self.cable_digits, '#жил')
        digits_to_length = pynini.transducer(self.cable_digits, 'Длина_кабеля')
        digits_to_diameter = pynini.transducer(self.cable_digits, 'Диаметр')
        digits_to_wires = pynini.transducer(self.cable_digits, '#соединительных_проводов')
        dot_to_diameter = pynini.transducer(' . ', ' Диаметр ')

        cores_rule = in_context(digits_to_cores, self.ngram_comb, self.cable_splitters)
        wires_rule = in_context(digits_to_wires, self.cable_splitters, self.cable_splitters)

        # The diameter is handled in three passes that are composed below.
        diameter_before_length = in_context(digits_to_diameter, self.cable_floats, self.cable_length_0)
        diameter_before_float = in_context(digits_to_diameter, self.cable_splitters, self.cable_floats)
        diameter_dot = in_context(dot_to_diameter, self.cable_digits, self.cable_digits)

        length_rule = in_context(digits_to_length, self.cable_length_0, self.cable_length_1)

        first_two_passes = pynini.compose(diameter_before_length, diameter_before_float).optimize()
        diameter_rule = pynini.compose(first_two_passes, diameter_dot).optimize()

        self.rules = {
            'жилы': cores_rule,
            'соединительные_провода': wires_rule,
            'диаметр': diameter_rule,
            'длина_кабеля': length_rule
        }
Example #18
0
    def __init__(self):
        """Compiles the cable attribute rewrite rules into `self.rules`.

        Each rule replaces a match (digits, or a dot) with an attribute label
        when it occurs between a given left and right context.
        """
        super().__init__()

        def in_context(tau, left, right):
            # Compiles and optimizes `tau` restricted to the given contexts over self.alphabet.
            return pynini.cdrewrite(tau, left, right, self.alphabet).optimize()

        # Replacements: what is matched -> the label it is replaced with.
        digits_to_cores = pynini.transducer(self.cable_digits, '#жил')
        digits_to_section = pynini.transducer(self.cable_digits, 'Сечение_кабеля')
        digits_to_length = pynini.transducer(self.cable_digits, 'Длина_кабеля')
        dot_to_section = pynini.transducer('.', 'Сечение_кабеля')

        cores_rule = in_context(digits_to_cores, pynini.union(" "), self.cable_splitters)

        # The first cross-section variant is handled in three passes that are composed below.
        section_before_float = in_context(digits_to_section, self.cable_splitters, self.cable_floats)
        section_before_length = in_context(digits_to_section, self.cable_floats, self.cable_length_0)
        section_dot = in_context(dot_to_section, self.cable_digits, self.cable_digits)

        length_rule = in_context(digits_to_length, self.cable_length_0, self.cable_length_1)

        # Second cross-section variant: digits directly between a splitter and the length marker.
        section_alt_rule = in_context(digits_to_section, self.cable_splitters, self.cable_length_0)

        first_two_passes = pynini.compose(section_before_float, section_before_length).optimize()
        section_rule = pynini.compose(first_two_passes, section_dot).optimize()

        self.rules = {
            'жилы': cores_rule,
            'сечение_кабеля_0': section_rule,
            'длина_кабеля': length_rule,
            'сечение_кабеля_1': section_alt_rule
        }
Example #19
0
def _composed_typ() -> p.Fst:
  """Maps multiple ISO characters to single native characters.

  Every stage is an unconditioned rewrite over `c.SIGMA_STAR`. The stages are
  composed in the order listed, so a later stage operates on the output of the
  earlier ones (for instance '(l_vocal)' produced by the vocalic stage is
  consumed by the retroflex vocalic stage).

  Returns:
    The optimized composition of all stages.
  """

  def _stage(*pairs):
    """Compiles one optimized rewrite from (input, output) string pairs."""
    tau = None
    for source, target in pairs:
      mapping = p.cross(source, target)
      tau = mapping if tau is None else tau | mapping
    return p.cdrewrite(tau, '', '', c.SIGMA_STAR).optimize()

  stages = (
      # Diphthongs.
      _stage(('(a)(i)', '(ai)'),
             ('(a)(u)', '(au)')),
      # Vocalic l / r.
      _stage(('(l)(vocal)', '(l_vocal)'),
             ('(r)(vocal)', '(r_vocal)')),
      # Long (retroflex) vocalics, built on the vocalic output above.
      _stage(('(l_vocal)(long)', '(ll_vocal)'),
             ('(r_vocal)(long)', '(rr_vocal)')),
      # Independent vowels.
      _stage(('(ind)(a)', '(a_i)'),
             ('(ind)(aa)', '(aa_i)'),
             ('(ind)(ac)', '(ac_i)'),
             ('(ind)(e)', '(e_i)'),
             ('(ind)(ee)', '(ee_i)'),
             ('(ind)(ec)', '(ec_i)'),
             ('(ind)(i)', '(i_i)'),
             ('(ind)(ii)', '(ii_i)'),
             ('(ind)(o)', '(o_i)'),
             ('(ind)(oo)', '(oo_i)'),
             ('(ind)(oc)', '(oc_i)'),
             ('(ind)(u)', '(u_i)'),
             ('(ind)(uu)', '(uu_i)'),
             ('(ind)(ai)', '(ai_i)'),
             ('(ind)(au)', '(au_i)'),
             ('(ind)(l_vocal)', '(l_vocal_i)'),
             ('(ind)(ll_vocal)', '(ll_vocal_i)'),
             ('(ind)(r_vocal)', '(r_vocal_i)'),
             ('(ind)(rr_vocal)', '(rr_vocal_i)')),
      # Aspirated consonants.
      _stage(('(b)(asp)', '(bh)'),
             ('(c)(asp)', '(ch)'),
             ('(d)(asp)', '(dh)'),
             ('(dd)(asp)', '(ddh)'),
             ('(g)(asp)', '(gh)'),
             ('(j)(asp)', '(jh)'),
             ('(k)(asp)', '(kh)'),
             ('(p)(asp)', '(ph)'),
             ('(rd)(asp)', '(rdh)'),
             ('(t)(asp)', '(th)'),
             ('(tt)(asp)', '(tth)')),
      # Candrabindu.
      _stage(('(m)(candra)', '(cnd)')),
      # Malayalam chillu characters
      _stage(('(k)(chl)', '(k_chl)'),
             ('(l)(chl)', '(l_chl)'),
             ('(ll)(chl)', '(ll_chl)'),
             ('(n)(chl)', '(n_chl)'),
             ('(nn)(chl)', '(nn_chl)'),
             ('(rr)(chl)', '(rr_chl)'),
             ('(r)(chl)', '(reph)')),
      # Marathi eyelash ra
      _stage(('(r)(eye)', '(reye)')),
      # Om.
      _stage(('(ot)(m)', '(om)')),
  )

  # Left-to-right composition of all stages.
  composed = stages[0]
  for stage in stages[1:]:
    composed = composed @ stage
  return composed.optimize()
Example #20
0
# Demo script: tags cheese names in a sentence with a context-free rewrite,
# then defines a small plural -> singular mapping.

# Alphabet for the rewrite: code points 1-90 and 94-255 as plain characters;
# 91-93 ("[", "\" and "]") are supplied in backslash-escaped form instead.
chars = ([chr(i) for i in range(1, 91)] + ["\\[", "\\]", "\\\\"] +
         [chr(i) for i in range(94, 256)])
# Any sequence of the characters above.
sigma_star = pynini.union(*chars).closure()
sigma_star.optimize()

input_string = "Do you have Camembert or Edam?"  # sentence to be tagged
cheeses = ("Boursin", "Camembert", "Cheddar", "Edam", "Gruyere", "Ilchester",
           "Jarlsberg", "Red Leicester", "Stilton")
# Intended tagged form of input_string.
# NOTE(review): lacks the trailing "?" of input_string and is not referenced
# again in the code shown here.
output_string = "Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>"

# Acceptor for any one of the cheese names.
fst_target = pynini.string_map(cheeses)
# Insertions of the opening and closing tag (empty input, tag as output).
ltag = pynini.transducer("", "<cheese>")
rtag = pynini.transducer("", "</cheese>")
# Tag, cheese name, closing tag.
substitution = ltag + fst_target + rtag

# Apply the substitution anywhere (no left or right context restriction).
rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)
# Run the input through the rule and read the result back as a string.
output = pynini.compose(input_string, rewrite).stringify()

#######################################################################################################################

# Plural -> singular mapping: two irregular forms plus two suffix rules.
singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),

    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis a higher priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),

    # Any sequence of bytes ending in "s" strips the "s".
    sigma_star + pynini.transducer("s", ""))
Example #21
0
    def __init__(self, deterministic: bool = True):
        """Builds the "telephone" classifier and stores the result in `self.fst`.

        Dash-separated numbers of ten (3-3-4), nine (3-3-3) and eight (4-4)
        digits are accepted and read digit by digit. When `deterministic` is
        False, readings that group digits in pairs are accepted as well.

        Args:
            deterministic: when False, adds the double-digit readings.
        """
        # NOTE(review): `deterministic` is not forwarded to the parent constructor here -- confirm this is intended.
        super().__init__(name="telephone", kind="classify")

        # Building blocks of possible telephone numbers: one digit at a time ...
        single_digits = pynini.invert(graph_digit).optimize() | pynini.cross(
            "0", "cero")

        # ... or two digits at a time.
        two_digit_words = pynini.union(
            graph_twenties,
            graph_teen,
            graph_ties + pynutil.delete("0"),
            graph_ties + insert_space + pynutil.insert("y") + insert_space + graph_digit,
        )
        double_digits = pynini.invert(two_digit_words)

        dash = pynutil.delete("-")

        def spaced_singles(count):
            # Exactly `count` single digits, each followed by an inserted space.
            return pynini.closure(single_digits + insert_space, count, count)

        # Option (1): all single digits.
        ten_digit_graph = (spaced_singles(3) + dash + spaced_singles(3) + dash +
                           spaced_singles(3) + single_digits)
        nine_digit_graph = (spaced_singles(3) + dash + spaced_singles(3) + dash +
                            spaced_singles(2) + single_digits)
        eight_digit_graph = (spaced_singles(4) + dash + spaced_singles(3) +
                             single_digits)

        if not deterministic:
            # Option (2): spoken with double digits (and sometimes single digits).
            # 10 digits: (1+2) + (1+2) + (2+2)
            ten_digit_graph = ten_digit_graph | (
                single_digits + insert_space + double_digits + insert_space + dash +
                single_digits + insert_space + double_digits + insert_space + dash +
                double_digits + insert_space + double_digits)

            # 9 digits: (1+2) + (1+2) + (1+2)
            nine_digit_graph = nine_digit_graph | (
                single_digits + insert_space + double_digits + insert_space + dash +
                single_digits + insert_space + double_digits + insert_space + dash +
                single_digits + insert_space + double_digits)

            # 8 digits: (2+2) + (2+2)
            eight_digit_graph = eight_digit_graph | (
                double_digits + insert_space + double_digits + insert_space + dash +
                double_digits + insert_space + double_digits)

        any_length = pynini.union(ten_digit_graph, nine_digit_graph,
                                  eight_digit_graph)
        # Normalize every variant matched by `ones` to "uno".
        to_uno = pynini.cdrewrite(pynini.cross(ones, "uno"), "", "", NEMO_SIGMA)
        number_part = any_length @ to_uno

        number_part = pynutil.insert(
            "number_part: \"") + number_part + pynutil.insert("\"")

        self.fst = self.add_tokens(number_part).optimize()
Example #22
0
def Rewrite(rule: pynini.FstLike,
            sigma: pynini.Fst = byte.BYTE,
            left: pynini.FstLike = "",
            right: pynini.FstLike = "") -> pynini.Fst:
    """Compiles an optimized context-dependent rewrite rule.

    Args:
      rule: the transducer describing the rewrite itself.
      sigma: acceptor for the alphabet; its `star` closure is handed to
        `pynini.cdrewrite` as the final argument.
      left: left-hand context of the rewrite (empty string by default).
      right: right-hand context of the rewrite (empty string by default).

    Returns:
      The result of `pynini.cdrewrite` after `pynini.optimize`.
    """
    alphabet_closure = sigma.star
    unoptimized = pynini.cdrewrite(rule, left, right, alphabet_closure)
    return pynini.optimize(unoptimized)
Example #23
0
    def __init__(self, decimal: GraphFst, cardinal: GraphFst,
                 fraction: GraphFst, deterministic: bool):
        """Assembles the verbalizer FST for measure tokens.

        A quantity (cardinal, decimal or fraction graph) is joined to a
        `units: "..."` field, once for masculine and once for feminine units.
        A weighted fallback accepts arbitrary unit text next to a masculine
        cardinal or decimal, in either order.

        Args:
            decimal: decimal verbalizer exposing `graph_masc` / `graph_fem`.
            cardinal: cardinal verbalizer exposing `graph_masc` / `graph_fem`.
            fraction: fraction verbalizer exposing `graph_masc` / `graph_fem`.
            deterministic: forwarded unchanged to the base class.
        """
        super().__init__(name="measure",
                         kind="verbalize",
                         deterministic=deterministic)

        # Quantity graphs with their token wrappers stripped, per gender.
        decimal_masc = decimal.delete_tokens(decimal.graph_masc)
        decimal_fem = decimal.delete_tokens(decimal.graph_fem)
        cardinal_masc = cardinal.delete_tokens(cardinal.graph_masc)
        cardinal_fem = cardinal.delete_tokens(cardinal.graph_fem)
        fraction_fem = fraction.delete_tokens(fraction.graph_fem)
        fraction_masc = fraction.delete_tokens(fraction.graph_masc)

        # "por <anything>" and its optional, whitespace-led form after a unit.
        por_phrase = "por" + pynini.closure(NEMO_NOT_QUOTE, 1)
        optional_por = pynini.closure(NEMO_WHITE_SPACE + por_phrase, 0, 1)

        def quoted_units(allowed_units):
            # Strip the `units: "..."` wrapper and keep only content accepted
            # by `allowed_units`.
            return pynutil.delete("units: \"") + (
                pynini.closure(NEMO_NOT_QUOTE)
                @ allowed_units) + pynutil.delete("\"")

        masc_unit_names = (unit_plural_masc
                           | unit_singular_masc) + optional_por
        # Only the masculine side also accepts a bare "por ..." unit.
        masc_unit_names = masc_unit_names | por_phrase
        unit_masc = quoted_units(masc_unit_names)

        fem_unit_names = (unit_plural_fem | unit_singular_fem) + optional_por
        unit_fem = quoted_units(fem_unit_names)

        def quantity_with_unit(whole, part, unit, half_forms):
            # Cardinal/decimal + unit, fraction + "de " + unit, and a slightly
            # preferred path without "de " when the fraction ends in one of
            # `half_forms` ("medio litro" not "medio de litro").
            plain = whole + NEMO_WHITE_SPACE + unit
            with_de = part + NEMO_WHITE_SPACE + pynutil.insert("de ") + unit
            half = (part @ (NEMO_SIGMA + pynini.union(*half_forms))
                    ) + NEMO_WHITE_SPACE + unit
            return plain | with_de | pynutil.add_weight(half, -0.001)

        masc_quantity = quantity_with_unit(cardinal_masc | decimal_masc,
                                           fraction_masc, unit_masc,
                                           ("medio", "medios"))
        fem_quantity = quantity_with_unit(cardinal_fem | decimal_fem,
                                          fraction_fem, unit_fem,
                                          ("media", "medias"))
        measure = masc_quantity | fem_quantity

        # Append " de" to the content of a quantity field ("billones de xyz").
        add_de_after_quantity = pynini.cdrewrite(
            pynutil.insert(" de"),
            "quantity: \"" + pynini.closure(NEMO_NOT_QUOTE, 1), "\"",
            NEMO_SIGMA)
        measure = add_de_after_quantity @ measure

        # Rewrite `ones` as "uno" when directly followed by whitespace + "por".
        ones_before_por = pynini.cdrewrite(pynini.cross(ones, "uno"), "",
                                           NEMO_WHITE_SPACE + "por",
                                           NEMO_SIGMA)
        measure = measure @ ones_before_por

        # Alphanumeric combinations ("a-8, 5x") take a weighted default path
        # that accepts any unit text on either side of the number.
        any_unit = pynutil.delete("units: \"") + pynini.closure(
            NEMO_NOT_QUOTE) + pynutil.delete("\"")
        alpha_num = pynini.union(
            (cardinal_masc | decimal_masc) + NEMO_SPACE + any_unit,
            any_unit + delete_extra_space + (cardinal_masc | decimal_masc),
        )
        measure = measure | pynutil.add_weight(alpha_num, 0.01)

        measure = measure + delete_preserve_order

        without_tokens = self.delete_tokens(measure)
        self.fst = without_tokens.optimize()
Example #24
0
# Make state 0 the start state of both base transducers
# (original note: ensures the regular path is the default).
transducer_adessive_base.set_start(0)
transducer_inessive_base.set_start(0)

# The return values are discarded, so these calls rely on optimize()
# modifying each transducer in place.
transducer_adessive_base.optimize()
transducer_inessive_base.optimize()

###### Morphophonemic rules ######
# Consonant gradation rules.
# Despite the name, the map covers more than doubled consonants: it also
# rewrites "lk" -> "l" and "t" -> "d".
double_consonants_reduce = pynini.string_map([["kk", "k"], ["pp", "p"],
                                              ["tt", "t"], ["lk", "l"],
                                              ["t", "d"]])
# Courtesy of http://www.lysator.liu.se/language/Languages/Finnish/Grammar.html and https://web.stanford.edu/~kiparsky/Papers/finnish.article.pdf
# The reduction applies after "l", a vowel or "n", and only when a vowel
# followed by a suffix comes next.
consonant_reduction = pynini.cdrewrite(double_consonants_reduce,
                                       "l" | vowels | "n", vowels + suffixes,
                                       closure).optimize()

# Vowel insertion to break consonant clusters caused by suffixes: an "e" is
# inserted between a consonant and a following suffix.
insertion = pynini.cdrewrite(pynini.transducer("", "e"), consonants, suffixes,
                             closure).optimize()

#Finnish seems to attempt preserving morae count with /s/ as a syllabic end.  Generates a stop that assimilates 'highness' of vowel and becomes /k/
#In case this generated stop occurs after VV, it instead assimilates /s/ and becomes /t/.  Then gradation occurs due to /e/ insertion
#Similar situation seemed to occur with /s/ -> /a/ / /a/_ + suffix.  So was added to transducer.
final_stress_preservation = pynini.cdrewrite(
    pynini.transducer("s", "t"), vowels +
    (pynini.acceptor("y") | "u"), suffixes, closure) * pynini.cdrewrite(
        pynini.transducer("", "k"),
        pynini.acceptor("y") | "u",
        "s" + suffixes, closure) * pynini.cdrewrite(
Example #25
0
    def __init__(self, deterministic: bool = True):
        """Builds the verbalizer FST for time tokens.

        Accepted shapes are hours/minutes/seconds, hours/minutes and hours
        alone, each optionally followed by a zone. When a suffix ("am"/"pm")
        is present the hour must belong to one of four day periods and the
        suffix is replaced by that period's phrase.

        Args:
            deterministic: if True, minutes are always passed through the
                `alt_minutes` rewrite and the twelve o'clock suffixes map to a
                single phrase each. If False, the unrewritten minutes and
                extra twelve o'clock phrases are also allowed, and the
                "menos" (style: "1") and "para las" (style: "2") renderings
                are added.
        """
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        def quoted_field(label):
            # Strip `<label> "` and the closing quote, keeping the content.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
            )

        hour = quoted_field("hours:")
        plain_minute = quoted_field("minutes:")
        suffix = quoted_field("suffix:")
        zone = quoted_field("zone:")
        second = quoted_field("seconds:")
        optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)

        # Whole-string rewrite of the minute value through `alt_minutes`.
        change_minutes = pynini.cdrewrite(alt_minutes, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA)
        if deterministic:
            minute = plain_minute @ change_minutes
        else:
            minute = pynini.union(plain_minute, plain_minute @ change_minutes)

        # Suffix phrases for the 12's.
        if deterministic:
            mid_pairs = [("pm", "del mediodía"), ("am", "de la noche")]
        else:
            mid_pairs = [
                ("pm", "de la mañana"),
                ("pm", "del día"),
                ("pm", "del mediodía"),
                ("am", "de la noche"),
                ("am", "de la medianoche"),
            ]

        # (hours allowed in the period, suffix -> phrase mapping)
        day_periods = (
            (morning_times, pynini.cross("am", "de la mañana")),
            (afternoon_times, pynini.cross("pm", "de la tarde")),
            (evening_times, pynini.cross("pm", "de la noche")),
            (pynini.accep("doce"), pynini.string_map(mid_pairs)),
        )
        # For each period: the hour restricted to that period, and the
        # " <phrase>" tail produced from the suffix field.
        period_variants = [
            (hour @ times, delete_space + insert_space + (suffix @ phrases))
            for times, phrases in day_periods
        ]

        def hour_then_minute(hour_graph, joiner):
            return hour_graph + delete_space + pynutil.insert(joiner) + minute

        def minute_then_hour(hour_graph):
            return minute + pynutil.insert(" para las ") + delete_space + hour_graph

        graph_hms = (
            hour
            + pynutil.insert(" horas ")
            + delete_space
            + minute
            + pynutil.insert(" minutos y ")
            + delete_space
            + second
            + pynutil.insert(" segundos")
        )

        graph_hm = hour_then_minute(hour, " y ") | pynini.union(
            *[hour_then_minute(period_hour, " y ") + phrase_tail for period_hour, phrase_tail in period_variants]
        )

        graph_h = pynini.union(
            hour, *[period_hour + phrase_tail for period_hour, phrase_tail in period_variants]
        )

        graph = (graph_hms | graph_hm | graph_h) + optional_zone

        if not deterministic:
            style_1 = pynutil.delete(" style: \"1\"")
            style_2 = pynutil.delete(" style: \"2\"")

            # "<hour> menos <minute>" renderings.
            graph_menos = hour_then_minute(hour, " menos ") + style_1
            for period_hour, phrase_tail in period_variants:
                graph_menos |= hour_then_minute(period_hour, " menos ") + phrase_tail + style_1
            graph_menos += optional_zone

            # "<minute> para las <hour>" renderings.
            graph_para = minute_then_hour(hour) + style_2
            for period_hour, phrase_tail in period_variants:
                graph_para |= minute_then_hour(period_hour) + phrase_tail + style_2
            graph_para += optional_zone
            # Article must agree with "una": "para la una".
            graph_para @= pynini.cdrewrite(
                pynini.cross(" las ", " la "), "para", "una", NEMO_SIGMA
            )

            graph |= graph_menos | graph_para

        without_tokens = self.delete_tokens(graph + delete_preserve_order)
        self.fst = without_tokens.optimize()
Example #26
0
]

# Each consonant paired with its doubled form; G2P applies this map to a
# consonant that follows `sokuon`.
gemination_map = [(consonant, consonant * 2) for consonant in ("s", "k", "t")]

# Kana whose rendering differs at the beginning of the string; G2P applies
# this map only with "[BOS]" as left context.
monograph_bos_map = list({
    "じ": "dʑi",
    "ず": "dzɯ",
    "ぞ": "dzo",
}.items())

G2P = (
    cdrewrite(cross("は", "ɰɑ"), "[BOS]", "[EOS]", SIGMA_STAR)
    @ cdrewrite(string_map(digraph_bos_map), "[BOS]", "", SIGMA_STAR)
    @ cdrewrite(string_map(digraph_map), "", "", SIGMA_STAR)
    @ cdrewrite(string_map(monograph_bos_map), "[BOS]", "", SIGMA_STAR)
    @ cdrewrite(cross("ん", "n"), "ː", "", SIGMA_STAR)
    @ cdrewrite(cross("ん", "ɴ"), "", "", SIGMA_STAR)
    @ cdrewrite(string_map(long_vowel_map), "", "", SIGMA_STAR)
    @ cdrewrite(string_map(context_free_map), "", "", SIGMA_STAR)
    @ cdrewrite(string_map(nasalization_map), "", "ɴ", SIGMA_STAR)
    @ cdrewrite(
        string_map(devoicing_map),
        union(voiceless_consonants),
        union(voiceless_consonants, "[EOS]"),
        SIGMA_STAR,
    )
    @ cdrewrite(string_map(gemination_map), sokuon, "", SIGMA_STAR)
#measure

# Vowel classes referenced by the harmony rule below.
back_vowel = pynini.union("u", "o", "a")
neutral_vowel = pynini.union("i", "e")
front_vowel = pynini.union("y", "ö", "ä")
vowel = pynini.union(back_vowel, neutral_vowel, front_vowel)
# Upper-case placeholder symbols. Only "A" is rewritten in this section
# (to "a" or "ä", by adessive_harmony).
archiphoneme = pynini.union("A", "I", "E", "O", "U")
consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                         "p", "q", "r", "s", "t", "v", "w", "x", "z")
# Closure over every symbol defined above; passed as the last argument of
# both cdrewrite rules.
sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize()

# Suffix whose final vowel is the unresolved placeholder "A".
adessive = "llA"
# Any run of consonants and neutral vowels: the material allowed between a
# back vowel and the "A" it determines.
intervener = pynini.union(consonant, neutral_vowel).closure()
# Two rules combined with `*`, in this order:
#   1. "A" -> "a" when preceded by a back vowel plus an intervener run;
#   2. "A" -> "ä" everywhere (no context restriction).
# NOTE(review): `*` is presumably the composition operator and `pynini.t` a
# short alias of `pynini.transducer` in the pynini version this targets —
# confirm against the installed release.
adessive_harmony = (
    pynini.cdrewrite(pynini.transducer("A", "a"), back_vowel + intervener, "",
                     sigma_star) *
    pynini.cdrewrite(pynini.t("A", "ä"), "", "", sigma_star)).optimize()


def make_adessive(stem):
    """Appends the adessive suffix to `stem` and applies the harmony rules.

    Args:
      stem: the stem to which `adessive` is appended with `+`.

    Returns:
      The value of `.stringify()` on the suffixed form after it has been
      combined with `adessive_harmony` through `*`.
    """
    suffixed = stem + adessive
    harmonized = suffixed * adessive_harmony
    return harmonized.stringify()


# Example invocation; the returned value is not stored or printed here.
make_adessive("training")

singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis
    # a higher priority, if it matches the input.
Example #28
0
    def __init__(self, deterministic: bool = True):
        """Builds the verbalizer FST for fraction tokens.

        The graph reads an optional `integer_part`, a `numerator` and a
        `denominator` field and produces masculine and feminine readings,
        stored separately as `self.graph_masc` / `self.graph_fem`, jointly as
        `self.graph`, and, with token wrappers removed, as `self.fst`.

        Args:
            deterministic: when False, additional alternatives are added:
                the "undécimo"/"duodécimo" exceptions, feminine "parte(s)"
                readings, feminine cardinal ("sobre") readings and unmerged
                stem spellings.
        """
        super().__init__(name="fraction",
                         kind="verbalize",
                         deterministic=deterministic)

        # Derivational strings append 'avo' as a suffix. Adding space for processing aid
        fraction_stem = pynutil.insert(" avo")
        plural = pynutil.insert("s")
        conjunction = pynutil.insert(" y ")

        # Content of the integer_part field, passed through
        # strip_cardinal_apocope (helper defined outside this method).
        integer = (pynutil.delete("integer_part: \"") +
                   strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE)) +
                   pynutil.delete("\""))

        # The numerator is split into exactly "un" and everything else, so
        # singular and plural denominators can be selected separately. Both
        # variants also consume the quote-plus-space that ends the field.
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
            "un") + pynutil.delete("\" ")
        numerator = (pynutil.delete("numerator: \"") +
                     pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un") +
                     pynutil.delete("\" "))

        # Three denominator shapes, told apart by the trailing
        # morphosyntactic_features field:
        #   "add_root" -> content followed by the inserted " avo" stem
        #   "ordinal"  -> content kept as is
        #   (none)     -> content kept as is (used for the "sobre" reading)
        denominator_add_stem = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + fraction_stem +
            pynutil.delete("\" morphosyntactic_features: \"add_root\""))
        denominator_ordinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) +
            pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
        denominator_cardinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))

        denominator_singular = pynini.union(denominator_add_stem,
                                            denominator_ordinal)
        if not deterministic:
            # Occasional exceptions
            denominator_singular |= denominator_add_stem @ pynini.string_map(
                [("once avo", "undécimo"), ("doce avo", "duodécimo")])
        # The plural is the singular with "s" inserted at the end.
        denominator_plural = denominator_singular + plural

        # Merging operations
        merge = pynini.cdrewrite(
            pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
        )  # The denominator must be a single word, with the conjunction "y" replaced by i
        # Second step: apply delete_space in front of the given right context.
        # NOTE(review): the difference with "parte" looks intended to keep the
        # space before "parte" — confirm it has that effect on NEMO_CHAR.
        merge @= pynini.cdrewrite(delete_space, "",
                                  pynini.difference(NEMO_CHAR, "parte"),
                                  NEMO_SIGMA)

        # The merger can produce duplicate vowels. This is not allowed in orthography
        delete_duplicates = pynini.string_map([("aa", "a"),
                                               ("oo", "o")])  # Removes vowels
        delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "",
                                             NEMO_SIGMA)

        # Apply `accents` (defined outside this method) inside a word that
        # ends in one of the listed suffixes; the left context anchors the
        # rewrite to the same word (after a space or the string start).
        remove_accents = pynini.cdrewrite(
            accents,
            pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) +
            pynini.closure(NEMO_NOT_SPACE),
            pynini.closure(NEMO_NOT_SPACE) +
            pynini.union("avo", "ava", "ésimo", "ésima"),
            NEMO_SIGMA,
        )
        merge_into_single_word = merge @ remove_accents @ delete_duplicates

        # numerator (not "un") + plural denominator, merged into one word.
        fraction_default = numerator + delete_space + insert_space + (
            denominator_plural @ merge_into_single_word)

        # numerator "un" + singular denominator, merged into one word.
        fraction_with_one = (numerator_one + delete_space + insert_space +
                             (denominator_singular @ merge_into_single_word))

        # "<numerator> sobre <denominator>" reading with cardinal denominators.
        fraction_with_cardinal = strip_cardinal_apocope(numerator
                                                        | numerator_one)
        fraction_with_cardinal += (
            delete_space + pynutil.insert(" sobre ") +
            strip_cardinal_apocope(denominator_cardinal))

        if not deterministic:
            # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
            # Other rules will manage use of "un" at end, so just worry about endings
            exceptions = pynini.string_map([("tercia", "tercera")])
            apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
            # Final "o" -> "a" (only at the end of the string).
            vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                            pynini.accep("[EOS]"), NEMO_SIGMA)

            denominator_singular_fem = shift_cardinal_gender(
                denominator_singular) @ vowel_change @ apply_exceptions
            denominator_plural_fem = denominator_singular_fem + plural

            numerator_one_fem = shift_cardinal_gender(numerator_one)
            numerator_fem = shift_cardinal_gender(numerator)

            fraction_with_cardinal |= (
                (numerator_one_fem | numerator_fem) + delete_space +
                pynutil.insert(" sobre ") +
                shift_cardinal_gender(denominator_cardinal))

            # Still need to manage stems
            merge_stem = pynini.cdrewrite(
                delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
                NEMO_SIGMA)  # For managing alternative spacing
            merge_stem @= remove_accents @ delete_duplicates

            fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
            fraction_with_one_fem += pynini.union(
                denominator_singular_fem @ merge_stem, denominator_singular_fem
                @ merge_into_single_word)  # Both forms exists
            fraction_with_one_fem += pynutil.insert(" parte")
            fraction_with_one_fem @= pynini.cdrewrite(
                pynini.cross("una media", "media"), "", "",
                NEMO_SIGMA)  # "media" not "una media"

            fraction_default_fem = numerator_fem + delete_space + insert_space
            fraction_default_fem += pynini.union(
                denominator_plural_fem @ merge_stem,
                denominator_plural_fem @ merge_into_single_word)
            fraction_default_fem += pynutil.insert(" partes")

            # `@` binds tighter than `+`, so only the denominator is composed
            # with merge_stem in the two unparenthesised expressions below.
            fraction_default |= (numerator + delete_space + insert_space +
                                 denominator_plural @ merge_stem
                                 )  # Case of no merger
            fraction_default |= fraction_default_fem

            fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
            fraction_with_one |= fraction_with_one_fem

        fraction_with_one @= pynini.cdrewrite(pynini.cross(
            "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"

        fraction = fraction_with_one | fraction_default | fraction_with_cardinal
        # Optional "<integer> y " prefix followed by the fraction.
        graph_masc = pynini.closure(integer + delete_space + conjunction, 0,
                                    1) + fraction

        # Manage cases of fem gender (only shows on integer except for "medio")
        # fraction_default and fraction_with_one are extended only after
        # graph_masc has been built, so the additions below reach graph_fem
        # but not graph_masc.
        integer_fem = shift_cardinal_gender(integer)
        fraction_default |= (
            shift_cardinal_gender(numerator) + delete_space + insert_space +
            (denominator_plural @ pynini.cross("medios", "medias")))
        # Here the whole numerator_one field is deleted, leaving just "media".
        fraction_with_one |= (
            pynutil.delete(numerator_one) + delete_space +
            (denominator_singular @ pynini.cross("medio", "media")))

        fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
        graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0,
                                   1) + fraction_fem

        # Optimized per-gender graphs are stored on the instance.
        self.graph_masc = pynini.optimize(graph_masc)
        self.graph_fem = pynini.optimize(graph_fem)

        self.graph = graph_masc | graph_fem

        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
Example #29
0
    def __init__(self):
        """Builds the classifier FST that maps spelled-out cardinals to digits.

        A number is read as a chain of three-digit groups, from "sextillion"
        down to "thousand" and then the final group; a missing group
        contributes "000" at a small cost. Leading zeros are stripped
        afterwards.

        Attributes set:
            graph_hundred_component_at_least_one_none_zero_digit: three-digit
                group containing at least one non-zero digit.
            graph_no_exception: the full word-to-digit graph.
            graph: `graph_no_exception` with the inputs `num_to_word(0)`
                through `num_to_word(12)` removed.
            fst: optimized token-wrapped graph, with an optional leading
                "minus" turned into a `negative: "-"` field.
        """
        super().__init__(name="cardinal", kind="classify")
        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        drop_hundred = pynini.cross("hundred", "")

        # Three-digit group: hundreds digit (or an inserted "0"), followed by
        # either a teen / inserted "00" or a tens digit plus a units digit,
        # each of which falls back to an inserted "0".
        hundreds_digit = pynini.union(digit + delete_space + drop_hundred, pynutil.insert("0"))
        tens_and_units = pynini.union(
            teen | pynutil.insert("00"),
            (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
        )
        three_digits = hundreds_digit + delete_space + tens_and_units

        # Same group, restricted to outputs with at least one non-zero digit.
        has_nonzero_digit = pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        nonzero_three_digits = three_digits @ has_nonzero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_three_digits

        def magnitude_group(word):
            # "<group> <word>" with the word deleted, or "000" at a small
            # cost when the magnitude is absent.
            return pynini.union(
                nonzero_three_digits + delete_space + pynutil.delete(word),
                pynutil.insert("000", weight=0.1),
            )

        magnitude_words = (
            "sextillion",
            "quintillion",
            "quadrillion",
            "trillion",
            "billion",
            "million",
            "thousand",
        )
        groups = [magnitude_group(word) for word in magnitude_words]

        # Chain the groups from largest to smallest, then the final group.
        spelled_number = groups[0]
        for group in groups[1:]:
            spelled_number = spelled_number + delete_space + group
        spelled_number = spelled_number + delete_space + three_digits

        number = pynini.union(spelled_number, zero)

        # Drop leading zeros; a lone "0" is kept.
        strip_leading_zeros = pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )
        number = number @ strip_leading_zeros

        excluded_inputs = pynini.union(*[num_to_word(value) for value in range(0, 13)])

        # Delete "and" between spaces and require the input to begin with a
        # NEMO_ALPHA symbol before handing it to the number graph.
        drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        starts_with_alpha = NEMO_ALPHA + NEMO_SIGMA
        number = drop_and @ starts_with_alpha @ number

        self.graph_no_exception = number

        # Remove the excluded inputs from the domain of the graph.
        allowed_inputs = pynini.project(number, "input") - excluded_inputs.arcsort()
        self.graph = allowed_inputs @ number

        optional_minus = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
        )

        tagged = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

        tagged = self.add_tokens(tagged)
        self.fst = tagged.optimize()
Example #30
0
                  "n", "ɲ", "o", "p", "r", "ɾ", "s", "ʃ", "t", "u", "w", "x",
                  "z")
# Closure over the union of _g and _p; passed as the last argument of every
# rewrite rule below.
_sigma_star = pynini.union(_g, _p).closure().optimize()

# Rules.
# _r1: context-free replacements of digraphs ("ch", "ll", "qu"), single
# letters ("j", "ñ", "v", "x", "y") and accented or dieresis vowels.
_r1 = pynini.cdrewrite(
    pynini.string_map([
        ("ch", "tʃ"),
        ("ll", "ʝ"),
        ("qu", "k"),
        ("j", "x"),
        ("ñ", "ɲ"),
        ("v", "b"),
        ("x", "s"),
        ("y", "j"),
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ü", "w"),
    ]),
    "",
    "",
    _sigma_star,
).optimize()
# _r2: delete every "h", in any context.
_r2 = pynini.cdrewrite(pynutil.delete("h"), "", "", _sigma_star).optimize()
# The five plain vowels, used as context by _r3.
_v = pynini.union("a", "e", "i", "o", "u")
# _r3: "r" -> "ɾ" when it stands between two vowels.
_r3 = pynini.cdrewrite(pynini.cross("r", "ɾ"), _v, _v, _sigma_star).optimize()
# _r4: "rr" -> "r", in any context.
_r4 = pynini.cdrewrite(pynini.cross("rr", "r"), "", "", _sigma_star).optimize()
_r5 = pynini.cdrewrite(pynini.string_map([("c", "s"), ("g", "x")]), "",