Example #1
0
  def __construct_compound_filter(self):
    '''
    Construct the compound filter
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>"]).project("input"),
          self.__syms.stem_types,
          pynini.cross(self.__syms.categories, ""),
          pynini.cross(self.__syms.origin_features, ""),
          pynini.cross("<NoPref>", "")
          )

      return pynini.concat(
          pynini.union(
            pynini.cross("<Initial>", ""),
            pynini.accep("<NoHy>"),
            pynini.accep("<NoDef>")
            ).closure(0,1),
          pynini.concat(
            pynini.union(
              pynini.concat(
                alphabet.closure(),
                pynini.cross(pynini.string_map(["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>", "<V>", "<ORD>", "<OTHER>"]).project("input"), "")
                ),
              pynini.concat(
                pynini.cross("", "<VADJ>"),
                pynini.concat(
                  pynini.union(
                    alphabet,
                    pynini.cross("<kompos>", "")
                    ).closure(),
                  pynini.concat(
                    pynini.cross("<kompos>", ""),
                    pynini.concat(
                      alphabet.closure(),
                      pynini.cross("<V>", "")
                      )
                    )
                  )
                ),
              pynini.concat(
                pynini.union(
                  alphabet,
                  pynini.cross("<kompos>", "")
                  ).closure(),
                pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
                )
              ),
            pynini.concat(
              pynini.cross("<base>", ""),
              pynini.concat(
                pynini.cross(self.__syms.origin_features, ""),
                self.__syms.inflection_classes
                )
              )
            )
          ).optimize()
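
A hypothetical usage sketch (not part of the class above): a filter like this is normally applied by composing it with an analysis lattice built over the same symbol table.

import pynini

def apply_compound_filter(lattice: pynini.Fst, compound_filter: pynini.Fst) -> pynini.Fst:
    # keep only those paths of the lattice that the filter accepts;
    # both FSTs must share the same symbol table for the composition to succeed
    return pynini.compose(lattice, compound_filter).optimize()
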
Example #2
0
  def __construct_tail(self):
    '''
    Define possible final sequences of a derivation
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # C1
      initial_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<Pref_Stems>"]).project("input")
        ).closure()
      # C2
      intermediate_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>", "<ge>", "<Suff_Stems>"]).project("input")
        ).closure()

      # C3
      final_stuff = pynini.union(
        self.__syms.characters,
        pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>"]).project("input"),
        self.__syms.categories,
        self.__syms.stem_type_features,
        self.__syms.origin_features,
        pynini.string_map(["<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>", "<NSMasc_es_e>", "<NSMasc_es_$e>", "<NSMasc-s/$sse>"]).project("input")
        ).closure()

      # TAIL
      tail1 = initial_stuff + self.__syms.base_stem_types + intermediate_stuff
      return pynini.concat(tail1.closure(0,1) + final_stuff, self.__syms.inflection_classes.closure(0,1)).optimize()
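
The closure(0, 1) call makes tail1 optional; a minimal byte-level illustration, independent of the symbol table used above:

import pynini

# closure(0, 1) yields zero or one occurrence of the machine
optional_tail = pynini.accep("xyz").closure(0, 1)
assert pynini.shortestpath(pynini.accep("xyz") @ optional_tail).string() == "xyz"
# the empty string is accepted as well
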
Example #3
0
    def __construct_r21(self):
        '''
        Low to up: delete "<^UC>" markers and uppercase the following symbol
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # debug output: dump the to_upper transducer for inspection
        self.__syms.to_upper.draw("to_upper.dot")
        # The construction in SFST involves negation (which is expensive).
        # It looks like we can do better:
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<^UC>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_upper))).closure(),
                           push_labels=True).optimize()
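
A byte-level sketch of the same marker-plus-case-mapping idea; to_upper here is an ASCII stand-in for self.__syms.to_upper, and the marker is treated as a plain character sequence:

import string
import pynini

# stand-in for to_upper: map each lowercase ASCII letter to its uppercase form
to_upper = pynini.string_map(zip(string.ascii_lowercase, string.ascii_uppercase)).optimize()
letters = pynini.union(*string.ascii_lowercase).optimize()
# delete the "<^UC>" marker and uppercase the letter that follows it
rule = pynini.union(letters, pynini.cross("<^UC>", "") + to_upper).closure().optimize()
out = pynini.shortestpath(pynini.accep("<^UC>berlin") @ rule)
assert out.project("output").rmepsilon().string() == "Berlin"
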
Example #4
0
    def __construct_r20(self):
        '''
        Up to low: delete "<CB>" markers and lowercase the following symbol
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<^UC>", "<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        #
        # SFST uses a rewrite rule here
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<CB>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<^UC>", "<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_lower))).closure(),
                           push_labels=True).optimize()
Example #5
0
    def __construct_r14(self):
        '''
        e-epenthesis 2: realize "<DEL-S>" as "e" after "d"/"t" (optionally followed by "m") or after "t w"
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        tau = pynini.transducer("<DEL-S>",
                                "e",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet)
        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(
                        ["d", "t"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    pynini.acceptor("m",
                                    token_type=self.__syms.alphabet).closure(
                                        0, 1)),
                pynini.acceptor("t w", token_type=self.__syms.alphabet)), "",
            alphabet.closure()).optimize()
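
A hedged byte-level sketch of the same cdrewrite pattern, with "+" standing in for the "<DEL-S>" marker:

import pynini

# sigma*: closure over printable ASCII
sigma_star = pynini.union(*(pynini.escape(chr(cp)) for cp in range(32, 127))).closure().optimize()
# realize the marker as "e" after "d" or "t"
rule = pynini.cdrewrite(pynini.cross("+", "e"), pynini.union("d", "t"), "", sigma_star)
out = pynini.shortestpath(pynini.accep("wart+") @ rule)
assert out.project("output").rmepsilon().string() == "warte"
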
Example #6
0
    def __construct_r13(self):
        '''
        e-epenthesis 1
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        return pynini.union(
            alphabet,
            pynini.transducer(
                pynini.string_map(
                    [
                        "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                        "<^Ax>", "<WB>"
                    ],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                "")).closure().optimize()
Example #7
0
    def __construct_r1(self):
        '''
        Umlaut

        Apfel$ ==> Äpfel
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
                    "<NoDef>", "<UL>", "<FB>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # r1a
        tau = pynini.push(pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"),
             ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
                          push_labels=True)
        lc = pynini.union(
            self.__syms.consonants,
            pynini.string_map(
                ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()
        r1a = pynini.cdrewrite(
            tau, lc,
            pynini.concat(
                alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure())

        # r1c
        tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
        r1c = pynini.cdrewrite(
            tau,
            pynini.string_map(
                ["ä", "Ä"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            pynini.concat(
                self.__syms.consonants_lower, alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure()).optimize()

        # r1d
        r1d = pynini.cdrewrite(
            pynini.transducer("<UL>",
                              "<FB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet), "", "",
            alphabet.closure())

        return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
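
Rules like the one returned here are usually applied to a single analysis string and decoded from the shortest path. A hypothetical helper, sketched for byte strings (a rule built over a custom SymbolTable needs the matching token_type when compiling the input and reading the output):

import pynini

def apply_rule(rule: pynini.Fst, analysis: str) -> str:
    # compose the analysis with the rule and read off the best output string
    lattice = pynini.compose(pynini.accep(analysis), rule)
    return pynini.shortestpath(lattice).project("output").rmepsilon().string()
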
Example #8
0
def get_names():
    """
    Returns a graph that matches common male and female names.
    """
    male_labels = load_labels(get_abs_path("data/roman/male.tsv"))
    female_labels = load_labels(get_abs_path("data/roman/female.tsv"))
    male_labels.extend([[x[0].upper()] for x in male_labels])
    female_labels.extend([[x[0].upper()] for x in female_labels])
    names = pynini.string_map(male_labels).optimize()
    names |= pynini.string_map(female_labels).optimize()
    return names
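
A hedged sketch of the same pattern with inline labels instead of the TSV files (load_labels returns rows of this single-column shape):

import pynini

male_labels = [["john"], ["james"]]
female_labels = [["mary"]]
male_labels.extend([[x[0].upper()] for x in male_labels])      # add all-caps variants
female_labels.extend([[x[0].upper()] for x in female_labels])
names = pynini.string_map(male_labels).optimize()
names |= pynini.string_map(female_labels).optimize()
assert pynini.shortestpath(pynini.accep("JOHN") @ names).string() == "JOHN"
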
Example #9
0
  def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", corresp., if the umlaut marker "<UL>" is present.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
          self.__syms.stem_types,
          self.__syms.categories,
          ).closure()

      return pynini.concat(
          pynini.concat(
            alphabet,
            pynini.concat(
              self.__syms.consonants,
              pynini.concat(
                pynini.union(
                  pynini.union(
                    pynini.cross("a", "ä"),
                    pynini.cross("o", "ö"),
                    pynini.cross("u", "ü")
                    ),
                  pynini.concat(
                    pynini.cross("a", "ä"),
                    pynini.union(
                      pynini.cross("a", ""),
                      pynini.accep("u")
                      )
                    )
                  ),
                pynini.concat(
                  self.__syms.consonants.closure(),
                  pynini.concat(
                    pynini.concat(
                      pynini.accep("e"),
                      pynini.string_map(["l", "r"]).project("input")
                      ).closure(0, 1),
                    pynini.concat(
                      pynini.accep("<Suff_Stems>"),
                      pynini.cross("<UL>", "")
                      )
                    )
                  )
                )
              ).closure(0, 1)
            ),
          self.__tail
          ).optimize()
Example #10
0
 def __init__(self, chat_lexicon_path: str, lm_path: str) -> None:
     self._lm = pynini.Fst.read(lm_path)
     assert self._lm.output_symbols(), "No LM output symbol table found"
     self._lm_syms = self._lm.output_symbols()
     lexicon = [w for (l, w) in self._lm_syms if l > 0]
     lexicon_fsa = pynini.string_map(lexicon).optimize()
     self._deduplicator = chatspeak.Deduplicator(lexicon_fsa)
     self._deabbreviator = chatspeak.Deabbreviator(lexicon_fsa)
     self._regexps = chatspeak.Regexps()
     self._lexicon = chatspeak.Lexicon(chat_lexicon_path)
     lm_mapper = pynini.string_map(lexicon,
                                   input_token_type="byte",
                                   output_token_type=self._lm_syms)
     self._bytes_to_lm_mapper = pynutil.join(lm_mapper, " ").optimize()
     self._lm_to_bytes_mapper = pynini.invert(self._bytes_to_lm_mapper)
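
A hedged, self-contained sketch of pynutil.join as used for _bytes_to_lm_mapper, with a toy lexicon standing in for the LM symbols:

import pynini
from pynini.lib import pynutil

word = pynini.string_map(["hello", "world"]).optimize()
# accept one or more lexicon words joined by single spaces
words = pynutil.join(word, " ").optimize()
assert pynini.shortestpath(pynini.accep("hello world") @ words).string() == "hello world"
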
Example #11
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            c325-b -> tokens { serial { value: "c three two five b" } }
        """
        alpha = NEMO_ALPHA

        if self.deterministic:
            num_graph = self.single_digits_graph
        else:
            num_graph = self.graph
            letter_pronunciation = pynini.string_map(
                load_labels(get_abs_path("data/letter_pronunciation.tsv")))
            alpha |= letter_pronunciation

        delimiter = insert_space | pynini.cross("-", " ") | pynini.cross(
            "/", " ")
        letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
        next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))
        serial_graph = (letter_num | num_letter) + next_alpha_or_num

        if not self.deterministic:
            serial_graph += pynini.closure(
                pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
        return serial_graph
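
A hedged byte-level sketch of the delimiter trick above; insert_space is assumed to be pynutil.insert(" "), and the tiny digit map stands in for num_graph:

import pynini
from pynini.lib import pynutil

insert_space = pynutil.insert(" ")
delimiter = insert_space | pynini.cross("-", " ")
letter = pynini.union(*"abcdefghijklmnopqrstuvwxyz")
digit = pynini.string_map([("3", "three"), ("5", "five")])
serial = pynini.closure(letter + delimiter, 1) + digit
out = pynini.shortestpath(pynini.accep("c3") @ serial)
assert out.project("output").rmepsilon().string() == "c three"
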
Example #12
0
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    one_thousand_map = []
    for k in one_alternatives:
        default, alternative = k
        one_thousand_map.append((alternative.split()[1], alternative))
    one_thousand_map = pynini.string_map(one_thousand_map)

    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    alternative_formats = {}
    alternative_formats[
        'one_thousand_alternative'] = one_thousand_alternative.optimize()
    alternative_formats['separators'] = separators.optimize()
    return alternative_formats
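
A hedged sketch of how pynutil.add_weight steers shortestpath among alternatives, mirroring the weights placed on the separators above:

import pynini
from pynini.lib import pynutil

keep = pynutil.add_weight(pynini.cross("1000", "1000"), 0.1)
space = pynutil.add_weight(pynini.cross("1000", "1 000"), -0.1)
best = pynini.shortestpath(pynini.accep("1000") @ (keep | space))
# the negatively weighted alternative wins under the tropical semiring
assert best.project("output").rmepsilon().string() == "1 000"
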
Example #13
0
 def testVerifyAsciiDefinition(self):
     ascii_char = pynini.string_map(
          # UTF-8 ASCII uses all single-byte characters with the most
          # significant bit set to 0, barring NUL, which we ignore.
         pynini.escape(chr(codepoint))
         for codepoint in range(1, 128)).optimize()
     self.assertFsasEquivalent(ascii_char, utf8.SINGLE_BYTE)
Example #14
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     whitelist = load_labels(get_abs_path(file))
     whitelist_lower = [(x.lower(), y.lower()) for x, y in whitelist]
     whitelist_cased = [(x, y) for x, y in whitelist]
     graph = pynini.string_map(whitelist_lower + whitelist_cased)
     return graph
Example #15
0
    def __construct_del_ge(self):
        '''
        Case-dependent deletion of the ge marker
        '''

        # delete <ge> after certain prefixes like 'ver'
        return pynini.concat(
            pynini.transducer("<no-ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.string_map(["<n>", "<e>", "<d>",
                                           "<~n>"]).project()).closure(),
                    pynini.concat(
                        pynini.transducer(
                            "<V> <nativ>",
                            "",
                            input_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<NoDef>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.transducer(
                            "<ge>", "", input_token_type=self.__syms.alphabet),
                        self.__prefix_filter_helper,
                        self.__syms.stem_type_features,
                        pynini.acceptor(
                            "<nativ>",
                            token_type=self.__syms.alphabet))))).optimize()
Example #16
0
    def __construct_category_filter(self):
        '''
        Filter out non-matching category sequences
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
            self.__syms.categories,
        ).closure()

        filtering = self.__suff_stems_filter([
            "<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>",
            "<PRO>", "<V>", "<ORD>"
        ])

        return pynini.concat(
            pynini.concat(alphabet, filtering).closure(),
            self.__tail).optimize()
Example #17
0
    def __construct_suff_phon(self):
        '''
        Suffix phonology: delete the initial "i" of a suffix after a stem ending in "i" or in a consonant + "y"
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
        ).closure()

        Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
        Lambda = pynini.concat(
            pynini.union(
                pynini.acceptor("i", token_type=self.__syms.alphabet),
                pynini.concat(
                    self.__syms.consonants.project(),
                    pynini.acceptor("y", token_type=self.__syms.alphabet))),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

        return pynini.concat(
            pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
            self.__tail).optimize()
Example #18
0
  def __construct_suff_phon(self):
    '''
    Suffix phonology: delete the initial "i" of a suffix after a stem ending in "i" or in a consonant + "y"
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]).project("input"),
          self.__syms.stem_types,
          ).closure()

      Tau = pynini.cross("i", "")
      Lambda = pynini.concat(
          pynini.union(
            pynini.accep("i"),
            pynini.concat(
              self.__syms.consonants.project("input"),
              pynini.accep("y")
              )
            ),
          pynini.accep("<Suff_Stems>")
          )

      return pynini.concat(
          pynini.cdrewrite(
            Tau,
            Lambda,
            "",
            alphabet.project("input")
            ),
          self.__tail
          ).optimize()
Example #19
0
  def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"]).project("input")
          ).optimize()

      c2 = pynini.union(
          alphabet,
          self.__syms.stem_types
          ).closure().optimize()
      
      # From deko.fst:
      # insert "zu" after verbal prefixes if followed by infinitive marker
      return pynini.union(
          c2,
          #pynini.concat(
          #  pynini.accep("<Base_Stems>"),
          #  alphabet.closure(),
          #  pynini.cross("<^zz>", ""),
          #  alphabet.closure()
          #  ),
          c2
          + pynini.accep("<Pref_Stems>")
          + alphabet.closure()
          + pynini.accep("<Base_Stems>")
          + pynini.cross("", "z u")
          + alphabet.closure()
          + pynini.cross("<^zz>", "")
          + alphabet.closure()
          ).optimize()
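
A hedged byte-level sketch of the insertion pattern, with "|" standing in for the prefix/base-stem boundary markers:

import pynini

letters = pynini.union(*"abcdefghijklmnopqrstuvwxyz").closure().optimize()
# after the boundary "|", splice in "zu" and drop the "<^zz>" infinitive marker
rule = (letters + pynini.accep("|") + pynini.cross("", "zu")
        + letters + pynini.cross("<^zz>", "") + letters)
out = pynini.shortestpath(pynini.accep("an|kommen<^zz>") @ rule)
assert out.project("output").rmepsilon().string() == "an|zukommen"
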
Example #20
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)
        graph_digit = digit | zero

        if not deterministic:
            graph = pynini.union(graph_digit, cardinal.hundreds, cardinal.tens)
            graph += pynini.closure(insert_space + graph)

        else:
            # General pattern seems to be 1-3 digits: map as cardinal, default to digits otherwise
            graph = pynini.union(
                graph_digit,
                cardinal.tens,
                cardinal.hundreds,
                graph_digit + pynini.closure(insert_space + graph_digit, 3),
                zero + pynini.closure(insert_space + zero) +
                pynini.closure(insert_space +
                               graph_digit),  # For cases such as "1,010"
            )

        # Need to strip apocope everywhere BUT end of string
        reverse_apocope = pynini.string_map([("un", "uno"), ("ún", "uno")])
        apply_reverse_apocope = pynini.cdrewrite(reverse_apocope, "",
                                                 NEMO_SPACE, NEMO_SIGMA)
        graph @= apply_reverse_apocope

        # Technically, decimals should be space-delimited groups of three, e.g. (1,333 333). This removes any such spaces
        strip_formatting = pynini.cdrewrite(delete_space, "", "", NEMO_SIGMA)
        graph = strip_formatting @ graph

        self.graph = graph.optimize()

        graph_separator = pynutil.delete(decimal_separator)
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        self.graph_fractional = pynutil.insert(
            "fractional_part: \"") + self.graph + pynutil.insert("\"")

        # Integer graph maintains apocope except for ones place
        graph_integer = (strip_cardinal_apocope(
            cardinal.graph) if deterministic else pynini.union(
                cardinal.graph, strip_cardinal_apocope(cardinal.graph))
                         )  # Gives us forms w/ and w/o apocope
        self.graph_integer = pynutil.insert(
            "integer_part: \"") + graph_integer + pynutil.insert("\"")
        final_graph_wo_sign = self.graph_integer + graph_separator + insert_space + self.graph_fractional

        self.final_graph_wo_negative = (final_graph_wo_sign | get_quantity(
            final_graph_wo_sign, cardinal.graph).optimize())
        final_graph = optional_graph_negative + self.final_graph_wo_negative

        final_graph += pynutil.insert(" preserve_order: true")
        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
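
A hedged sketch of the classify convention used here: pynutil.insert wraps a converted value in the token field syntax:

import pynini
from pynini.lib import pynutil

value = pynini.string_map([("5", "five"), ("0", "zero")])
fractional = pynutil.insert("fractional_part: \"") + value + pynutil.insert("\"")
out = pynini.shortestpath(pynini.accep("5") @ fractional)
assert out.project("output").rmepsilon().string() == 'fractional_part: "five"'
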
Example #21
0
 def _get_whitelist_graph(input_case, file):
     whitelist = load_labels(file)
     if input_case == "lower_cased":
         whitelist = [(x.lower(), y) for x, y in whitelist]
     else:
         whitelist = [(x, y) for x, y in whitelist]
     graph = pynini.string_map(whitelist)
     return graph
Example #22
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     whitelist = load_labels(get_abs_path(file))
     if input_case == "lower_cased":
         whitelist = [(x.lower(), y) for x, y in whitelist]
     else:
         whitelist = [(x, y) for x, y in whitelist]
     graph = pynini.string_map(whitelist)
     return graph
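
A hedged sketch with inline labels in place of the whitelist TSV; load_labels is assumed to yield (written, spoken) pairs of this shape:

import pynini

labels = [("Dr.", "doctor"), ("St.", "street")]
lower_cased = [(x.lower(), y) for x, y in labels]
graph = pynini.string_map(lower_cased).optimize()
out = pynini.shortestpath(pynini.accep("dr.") @ graph)
assert out.project("output").rmepsilon().string() == "doctor"
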
Example #23
0
 def __split_disjunctive_feats(self, disjunctive_feat_list):
   single_splits = []
   for disjunctive_feat in disjunctive_feat_list:
     # split e.g. "<NN,NE>" into ["<NN>", "<NE>"]
     splitted = ["<" + cat + ">" for cat in disjunctive_feat[1:-1].split(",")]
     # map the disjunctive feature onto the union of its member categories
     single_splits.append(
       pynini.transducer(
         disjunctive_feat,
         pynini.string_map(splitted, input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet),
         input_token_type=self.__syms.alphabet,
         output_token_type=self.__syms.alphabet))
   return pynini.union(*single_splits).optimize()
Example #24
0
 def testVerifyUtf8CharRegionalIndicatorSymbolDefinition(self):
     regional_indicator = pynini.string_map(
         # Regional indicator symbols have codepoints in the range 0x1F1E6
         # through 0x1F1FF.
         pynini.escape(chr(codepoint))
         for codepoint in range(0x1F1E6, 0x1F1FF + 1)).optimize()
     self.assertFsasEquivalent(
         regional_indicator, utf8.VALID_UTF8_CHAR_REGIONAL_INDICATOR_SYMBOL)
Example #25
0
 def setUpClass(cls):
     super().setUpClass()
     fold = pynini.string_map((("A", "a"), ("B", "b"))).optimize()
     cls.far_path = tempfile.mkstemp(suffix=".far")[1]
     with pynini.Far(cls.far_path, "w") as far:
         far["DOWNCASE"] = fold
         far["UPCASE"] = fold.invert()
     cls.cascade = rule_cascade.RuleCascade(cls.far_path)
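
A hedged round-trip sketch: a FAR written this way can be reopened and its rules fetched by key, as Example #12 does with its FAR file:

import tempfile
import pynini

fold = pynini.string_map([("A", "a"), ("B", "b")]).optimize()
far_path = tempfile.mkstemp(suffix=".far")[1]
with pynini.Far(far_path, "w") as far:
    far["DOWNCASE"] = fold
far = pynini.Far(far_path)  # default mode is read
downcase = far["DOWNCASE"]
assert pynini.shortestpath(pynini.accep("A") @ downcase).project("output").rmepsilon().string() == "a"
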
Example #26
0
 def _get_whitelist_graph(input_case, file):
     whitelist = load_labels(file)
     if input_case == "lower_cased":
         whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
     else:
         whitelist = [[x[0]] + x[1:] for x in whitelist]
     graph = pynini.string_map(whitelist)
     return graph
Example #27
0
    def __construct_r19(self):
        '''
        Eliminate markers
        '''
        with pynini.default_token_type(self.__syms.alphabet):

            alphabet = pynini.union(
                self.__syms.characters,
                pynini.string_map(["<CB>", "<^UC>", "<NoHy>",
                                   "<NoDef>"]).project("input"))

            return pynini.union(
                alphabet,
                pynini.cross(
                    pynini.string_map([
                        "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                        "<^Ax>", "<WB>"
                    ]).project("input"), "")).closure().optimize()
Example #28
0
def transducerOfRule(mapping, leftContext, rightContext, alphabet):
    valid = sandwich.union(*alphabet).closure()
    language = sandwich.union(*(['.'] + alphabet)).closure()

    return sandwich.cdrewrite(sandwich.string_map(mapping),
                              leftContext,
                              rightContext,
                              language,
                              direction="sim") * valid
Example #29
0
        def generate_formula(in_to_out: List[Tuple[str]],
                             envs: Tuple[Set[str]]):
            env_formulas = list()
            #   for env in envs:
            #       env_formula = pynini.accep("0").star
            #       if env != [""]:
            #           for j in range(0, len(env)):
            #               env_formula = env_formula + pynini.accep("0").star + pynini.union(*env[j])
            #       env_formulas.append(env_formula + pynini.accep("0").star)
            # for env in envs:
            #    env_formula = pynini,env[0]
            for i in range(len(envs)):
                if envs[i] != [""] and (len(envs[i]) > 1
                                        or in_to_out[0][0] != '' or i > 0):
                    env_formula = pynini.union(*envs[i][0])
                    for j in range(
                            1,
                            len(envs[i]) -
                            int(in_to_out[0][0] == '' and i == 0)):
                        env_formula = (env_formula +
                                       pynini.union(*envs[i][j])).optimize()
                else:
                    env_formula = pynini.accep("")
                env_formulas.append(pynini.rmepsilon(env_formula.optimize()))

            if in_to_out[0][0] == '':
                str_map: pynini.Fst = pynini.string_map(
                    (e_v, e_v + in_to_out[0][1])
                    for e_v in envs[0][len(envs[0]) - 1])
                return pynini.cdrewrite(str_map.ques,
                                        env_formulas[0],
                                        env_formulas[1],
                                        self.sigma_star,
                                        direction="ltr").optimize()

            else:
                str_map: pynini.Fst = pynini.string_map(
                    in_to_out).ques.rmepsilon().optimize()

            return pynini.cdrewrite(str_map.ques,
                                    env_formulas[0],
                                    env_formulas[1],
                                    self.sigma_star,
                                    direction="ltr").optimize()
Example #30
0
 def testVerifyUtf8Rfc3629Definition(self):
     utf8_rfc3629_char = pynini.string_map(
         # UTF-8 encoded strings can store codepoints in U+0000 through
          # U+10FFFF, excluding the surrogate halves in U+D800 through
         # U+DFFF, but we exclude U+0000 as it would be strange to match NUL
         # and that label is reserved for epsilon.
         pynini.escape(chr(codepoint))
         for codepoint in range(1, 0x10FFFF + 1)
         if not 0xD800 <= codepoint <= 0xDFFF).optimize()
     self.assertFsasEquivalent(utf8_rfc3629_char, utf8.VALID_UTF8_CHAR)