Exemple #1
0
 def __construct_compound_stems_nn(self, tmp):
     '''
     Default noun compounding stems.

     Restricts ``tmp`` to character sequences that carry an inserted
     "<+NN> gender <Nom> <Sg>/<Pl>" analysis, wraps the result in an
     inserted "<Kompos_Stems>" marker, an accepted "<NN>" category and an
     inserted "<kompos> <nativ>" feature sequence.
     '''
     return pynini.concat(
         # insert the compounding-stem marker
         pynini.transducer("",
                           "<Kompos_Stems>",
                           output_token_type=self.__syms.alphabet),
         pynini.compose(
             pynini.concat(
                 self.__syms.characters.closure(1),
                 pynini.union(
                     # inserted nominative-singular noun analysis
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Sg>",
                                 token_type=self.__syms.alphabet))),
                     # inserted nominative-plural noun analysis
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Pl>",
                                 token_type=self.__syms.alphabet))))), tmp),
         pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
         pynini.transducer(
             "", "<kompos> <nativ>",
             output_token_type=self.__syms.alphabet)).optimize()
Exemple #2
0
    def __construct_r1(self):
        '''
        Umlaut: realize the "<UL>" umlaut marker.

        Apfel$ ==> Äpfel
        '''

        # Symbols that may occur between the umlauted vowel and the
        # "<UL>" marker.  (Fix: the list previously repeated "<UL>" and
        # "<FB>"; string_map entries form a set, so the duplicates were
        # redundant and have been removed.)
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
                    "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # r1a: map a/o/u (also capitalized) to their umlauted counterparts
        # after a consonant or boundary symbol, when an "<UL>" marker
        # follows later in the string.
        tau = pynini.push(pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"),
             ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
                          push_labels=True)
        lc = pynini.union(
            self.__syms.consonants,
            pynini.string_map(
                ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()
        r1a = pynini.cdrewrite(
            tau, lc,
            pynini.concat(
                alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure())

        # r1c: delete an "a" immediately following "ä"/"Ä" when a lowercase
        # consonant and a later "<UL>" marker follow (collapses "äa").
        tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
        r1c = pynini.cdrewrite(
            tau,
            pynini.string_map(
                ["ä", "Ä"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            pynini.concat(
                self.__syms.consonants_lower, alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure()).optimize()

        # r1d: unconditionally rewrite the consumed "<UL>" marker to "<FB>"
        r1d = pynini.cdrewrite(
            pynini.transducer("<UL>",
                              "<FB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet), "", "",
            alphabet.closure())

        return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
Exemple #3
0
 def __construct_compound_stems_nn(self, tmp):
     '''
     Default noun compounding stems.
     '''
     with pynini.default_token_type(self.__syms.alphabet):
         # Inserted nominative analyses: "<+NN> gender <Nom> <Sg>" or
         # "<+NN> gender <Nom> <Pl>".
         sg_analysis = pynini.cross(
             "",
             pynini.accep("<+NN>") + self.__syms.gender +
             pynini.accep("<Nom> <Sg>"))
         pl_analysis = pynini.cross(
             "",
             pynini.accep("<+NN>") + self.__syms.gender +
             pynini.accep("<Nom> <Pl>"))
         analysed = (self.__syms.characters.closure(1) +
                     pynini.union(sg_analysis, pl_analysis))
         # keep only the stems admitted by tmp
         kompos_stems = pynini.compose(analysed, tmp)
         # wrap with the compounding-stem marker, the noun category and
         # the inserted compounding/origin features
         return (pynini.cross("", "<Kompos_Stems>") + kompos_stems +
                 pynini.accep("<NN>") +
                 pynini.cross("", "<kompos> <nativ>")).optimize()
Exemple #4
0
    def __construct_category_filter(self):
        '''
        Filter-out non-matching category sequences.
        '''

        # symbols allowed between two applications of the category filter
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
            self.__syms.categories,
        ).closure()

        # category features admitted around a "<Suff_Stems>" marker
        filtering = self.__suff_stems_filter([
            "<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>",
            "<PRO>", "<V>", "<ORD>"
        ])

        return pynini.concat(
            pynini.concat(alphabet, filtering).closure(),
            self.__tail).optimize()
Exemple #5
0
def generate_fst_digit():
    """Build an FST for integer strings of one to nine digits, optionally
    followed by a decimal separator ("." or ",") and further digits."""
    fst_single_digit = generate_fst_for_factor_digit(0, True)
    # one FST per decimal place (factor 0 = ones, 1 = tens, ...)
    place_fsts = {factor: generate_fst_for_factor_digit(factor)
                  for factor in range(0, 10)}

    fst = pn.a("")
    for width in range(1, 10):
        # build the acceptor for numbers that are exactly `width` digits wide
        number = pn.a("")
        for place in range(width, 0, -1):
            part = fst_single_digit if width == 1 else place_fsts[place - 1]
            number = pn.concat(number, part)
        fst = pn.union(fst, number)

    # optional decimal part after the integer
    comma_numbers = pn.u(".", ",") + pn.u(*"0123456789").star
    return (fst + pn.u("", comma_numbers)).optimize()
Exemple #6
0
  def __construct_suff_phon(self):
    '''
    Delete a suffix-initial "i" when the preceding stem ends in "i" or in
    a consonant followed by "y" (the "<Suff_Stems>" marker is part of the
    left context).
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # symbols over which the rewrite may range
      sigma = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]).project("input"),
          self.__syms.stem_types,
          ).closure()

      # rewrite target: drop the "i"
      drop_i = pynini.cross("i", "")
      # left context: stem-final "i" or consonant + "y", then the
      # suffix-stem marker
      left_context = pynini.union(
          pynini.accep("i"),
          self.__syms.consonants.project("input") + pynini.accep("y")
          ) + pynini.accep("<Suff_Stems>")

      rule = pynini.cdrewrite(drop_i, left_context, "",
                              sigma.project("input"))
      return (rule + self.__tail).optimize()
Exemple #7
0
def load_lexicon(source, symbol_table):
    '''
    Load lexicon entries from ``source``, interpreting each line with the
    given symbol table, and return their union as a single FST.
    '''
    # Longest match: multi-character symbols like "<XY>" beat single
    # characters; an optional ":out" part makes the token a transducer pair.
    token_re = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)

    result = pynini.Fst()
    result.set_input_symbols(symbol_table)
    result.set_output_symbols(symbol_table)

    for raw_line in source:
        stripped = raw_line.strip()
        if not stripped:
            continue
        # start from the empty-string acceptor (one final start state)
        entry = pynini.Fst()
        entry.set_input_symbols(symbol_table)
        entry.set_output_symbols(symbol_table)
        state = entry.add_state()
        entry.set_start(state)
        entry.set_final(state)
        for upper, lower in token_re.findall(stripped):
            if lower:
                piece = pynini.transducer(upper,
                                          lower,
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table)
            else:
                piece = pynini.acceptor(upper, token_type=symbol_table)
            entry = pynini.concat(entry, piece)
        result = pynini.union(result, entry)
    return result
Exemple #8
0
    def __construct_suff_phon(self):
        '''
        Delete a suffix-initial "i" when the preceding stem ends in "i" or
        in a consonant followed by "y" (the "<Suff_Stems>" marker is part
        of the left context).
        '''

        # symbols over which the rewrite may range
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
        ).closure()

        # Tau deletes the "i"; Lambda is the left context of the rewrite
        Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
        Lambda = pynini.concat(
            pynini.union(
                pynini.acceptor("i", token_type=self.__syms.alphabet),
                pynini.concat(
                    self.__syms.consonants.project(),
                    pynini.acceptor("y", token_type=self.__syms.alphabet))),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

        return pynini.concat(
            pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
            self.__tail).optimize()
Exemple #9
0
    def __construct_del_ge(self):
        '''
        Case-dependent deletion of the ge marker.

        Deletes "<no-ge>", "<V> <nativ>" and "<ge>" around prefix stems
        while accepting the remaining markers.
        '''

        # delete <ge> at certain prefixes like 'ver'
        return pynini.concat(
            pynini.transducer("<no-ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        # Fix: this string_map previously used the default
                        # byte token type while every other literal in the
                        # method is interpreted over self.__syms.alphabet
                        # (cf. the default_token_type variant of this
                        # function); the token types are now explicit.
                        pynini.string_map(
                            ["<n>", "<e>", "<d>", "<~n>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project()
                    ).closure(),
                    pynini.concat(
                        pynini.transducer(
                            "<V> <nativ>",
                            "",
                            input_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<NoDef>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.transducer(
                            "<ge>", "", input_token_type=self.__syms.alphabet),
                        self.__prefix_filter_helper,
                        self.__syms.stem_type_features,
                        pynini.acceptor(
                            "<nativ>",
                            token_type=self.__syms.alphabet))))).optimize()
Exemple #10
0
  def __construct_compound_filter(self):
    '''
    Construct the compound filter.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # characters and stem types pass through; category features,
      # origin features and "<NoPref>" are deleted
      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>"]).project("input"),
          self.__syms.stem_types,
          pynini.cross(self.__syms.categories, ""),
          pynini.cross(self.__syms.origin_features, ""),
          pynini.cross("<NoPref>", "")
          )

      return pynini.concat(
          # optional initial marker: "<Initial>" is deleted,
          # "<NoHy>"/"<NoDef>" are kept
          pynini.union(
            pynini.cross("<Initial>", ""),
            pynini.accep("<NoHy>"),
            pynini.accep("<NoDef>")
            ).closure(0,1),
          pynini.concat(
            pynini.union(
              # branch 1: delete a final category from the given list
              pynini.concat(
                alphabet.closure(),
                pynini.cross(pynini.string_map(["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>", "<V>", "<ORD>", "<OTHER>"]).project("input"), "")
                ),
              # branch 2: insert "<VADJ>", delete the two "<kompos>"
              # markers and the final "<V>"
              pynini.concat(
                pynini.cross("", "<VADJ>"),
                pynini.concat(
                  pynini.union(
                    alphabet,
                    pynini.cross("<kompos>", "")
                    ).closure(),
                  pynini.concat(
                    pynini.cross("<kompos>", ""),
                    pynini.concat(
                      alphabet.closure(),
                      pynini.cross("<V>", "")
                      )
                    )
                  )
                ),
              # branch 3: delete "<kompos>" markers and a final
              # "<ADJ>"/"<NN>"
              pynini.concat(
                pynini.union(
                  alphabet,
                  pynini.cross("<kompos>", "")
                  ).closure(),
                pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
                )
              ),
            # then delete "<base>" and the origin features, keeping the
            # inflection class
            pynini.concat(
              pynini.cross("<base>", ""),
              pynini.concat(
                pynini.cross(self.__syms.origin_features, ""),
                self.__syms.inflection_classes
                )
              )
            )
          ).optimize()
Exemple #11
0
    def __construct_tail(self):
        '''
        Define possible final sequences of a derivation.
        '''

        # C1: symbols that may precede the base stem type marker
        initial_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>",
                    "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>", "<Pref_Stems>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).closure()
        # C2: symbols that may follow the base stem type marker
        intermediate_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>",
                    "<FB>", "<ge>", "<Suff_Stems>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).closure()

        # C3: symbols admitted in the final feature sequence
        final_stuff = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>",
                    "<FB>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.categories, self.__syms.stem_type_features,
            self.__syms.origin_features,
            pynini.string_map(
                [
                    "<NSNeut_es_e>", "<NSFem_0_n>", "<NSFem_0_en>",
                    "<NSMasc_es_e>", "<NSMasc_es_$e>", "<NSMasc-s/$sse>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).closure()

        # TAIL: optional (C1 base-stem-type C2), then C3 and an optional
        # inflection class
        return pynini.concat(
            pynini.concat(initial_stuff, self.__syms.base_stem_types,
                          intermediate_stuff).closure(0, 1), final_stuff,
            self.__syms.inflection_classes.closure(0, 1)).optimize()
Exemple #12
0
    def __construct_r20(self):
        '''
        Up to low.

        Deletes one or more "<CB>" markers; the following symbol is either
        kept as-is (for "<^UC>"/"<NoHy>"/"<NoDef>") or mapped through
        to_lower.
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<^UC>", "<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        #
        # SFST uses a rewrite rule here
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<CB>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<^UC>", "<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_lower))).closure(),
                           push_labels=True).optimize()
Exemple #13
0
    def __construct_r21(self):
        '''
        Low to up.

        Deletes one or more "<^UC>" markers; the following symbol is
        either kept as-is (for "<NoHy>"/"<NoDef>") or mapped through
        to_upper.
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # Fix: removed a leftover debugging call that wrote "to_upper.dot"
        # to the working directory on every construction.
        # Construction in SFST involves negation (which is expensive).
        # It looks like we can do better:
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<^UC>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_upper))).closure(),
                           push_labels=True).optimize()
Exemple #14
0
    def __construct_r14(self):
        '''
        e-epenthesis 2.

        Rewrites "<DEL-S>" to "e" after "d"/"t" (optionally followed by
        "m") or after "t w".
        '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # tau maps the deletion marker onto an epenthetic "e"
        tau = pynini.transducer("<DEL-S>",
                                "e",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet)
        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(
                        ["d", "t"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    pynini.acceptor("m",
                                    token_type=self.__syms.alphabet).closure(
                                        0, 1)),
                pynini.acceptor("t w", token_type=self.__syms.alphabet)), "",
            alphabet.closure()).optimize()
Exemple #15
0
  def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", corresp., if the umlaut marker "<UL>" is present.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # Symbols that may precede the umlauted position.  (Fix: the list
      # previously contained "<Ge>" twice; string_map entries form a set,
      # so the duplicate was redundant and has been removed.)
      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
          self.__syms.stem_types,
          self.__syms.categories,
          ).closure()

      # Optionally umlaut a vowel (a/o/u, or the digraphs "aa"/"au")
      # after a consonant, before consonants, an optional "el"/"er"
      # sequence, the suffix-stem marker and a deleted "<UL>" marker.
      return pynini.concat(
          pynini.concat(
            alphabet,
            pynini.concat(
              self.__syms.consonants,
              pynini.concat(
                pynini.union(
                  pynini.union(
                    pynini.cross("a", "ä"),
                    pynini.cross("o", "ö"),
                    pynini.cross("u", "ü")
                    ),
                  pynini.concat(
                    pynini.cross("a", "ä"),
                    pynini.union(
                      pynini.cross("a", ""),
                      pynini.accep("u")
                      )
                    )
                  ),
                pynini.concat(
                  self.__syms.consonants.closure(),
                  pynini.concat(
                    pynini.concat(
                      pynini.accep("e"),
                      pynini.string_map(["l", "r"]).project("input")
                      ).closure(0, 1),
                    pynini.concat(
                      pynini.accep("<Suff_Stems>"),
                      pynini.cross("<UL>", "")
                      )
                    )
                  )
                )
              ).closure(0, 1)
            ),
          self.__tail
          ).optimize()
Exemple #16
0
  def __construct_del_ge(self):
    '''
    Case-dependent deletion of the ge marker.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
      # delete <ge> at certain prefixes like 'ver'
      prefix_body = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>"]).project("input")
          ).closure()
      feature_part = (pynini.cross("<V> <nativ>", "")
                      + pynini.accep("<NoDef>").closure(0, 1)
                      + pynini.cross("<ge>", "")
                      + self.__prefix_filter_helper
                      + self.__syms.stem_type_features
                      + pynini.accep("<nativ>"))
      return (pynini.cross("<no-ge>", "")
              + pynini.accep("<Pref_Stems>")
              + prefix_body
              + feature_part).optimize()
Exemple #17
0
 def __construct_pref_stems(self):
     '''
     Prefix stems: lexicon entries marked with "<Pref_Stems>".
     '''
     marker = pynini.acceptor("<Pref_Stems>",
                              token_type=self.__syms.alphabet)
     template = pynini.concat(
         self.__syms.initial_features.closure(),
         marker,
         self.__sigma_star)
     return pynini.compose(self.__lex, template).optimize()
Exemple #18
0
 def __construct_base_stems(self):
     '''
     Base stems: bdk stems marked with "<Base_Stems>".
     '''
     marker = pynini.acceptor("<Base_Stems>",
                              token_type=self.__syms.alphabet)
     template = pynini.concat(
         self.__syms.initial_features.closure(),
         marker,
         self.__sigma_star)
     return pynini.compose(self.__bdk_stems, template).optimize()
Exemple #19
0
    def __construct_origin_filter(self):
        '''
        Filter-out non-matching origin feature sequences.
        '''

        # symbols allowed between two applications of the origin filter
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>",
                    "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types, self.__syms.categories,
            self.__syms.stem_type_features).closure().optimize()

        # origin features admitted around a "<Suff_Stems>" marker
        filtering = self.__suff_stems_filter([
            "<nativ>", "<prefnativ>", "<frei>", "<gebunden>", "<kurz>",
            "<lang>", "<fremd>", "<klassisch>", "<NSNeut_es_e>", "<NSFem_0_n>",
            "<NSFem_0_en>", "<NSMasc_es_e>", "<NSMasc_es_$e>",
            "<NSMasc-s/$sse>", "<NGeo-$er-NMasc_s_0>", "<NGeo-$er-Adj0-Up>",
            "<NGeo-$isch-Adj+>", "<NGeo-0-Name-Fem_0>", "<NGeo-0-Name-Masc_s>",
            "<NGeo-0-Name-Neut_s>", "<NGeo-a-Name-Fem_s>",
            "<NGeo-a-Name-Neut_s>", "<NGeo-aner-NMasc_s_0>",
            "<NGeo-aner-Adj0-Up>", "<NGeo-anisch-Adj+>", "<NGeo-e-NMasc_n_n>",
            "<NGeo-e-Name-Fem_0>", "<NGeo-e-Name-Neut_s>",
            "<NGeo-ei-Name-Fem_0>", "<NGeo-en-Name-Neut_s>",
            "<NGeo-er-NMasc_s_0>", "<NGeo-er-Adj0-Up>", "<NGeo-0-NMasc_s_0>",
            "<NGeo-0-Adj0-Up>", "<NGeo-erisch-Adj+>", "<NGeo-ese-NMasc_n_n>",
            "<NGeo-esisch-Adj+>", "<NGeo-ianer-NMasc_s_0>",
            "<NGeo-ianisch-Adj+>", "<NGeo-ien-Name-Neut_s>",
            "<NGeo-ier-NMasc_s_0>", "<NGeo-isch-Adj+>",
            "<NGeo-istan-Name-Neut_s>", "<NGeo-land-Name-Neut_s>",
            "<NGeo-ner-NMasc_s_0>", "<NGeo-ner-Adj0-Up>", "<NGeo-nisch-Adj+>"
        ])

        return pynini.concat(
            pynini.concat(alphabet, filtering).closure(),
            self.__tail).optimize()
Exemple #20
0
def join(expr: pynini.FstLike, sep: pynini.FstLike) -> pynini.Fst:
    """Creates the automaton expr (sep expr)^*.

    Args:
      expr: an acceptor or string.
      sep: a separator acceptor or string.

    Returns:
      An FST.
    """
    # one leading expr followed by any number of (sep expr) repetitions
    rest = pynini.concat(sep, expr)
    return expr + rest.closure()
Exemple #21
0
 def __construct_bdk_stems(self):
     '''
     Base, derivation and compound stems (without derivation suffixes).
     '''
     stem_markers = pynini.string_map(
         ["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"],
         input_token_type=self.__syms.alphabet,
         output_token_type=self.__syms.alphabet).project()
     template = pynini.concat(
         self.__syms.initial_features.closure(),
         stem_markers,
         self.__sigma_star)
     return pynini.compose(self.__lex, template).optimize()
Exemple #22
0
  def __construct_stem_type_filter(self):
    '''
    Filter-out non-matching stem type sequences.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # symbols allowed between two applications of the filter
      sigma = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
          self.__syms.stem_types,
          self.__syms.categories,
          ).closure()

      # filter built over the "<deriv>" and "<kompos>" features
      keep = self.__suff_stems_filter(["<deriv>", "<kompos>"])

      return ((sigma + keep).closure() + self.__tail).optimize()
Exemple #23
0
 def __construct_pref_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with prefixed stems.
     '''
     template = pynini.concat(
         self.__syms.initial_features.closure(),
         pynini.acceptor("<Suff_Stems>",
                         token_type=self.__syms.alphabet),
         # the <prefderiv> feature is consumed here
         pynini.transducer("<prefderiv>",
                           "",
                           input_token_type=self.__syms.alphabet),
         self.__sigma_star)
     return pynini.compose(self.__lex, template).optimize()
Exemple #24
0
    def __init__(self, syms, sublexica, deko_filter, inflection, phon):
        '''
        Build the default-stem transducers from the given sublexica,
        filters, inflection and phonology components.
        '''
        #
        # store alphabet
        self.__syms = syms

        #
        # run parts of morphology building (cf. timur_fst)
        tmp = (sublexica.verbal_pref_stems + sublexica.base_stems
               ) * sublexica.nodef_to_null * deko_filter.pref_filter
        tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

        # TODO: Move to symbols!
        alphabet = pynini.union(
            syms.characters, syms.stem_types,
            pynini.string_map([
                "<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>",
                "<UL>", "<NoHy>", "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"
            ],
                              input_token_type=syms.alphabet,
                              output_token_type=syms.alphabet).project()
        ).closure().optimize()

        tmp = (tmp + inflection.inflection) * (
            alphabet + inflection.inflection_filter
        ) * deko_filter.infix_filter * deko_filter.uplow

        # wrap the result in inserted word-boundary markers and apply
        # the phonological rules
        tmp = pynini.compose(
            pynini.concat(
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
                tmp,
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
            ), phon.phon).optimize()

        #
        # default stems

        # create a default composition stem for nouns
        self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

        # create a deriv stem for Ge nominalization (Gelerne)
        self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

        # create an adjective base stem from participles
        # (fix: removed a leftover debugging call that wrote
        # "participle_adj.dot" to the working directory on every run)
        self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
Exemple #25
0
    def __construct_ge_nom_stems_v(self, tmp):
        '''
        Stems for ge nominalization of verbs ("Gejammer").
        '''
        with pynini.default_token_type(self.__syms.alphabet):
            alphabet = pynini.union(
                self.__syms.characters, self.__syms.categories,
                pynini.string_map(["<CONV>", "<SUFF>"]).project("input"))

            # extract infinitives: entries of tmp whose analysis carries
            # an inserted "<+V> <Inf>" after the stem material
            infinitives = pynini.compose(
                pynini.concat(
                    pynini.concat(self.__syms.characters.closure(1),
                                  pynini.accep("<PREF>")).closure(),
                    pynini.concat(alphabet.closure(1),
                                  pynini.cross("", "<+V> <Inf>"))),
                tmp).optimize()

            # delete an initial "g e <PREF> <Ge>" sequence from the stem
            insert_ge = pynini.concat(
                pynini.concat(self.__syms.characters.closure(1),
                              pynini.accep("<PREF>")).closure(),
                pynini.concat(pynini.cross("g e <PREF> <Ge>", ""),
                              alphabet.closure(1))).optimize()

            # project the ge-marked infinitives and pass them through
            # to_lower/category/marker symbols while keeping "<Ge>"
            # (presumably lower-casing the stem — verify against syms)
            inserted_ge = pynini.compose(
                pynini.compose(insert_ge, infinitives).project("input"),
                pynini.union(self.__syms.to_lower, self.__syms.categories,
                             self.__syms.prefix_suffix_marker,
                             pynini.accep("<Ge>")).closure()).optimize()

            # strip category and prefix/suffix markers and delete the
            # final "e n" of the infinitive
            deriv_stem_filter_ge = pynini.compose(
                pynini.compose(
                    pynini.compose(
                        pynini.union(alphabet, pynini.accep("<PREF>"),
                                     pynini.cross("", "<Ge>")).closure(),
                        inserted_ge),
                    pynini.union(
                        self.__syms.characters, pynini.accep("<Ge>"),
                        pynini.cross(
                            pynini.union(self.__syms.categories,
                                         self.__syms.prefix_suffix_marker),
                            "")).closure()),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.accep("<Ge>"),
                    ).closure(1), pynini.cross("e n", ""))).optimize()

            # wrap as a derivation stem of category "<V>" with inserted
            # "<deriv> <nativ>" features
            return (pynini.cross("", "<Deriv_Stems>") + deriv_stem_filter_ge +
                    pynini.accep("<V>") +
                    pynini.cross("", "<deriv> <nativ>")).optimize()
Exemple #26
0
 def __suff_stems_filter(self, features):
     '''
     Return a union over filters for each feature given.

     Each branch deletes the feature, accepts "<Suff_Stems>", then
     deletes the feature again.
     '''
     marker = pynini.acceptor("<Suff_Stems>",
                              token_type=self.__syms.alphabet)
     result = pynini.Fst()
     result.set_input_symbols(self.__syms.alphabet)
     result.set_output_symbols(self.__syms.alphabet)
     for feat in features:
         drop_feat = pynini.transducer(feat,
                                       "",
                                       input_token_type=self.__syms.alphabet)
         branch = pynini.concat(drop_feat, marker, drop_feat)
         result = pynini.union(result, branch)
     return result.optimize()
Exemple #27
0
 def __construct_quant_suff_stems(self):
     '''
     Derivation suffixes which combine with a number and a simplex stem.

     Restricts the lexicon to entries of the shape:
     <QUANT> initial-features* <Suff_Stems> <simplex> ... ,
     deleting the <QUANT> and <simplex> markers on the input side.
     '''
     delete_quant = pynini.transducer(
         "<QUANT>", "", input_token_type=self.__syms.alphabet)
     delete_simplex = pynini.transducer(
         "<simplex>", "", input_token_type=self.__syms.alphabet)
     suff_marker = pynini.acceptor("<Suff_Stems>",
                                   token_type=self.__syms.alphabet)
     pattern = pynini.concat(delete_quant,
                             self.__syms.initial_features.closure(),
                             suff_marker, delete_simplex, self.__sigma_star)
     return pynini.compose(self.__lex, pattern).optimize()
Exemple #28
0
    def __construct_insert_zu(self):
        '''
        Insert "zu" into infinitives with separable prefixes.

        From deko.fst: "zu" is inserted after a verbal prefix when the
        stem is followed by the infinitive marker <^zz> (which is
        deleted in the process).
        '''
        # Stem characters plus every marker that may occur inside a stem.
        sigma = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                    "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                    "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>",
                    "<^Gen>", "<^Del>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        # Arbitrary material built from those symbols and stem-type markers.
        passthrough = pynini.union(sigma,
                                   self.__syms.stem_types).closure().optimize()

        pref_marker = pynini.acceptor("<Pref_Stems>",
                                      token_type=self.__syms.alphabet)
        base_marker = pynini.acceptor("<Base_Stems>",
                                      token_type=self.__syms.alphabet)
        insert_zu = pynini.transducer("",
                                      "z u",
                                      output_token_type=self.__syms.alphabet)
        delete_zz = pynini.transducer("<^zz>",
                                      "",
                                      input_token_type=self.__syms.alphabet)

        # Either leave the input untouched, or perform the insertion in
        # the prefix+base-stem configuration.
        with_insertion = pynini.concat(passthrough, pref_marker,
                                       sigma.closure(), base_marker,
                                       insert_zu, sigma.closure(), delete_zz,
                                       sigma.closure())
        return pynini.union(passthrough, with_insertion).optimize()
Exemple #29
0
    def __construct_r14(self):
        '''
        e-epenthesis 2

        Context-dependent rewrite of <DEL-S> to "e" after "d"/"t"
        (optionally followed by "m") or after "t w".
        '''
        with pynini.default_token_type(self.__syms.alphabet):

            sigma = pynini.union(
                self.__syms.characters,
                pynini.string_map([
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ]).project("input"))

            replace = pynini.cross("<DEL-S>", "e")
            left_context = pynini.union(
                pynini.string_map(["d", "t"]).project("input") +
                pynini.accep("m").closure(0, 1),
                pynini.accep("t w"))
            # Right context is unrestricted (empty).
            return pynini.cdrewrite(replace, left_context, "",
                                    sigma.closure()).optimize()
Exemple #30
0
    def __construct_r20(self):
        '''
        Up to low

        Deletes <CB> markers and lowercases via the to_lower map.
        '''
        with pynini.default_token_type(self.__syms.alphabet):

            markers = pynini.string_map(["<^UC>", "<NoHy>",
                                         "<NoDef>"]).project("input")
            sigma = pynini.union(self.__syms.characters, markers)

            # SFST uses a rewrite rule here; this formulation deletes
            # one or more <CB> and then either keeps a marker or maps
            # the next character to lower case.
            lowering = pynini.concat(
                pynini.cross("<CB>", "").closure(1),
                pynini.union(markers, self.__syms.to_lower))
            return pynini.push(pynini.union(sigma.closure(),
                                            lowering).closure(),
                               push_labels=True).optimize()