Ejemplo n.º 1
0
    def __construct_suff_phon(self):
        '''
        Phonological rule applied at suffixation boundaries.

        Deletes an "i" whenever it is immediately preceded by a left
        context of either "i" itself or a consonant followed by "y",
        with the <Suff_Stems> marker in between, and concatenates the
        result with the stored tail transducer.
        '''

        # Sigma for the context-dependent rewrite: plain characters, a
        # fixed set of control symbols, and the stem-type markers.
        sigma = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
        ).closure()

        # The rewrite itself: map "i" to the empty string.
        tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)

        # Left context triggering the deletion: ("i" | consonant "y")
        # followed by the <Suff_Stems> marker.
        left_context = pynini.concat(
            pynini.union(
                pynini.acceptor("i", token_type=self.__syms.alphabet),
                pynini.concat(
                    self.__syms.consonants.project(),
                    pynini.acceptor("y", token_type=self.__syms.alphabet))),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

        # No right context restriction; apply over sigma*.
        deletion_rule = pynini.cdrewrite(tau, left_context, "",
                                         sigma.project())
        return pynini.concat(deletion_rule, self.__tail).optimize()
Ejemplo n.º 2
0
 def __construct_compound_stems_nn(self, tmp):
     '''
     Default noun compounding stems.

     Builds stems for noun-noun compounding from ``tmp``: prefixes a
     <Kompos_Stems> marker, keeps only analyses tagged as nominative
     singular or plural nouns (<+NN> gender <Nom> <Sg>/<Pl>), and
     appends <NN> plus the <kompos> <nativ> feature sequence.
     '''
     return pynini.concat(
         # opening marker for the compounding-stem segment (output only)
         pynini.transducer("",
                           "<Kompos_Stems>",
                           output_token_type=self.__syms.alphabet),
         pynini.compose(
             # character sequence followed by an inserted NN analysis:
             # either nominative singular or nominative plural
             pynini.concat(
                 self.__syms.characters.closure(1),
                 pynini.union(
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Sg>",
                                 token_type=self.__syms.alphabet))),
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Pl>",
                                 token_type=self.__syms.alphabet))))), tmp),
         # category marker plus compounding features appended on output
         pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
         pynini.transducer(
             "", "<kompos> <nativ>",
             output_token_type=self.__syms.alphabet)).optimize()
Ejemplo n.º 3
0
    def __construct_r1(self):
        '''
        Umlaut rule (r1).

        Apfel$ ==> Äpfel

        Cascade of three rewrites: r1a fronts a stem vowel before a
        later <UL> trigger, r1c deletes a reduplicated "a" after the
        umlauted vowel, and r1d rewrites the <UL> marker itself to <FB>.
        '''

        # Rule alphabet: plain characters plus surface control symbols.
        # NOTE(review): "<UL>" and "<FB>" each appear twice in this list;
        # harmless for string_map, but looks unintentional — confirm.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
                    "<NoDef>", "<UL>", "<FB>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # r1a: front the vowel (a->ä, o->ö, u->ü and capitals) when
        # preceded by a consonant or boundary marker and followed, at any
        # distance, by the <UL> trigger
        tau = pynini.push(pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"),
             ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
                          push_labels=True)
        lc = pynini.union(
            self.__syms.consonants,
            pynini.string_map(
                ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()
        r1a = pynini.cdrewrite(
            tau, lc,
            pynini.concat(
                alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure())

        # r1c: delete an "a" directly after "ä"/"Ä" when a lowercase
        # consonant and a later <UL> follow (presumably handles double
        # vowels like "aa" — TODO confirm against the lexicon)
        tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
        r1c = pynini.cdrewrite(
            tau,
            pynini.string_map(
                ["ä", "Ä"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            pynini.concat(
                self.__syms.consonants_lower, alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure()).optimize()

        # r1d: unconditionally rewrite the trigger <UL> to <FB>
        r1d = pynini.cdrewrite(
            pynini.transducer("<UL>",
                              "<FB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet), "", "",
            alphabet.closure())

        # apply r1a, then r1c, then r1d
        return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
Ejemplo n.º 4
0
    def __construct_r14(self):
        '''
        e-epenthesis 2 (r14).

        Rewrites the <DEL-S> marker to an epenthetic "e" after "d" or
        "t" (optionally followed by "m"), or after the sequence "t w".
        '''

        # rule alphabet: characters plus surface control symbols
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # tau: surface the deletion marker as "e"
        tau = pynini.transducer("<DEL-S>",
                                "e",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet)
        # left context: (d|t) m?  or  "t w"; right context unrestricted
        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(
                        ["d", "t"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    pynini.acceptor("m",
                                    token_type=self.__syms.alphabet).closure(
                                        0, 1)),
                pynini.acceptor("t w", token_type=self.__syms.alphabet)), "",
            alphabet.closure()).optimize()
Ejemplo n.º 5
0
    def __construct_del_ge(self):
        '''
        Case-dependent deletion of the ge marker.

        Matches a <no-ge> prefix stem (like 'ver') followed by a native
        verb base and removes the <no-ge>, <V> <nativ> and <ge> markers
        from the input side, keeping the prefix filter and stem-type
        features intact.
        '''

        # delete <ge> at certain suffixes like 'ver'
        return pynini.concat(
            pynini.transducer("<no-ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        # token types added for consistency with the other
                        # constructions in this file; without them these
                        # multi-character symbols would be tokenized as
                        # raw bytes instead of alphabet symbols
                        pynini.string_map(
                            ["<n>", "<e>", "<d>", "<~n>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(
                            )).closure(),
                    pynini.concat(
                        pynini.transducer(
                            "<V> <nativ>",
                            "",
                            input_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<NoDef>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.transducer(
                            "<ge>", "", input_token_type=self.__syms.alphabet),
                        self.__prefix_filter_helper,
                        self.__syms.stem_type_features,
                        pynini.acceptor(
                            "<nativ>",
                            token_type=self.__syms.alphabet))))).optimize()
Ejemplo n.º 6
0
def example0():
    # Build the same acceptor twice: once over raw bytes, once over UTF-8
    # codepoints, and print both for comparison.
    text = u"Pont l'Evêque"
    byte_fsa = pynini.acceptor(text)
    print(u"Byte string acceptor from %s" % text)
    print(byte_fsa)
    utf8_fsa = pynini.acceptor(u"Pont l'Evêque", token_type="utf8")
    print(u"utf8 string acceptor from %s" % text)
    print(utf8_fsa)
Ejemplo n.º 7
0
 def __construct_verbal_pref_stems(self):
     '''
     Verbal prefix stems.

     Restricts the prefix stems to entries carrying the verbal
     category <V> somewhere after the <Pref_Stems> marker.
     '''
     return pynini.compose(
         self.__pref_stems,
         pynini.concat(
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Pref_Stems>",
                             token_type=self.__syms.alphabet),
             self.__sigma_star,
             pynini.acceptor("<V>", token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Ejemplo n.º 8
0
def load_lexicon(source, symbol_table):
    '''
    Load lexicon entries from ``source``, interpreting each line with the
    given symbol table, and return the union of all entries as one FST.
    '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)
    # longest match, prefer complex over simple symbols
    tokenizer = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
    for raw_line in source:
        entry = raw_line.strip()
        if not entry:
            continue
        # start from a single-state FST accepting the empty string
        acc = pynini.Fst()
        acc.set_input_symbols(symbol_table)
        acc.set_output_symbols(symbol_table)
        initial = acc.add_state()
        acc.set_start(initial)
        acc.set_final(initial)
        # each token is either upper:lower (a transduction) or a bare
        # symbol (an identity acceptor)
        for upper, lower in tokenizer.findall(entry):
            if lower:
                piece = pynini.transducer(upper,
                                          lower,
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table)
            else:
                piece = pynini.acceptor(upper, token_type=symbol_table)
            acc = pynini.concat(acc, piece)
        lex = pynini.union(lex, acc)
    return lex
Ejemplo n.º 9
0
def process_window(input_str,
                   window_fst,
                   model,
                   pruning_weight=5,
                   rejection_weight=1.5):
    '''
    Compose a window input automaton with the model.

    Args:
        input_str: raw window text (tokens separated by spaces).
        window_fst: acceptor for the window; relabeled and replaced by
            the composed lattice.
        model: sequence of FSTs composed with the window one by one.
        pruning_weight: beam width for pruning each intermediate lattice.
        rejection_weight: per-character cost of keeping a single-token
            window unchanged (naive OOV fallback).

    Returns:
        The composed, pruned and optimized window FST.
    '''
    t1 = time.time()
    window_fst.relabel_tables(new_isymbols=model[0].output_symbols(),
                              new_osymbols=model[0].output_symbols())
    for fst in model:
        window_fst = pynini.compose(window_fst, fst)
        # keep only the output projection and prune unlikely paths so the
        # lattice stays small before the next composition
        window_fst.project(project_output=True)
        window_fst.prune(weight=pruning_weight)
        window_fst.optimize()
    t3 = time.time()
    # lazy %-style args: formatted only if DEBUG logging is enabled
    logging.debug('- composition: %ss', t3 - t1)
    # allow also identity for windows of length 1
    # (with weight `rejection_weight`)
    if ' ' not in input_str:
        # The formula:
        #    rejection_weight*(len(input_str)+2)
        # means that rejection_weight*2 is the initial cost of having an OOV
        # word (which then gets more expensive with increasing length).
        # While discovered by accident, this turned out to work well as
        # a very naive OOV word model.
        window_fst.union(
            pynini.acceptor(escape_for_pynini(input_str),
                            weight=rejection_weight * (len(input_str) + 2)))
    t2 = time.time()
    logging.debug('Total processing time: %ss', t2 - t1)
    return window_fst
Ejemplo n.º 10
0
def create_window(tokens):
    '''
    Build a window acceptor for the given input tokens (a list of
    strings), joined by single spaces.
    '''
    return pynini.acceptor(escape_for_pynini(' '.join(tokens)))
Ejemplo n.º 11
0
 def rewrite(self, i: str) -> str:
     '''Apply the rewrite rule to ``i``; return the best output string.'''
     lattice = pynini.compose(
         pynini.acceptor(i, token_type=self.token_type), self.rule)
     if lattice.start() == pynini.NO_STATE_ID:
         logging.error("Composition failure: %s", i)
         return "<composition failure>"
     best = pynini.shortestpath(lattice)
     return best.stringify(token_type=self.token_type)
Ejemplo n.º 12
0
 def rewrite(self, i: str) -> str:
     '''Rewrite ``i`` through the FST; return the shortest-path output.'''
     input_fsa = pynini.acceptor(i, token_type=self.input_token_type)
     lattice = pynini.compose(input_fsa, self.fst)
     if lattice.start() == pynini.NO_STATE_ID:
         logging.error("Composition failure: %s", i)
         return "<composition failure>"
     # keep the output projection and remove epsilons (in-place mutations)
     lattice.project(True)
     lattice.rmepsilon()
     return pynini.shortestpath(lattice).string(self.output_token_type)
Ejemplo n.º 13
0
    def __construct_insert_zu(self):
        '''
        Inserts "zu" into infinitives with separable prefixes.

        Either passes a word through unchanged, or — for words made of a
        prefix stem followed by a base stem — inserts the surface "z u"
        right after the <Base_Stems> marker and deletes the <^zz>
        trigger from the input side.
        '''

        # symbols that may occur around the insertion site
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                    "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                    "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>",
                    "<^Gen>", "<^Del>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        # any sequence over the alphabet and the stem-type markers
        c2 = pynini.union(alphabet,
                          self.__syms.stem_types).closure().optimize()

        # From deko.fst:
        # insert "zu" after verbal prefixes if followed by infinitive marker
        return pynini.union(
            c2,
            # NOTE(review): disabled alternative kept from deko.fst — it
            # would also delete <^zz> in plain base stems:
            #pynini.concat(
            #  pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            #  alphabet.closure(),
            #  pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
            #  alphabet.closure()
            #  ),
            pynini.concat(
                c2,
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.acceptor("<Base_Stems>",
                                token_type=self.__syms.alphabet),
                # insert the surface "zu" right after the base-stem marker
                pynini.transducer("",
                                  "z u",
                                  output_token_type=self.__syms.alphabet),
                alphabet.closure(),
                # consume the insertion trigger
                pynini.transducer("<^zz>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure())).optimize()
    def far_compile_string(self, string, lex_in, unknown_symbol):
        '''
        Compile ``string`` into an acceptor over the symbol table
        ``lex_in``, replacing every space-separated token that is not a
        member of the table by ``unknown_symbol``.
        '''
        # join once instead of quadratic += concatenation; the result is
        # identical to appending "w + ' '" per token and stripping
        mapped = " ".join(w if lex_in.member(w) else unknown_symbol
                          for w in string.split(" "))
        return pynini.acceptor(mapped, token_type=lex_in)
Ejemplo n.º 15
0
def recombine_windows(window_fsts):
    '''
    Recombine processed window FSTs (containing hypotheses for a given
    window) to a lattice, which is also represented as an FST.

    `window_fsts` maps (position, length) pairs to the FST holding the
    hypotheses for that window.
    '''
    def _label(pos, length):
        # placeholder symbol name for the window at `pos` of size `length`
        return 'WIN-{}-{}'.format(pos, length)

    t1 = time.time()
    space_tr = pynini.acceptor(' ')

    # determine the input string length
    # (TODO without iterating!!!)
    num_tokens = max(i for (i, j) in window_fsts) + 1

    # skeleton automaton: one state per token boundary
    root = pynini.Fst()
    for _ in range(num_tokens + 1):
        root.add_state()
    root.set_start(0)
    root.set_final(num_tokens, 0)

    # FIXME refactor the merging of symbol tables into a separate function
    symbol_table = pynini.SymbolTable()
    for window_fst in window_fsts.values():
        symbol_table = pynini.merge_symbol_table(symbol_table,
                                                 window_fst.input_symbols())
        symbol_table = pynini.merge_symbol_table(symbol_table,
                                                 window_fst.output_symbols())
    # register one placeholder symbol per window
    for (pos, length) in window_fsts:
        symbol_table.add_symbol(_label(pos, length))

    root.set_input_symbols(symbol_table)
    root.set_output_symbols(symbol_table)

    replacements = []
    for (pos, length), window_fst in window_fsts.items():
        label = _label(pos, length)
        sym = root.output_symbols().find(label)
        if pos + length < num_tokens:
            # append a space if this is not the last token, so that the final
            # string consists of tokens separated by spaces
            window_fst.concat(space_tr)
        replacements.append((label, window_fst))
        root.add_arc(pos, pynini.Arc(0, sym, 0, pos + length))

    # expand every placeholder arc into its window lattice
    result = pynini.replace(root, replacements)
    result.optimize()

    t2 = time.time()
    logging.debug('Recombining time: %ss', t2 - t1)

    return result
Ejemplo n.º 16
0
 def __construct_base_stems(self):
     '''
     Base stems.

     Restricts the bdk stems to entries carrying the <Base_Stems>
     marker after the initial features.
     '''
     return pynini.compose(
         self.__bdk_stems,
         pynini.concat(
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Base_Stems>",
                             token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Ejemplo n.º 17
0
def lexicon_to_window_fst(lexicon_fst, words_per_window=2):
    '''
    Concatenate the lexicon FST `words_per_window` times with spaces in
    between, producing an FST that accepts up to `words_per_window`
    space-separated words from the lexicon.
    '''
    window = lexicon_fst.copy()
    # a single-word window needs no separators or repetition
    if words_per_window == 1:
        return window
    window.concat(pynini.acceptor(' '))
    window.closure(0, words_per_window - 1)
    window.concat(lexicon_fst)
    return window
Ejemplo n.º 18
0
 def __construct_pref_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with prefixed stems.

     Selects lexicon entries marked <Suff_Stems> <prefderiv> and
     deletes the <prefderiv> marker from the input side.
     '''
     return pynini.compose(
         self.__lex,
         pynini.concat(
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Suff_Stems>",
                             token_type=self.__syms.alphabet),
             pynini.transducer("<prefderiv>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Ejemplo n.º 19
0
 def lookup(self, string):
     '''
     Analyse a string.

     Composes the space-separated characters of ``string`` with the
     full transducer and returns all resulting analysis paths (empty
     list if the transducer does not verify).
     '''
     if not self.__verify():
         return []
     spaced = " ".join(string)
     string_fsa = pynini.acceptor(spaced,
                                  token_type=self.__syms.alphabet)
     lattice = pynini.compose(self.__timur, string_fsa)
     paths = lattice.paths(input_token_type=lattice.input_symbols(),
                           output_token_type=lattice.output_symbols())
     return list(paths.items())
Ejemplo n.º 20
0
 def __suff_stems_filter(self, features):
     '''
     Return a union over filters for each feature given.

     Each filter matches a feature marker, the <Suff_Stems> marker and
     the feature marker again, deleting both feature occurrences on
     the output side.
     '''
     result = pynini.Fst()
     result.set_input_symbols(self.__syms.alphabet)
     result.set_output_symbols(self.__syms.alphabet)
     suff_marker = pynini.acceptor("<Suff_Stems>",
                                   token_type=self.__syms.alphabet)
     for feature in features:
         delete_feature = pynini.transducer(
             feature, "", input_token_type=self.__syms.alphabet)
         result = pynini.union(
             result,
             pynini.concat(delete_feature, suff_marker, delete_feature))
     return result.optimize()
Ejemplo n.º 21
0
 def __construct_quant_suff_stems(self):
     '''
     Derivation suffixes which combine with a number and a simplex stem.

     Selects lexicon entries marked <QUANT> ... <Suff_Stems> <simplex>,
     deleting the <QUANT> and <simplex> markers from the input side.
     '''
     return pynini.compose(
         self.__lex,
         pynini.concat(
             pynini.transducer("<QUANT>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Suff_Stems>",
                             token_type=self.__syms.alphabet),
             pynini.transducer("<simplex>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Ejemplo n.º 22
0
    def __init__(self, syms, lexicon):
        '''
        Build all sublexica from the symbol helper ``syms`` and the
        compiled ``lexicon`` FST.
        '''

        #
        # store alphabet
        self.__syms = syms

        #
        # store lexicon
        self.__lex = lexicon

        #
        # (private) helpers
        # sigma*: free iteration over all known symbol classes
        self.__sigma_star = pynini.union(
            syms.characters,
            syms.categories,
            syms.stem_types,
            syms.stem_type_features,
            syms.origin_features,
            syms.circumfix_features,
            syms.inflection_classes,
            syms.geo_inflection_classes,
            pynini.acceptor("<ge>", token_type=syms.alphabet
                            )  # for word-internal <ge> (ausgewertet)
        ).closure().optimize()

        #
        # NoDef2NULL: passes anything through while deleting <NoDef> markers
        self.__nodef_to_null = pynini.union(
            self.__sigma_star, syms.origin_features,
            pynini.transducer("<NoDef>",
                              "",
                              input_token_type=self.__syms.alphabet),
            syms.stem_types).closure().optimize()

        #
        # sublexica — order matters: e.g. base stems are built from the
        # bdk stems constructed just before them
        self.__bdk_stems = self.__construct_bdk_stems()
        self.__base_stems = self.__construct_base_stems()
        self.__pref_stems = self.__construct_pref_stems()
        self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
        self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
        self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
        self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
        self.__quant_suff_stems = self.__construct_quant_suff_stems()
Ejemplo n.º 23
0
    def __construct_participle_adj(self, tmp, sublexica):
        '''
        Stems for conversion of participles into adjectives.

        Two branches: participles whose surface form ends in "t"
        (inflection class <Adj+e>) and participles ending in "e n" or
        "n d" (class <Adj+>). In both, <V> is rewritten to <+V>,
        <CONV> is deleted, and <ADJ> plus the base features are added.
        '''
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map([
                "<VPART>", "<VPREF>", "<PREF>", "<CONV>", "<SUFF>", "<NN>",
                "<ADJ>", "<V>", "<FT>"
            ],
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet).project(
                              )).closure().optimize()

        return pynini.concat(
            # every result is a base stem
            pynini.transducer("",
                              "<Base_Stems>",
                              output_token_type=self.__syms.alphabet),
            pynini.union(
                # branch 1: past participle, surface form ending in "t"
                pynini.concat(
                    pynini.compose(
                        pynini.concat(
                            alphabet,
                            pynini.transducer(
                                "<V>",
                                "<+V>",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet),
                            pynini.acceptor(
                                "<zu>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                            pynini.acceptor("<PPast>",
                                            token_type=self.__syms.alphabet)),
                        pynini.compose(
                            tmp,
                            pynini.concat(
                                sublexica.nodef_to_null,
                                pynini.acceptor(
                                    "t", token_type=self.__syms.alphabet)))),
                    pynini.transducer("",
                                      "<ADJ>",
                                      output_token_type=self.__syms.alphabet),
                    pynini.transducer("<CONV>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.transducer("",
                                      "<base> <nativ> <Adj+e>",
                                      output_token_type=self.__syms.alphabet)),
                # branch 2: past or present participle ending in "e n"
                # or "n d"
                pynini.concat(
                    pynini.compose(
                        pynini.concat(
                            alphabet,
                            pynini.transducer(
                                "<V>",
                                "<+V>",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet),
                            pynini.acceptor(
                                "<zu>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                            pynini.string_map(
                                ["<PPast>", "<PPres>"],
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet).
                            project()),
                        pynini.compose(
                            tmp,
                            pynini.concat(
                                sublexica.nodef_to_null,
                                pynini.acceptor(
                                    "e n", token_type=self.__syms.alphabet)
                                | pynini.acceptor(
                                    "n d", token_type=self.__syms.alphabet)))),
                    pynini.transducer("",
                                      "<ADJ>",
                                      output_token_type=self.__syms.alphabet),
                    pynini.transducer("<CONV>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.transducer(
                        "",
                        "<base> <nativ> <Adj+>",
                        output_token_type=self.__syms.alphabet)))).optimize()
Ejemplo n.º 24
0
def get_paths(decode_graph, isymbs, osymbs, phoneme_list):
    '''
    Compose a phoneme sequence with the decoding graph and return all
    resulting paths.

    Args:
        decode_graph: FST to decode through.
        isymbs: input symbol table (also used to tokenize the phonemes).
        osymbs: output symbol table.
        phoneme_list: list of phoneme strings.

    Returns:
        List of paths from the composed lattice.
    '''
    phoneme_fst = pynini.acceptor(" ".join(phoneme_list), token_type=isymbs)
    lattice = pynini.compose(phoneme_fst, decode_graph)
    # list() instead of a pass-through comprehension
    return list(lattice.paths(input_token_type=isymbs,
                              output_token_type=osymbs))
Ejemplo n.º 25
0
    def __construct_ge_nom_stems_v(self, tmp):
        '''
        Stems for ge nominalization of verbs ("Gejammer").

        Extracts verb infinitives from ``tmp``, reinserts a <Ge> marker
        after the prefix, strips the infinitive ending "e n", and marks
        the result as a native derivation stem.
        '''
        alphabet = pynini.union(
            self.__syms.characters, self.__syms.categories,
            pynini.string_map(
                ["<CONV>", "<SUFF>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # extract infinitives
        infinitives = pynini.compose(
            pynini.concat(
                pynini.concat(
                    self.__syms.characters.closure(1),
                    pynini.acceptor(
                        "<PREF>", token_type=self.__syms.alphabet)).closure(),
                alphabet.closure(1),
                pynini.transducer("",
                                  "<+V> <Inf>",
                                  output_token_type=self.__syms.alphabet)),
            tmp).optimize()

        # deletes "g e <PREF> <Ge>" on the input side, i.e. only forms
        # containing that ge-prefix sequence survive (TODO confirm)
        insert_ge = pynini.concat(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.acceptor("<PREF>",
                                token_type=self.__syms.alphabet)).closure(),
            pynini.transducer("g e <PREF> <Ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure(1)).optimize()

        # infinitives with the <Ge> marker present, mapped to lower case
        inserted_ge = pynini.compose(
            pynini.compose(insert_ge, infinitives).project(),
            pynini.union(
                self.__syms.to_lower, self.__syms.categories,
                self.__syms.prefix_suffix_marker,
                pynini.acceptor(
                    "<Ge>",
                    token_type=self.__syms.alphabet)).closure()).optimize()

        return pynini.concat(
            # mark the result as a derivation stem
            pynini.transducer("",
                              "<Deriv_Stems>",
                              output_token_type=self.__syms.alphabet),
            pynini.compose(
                pynini.compose(
                    pynini.compose(
                        # allow <Ge> to be inserted anywhere while matching
                        pynini.union(
                            alphabet,
                            pynini.acceptor("<PREF>",
                                            token_type=self.__syms.alphabet),
                            pynini.transducer("",
                                              "<Ge>",
                                              output_token_type=self.__syms.
                                              alphabet)).closure(),
                        inserted_ge),
                    # drop category and prefix/suffix markers
                    pynini.union(
                        self.__syms.characters,
                        pynini.acceptor("<Ge>",
                                        token_type=self.__syms.alphabet),
                        pynini.transducer(
                            pynini.union(self.__syms.categories,
                                         self.__syms.prefix_suffix_marker),
                            "")).closure()),
                # strip the infinitive ending "e n"
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.acceptor("<Ge>",
                                        token_type=self.__syms.alphabet),
                    ).closure(1),
                    pynini.transducer("e n",
                                      "",
                                      input_token_type=self.__syms.alphabet))),
            pynini.acceptor("<V>", token_type=self.__syms.alphabet),
            pynini.transducer(
                "", "<deriv> <nativ>",
                output_token_type=self.__syms.alphabet)).optimize()
Ejemplo n.º 26
0
                                              ["t", "d"]])
# Consonant gradation (reduction of double consonants) before suffixes.
# References: http://www.lysator.liu.se/language/Languages/Finnish/Grammar.html
# and https://web.stanford.edu/~kiparsky/Papers/finnish.article.pdf
consonant_reduction = pynini.cdrewrite(double_consonants_reduce,
                                       "l" | vowels | "n", vowels + suffixes,
                                       closure).optimize()

# Vowel insertion to break consonant clusters caused by suffixes.
insertion = pynini.cdrewrite(pynini.transducer("", "e"), consonants, suffixes,
                             closure).optimize()

# Finnish seems to attempt preserving morae count with /s/ as a syllabic end:
# a stop is generated that assimilates the 'highness' of the vowel and becomes
# /k/. If this generated stop occurs after VV, it instead assimilates /s/ and
# becomes /t/; gradation then occurs due to /e/ insertion.
# A similar situation occurs with /s/ -> /a/ after /a/ + suffix, hence the
# third rule below.
final_stress_preservation = pynini.cdrewrite(
    pynini.transducer("s", "t"), vowels +
    (pynini.acceptor("y") | "u"), suffixes, closure) * pynini.cdrewrite(
        pynini.transducer("", "k"),
        pynini.acceptor("y") | "u",
        "s" + suffixes, closure) * pynini.cdrewrite(
            pynini.transducer("s", "a"), "a", suffixes, closure)
final_stress_preservation.optimize()

# Rule for /nt/ assimilation.
nt_assimilation = pynini.cdrewrite(pynini.transducer("t", "n"), "n",
                                   vowels + suffixes, closure).optimize()

# Combination of all rules (chained via the overloaded * operator) into the
# final case transducers.
transducer_adessive = regularize * transducer_adessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize
transducer_inessive = regularize * transducer_inessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize

######################### Generates FAR ###############################
Ejemplo n.º 27
0
####

# Directory of this script, so companion files resolve relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__))

# Shared symbol table used by all acceptors below.
ST = pynini.SymbolTable.read_text(f"{dir_path}/syms.txt")


def draw(x, opt=True):
    '''Render ``x`` to obdd.dot next to this script (optimizing first unless
    ``opt`` is False).'''
    if opt:
        x.optimize()
    x.draw(f"{dir_path}/obdd.dot", ST, ST, portrait=True, acceptor=True)


####

# constants
T = pynini.acceptor("T", token_type=ST)  # the "true" symbol
F = pynini.acceptor("F", token_type=ST)  # the "false" symbol
ANY = T | F  # a single truth value
TAUT = (T | F).closure()  # sigma-star: accepts everything (tautology)
ABSURD = T - T  # the empty language: accepts nothing


# boolean operators
def AND(*args):
    """Boolean conjunction: intersect the FSAs of all *args* (empty -> TAUT)."""
    # v(...) lifts each argument to an acceptor — defined earlier in this file.
    result = TAUT
    for term in args:
        result = pynini.intersect(result, v(term))
    return result


def OR(*args):
    """Boolean disjunction: union the FSAs of all *args* (empty -> ABSURD)."""
    # v(...) lifts each argument to an acceptor — defined earlier in this file.
    result = ABSURD
    for term in args:
        result = pynini.union(result, v(term))
    return result

Ejemplo n.º 28
0
    def __construct_uplow(self):
        '''
    Upper/Lower case markers

    Realizes the word-final case-control markers <Low#>, <Up#> and <Fix#>
    (all deleted on output) by rewriting word-initial <CB> / <epsilon> into
    the appropriate capitalization signal (<^UC> or <CB>).
    '''

        # Word-internal symbols that may surround the case markers
        # (note: <CB> itself is not part of this alphabet).
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoDef>", "<FB>", "<UL>",
                    "<SS>", "<DEL-S>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>",
                    "<^imp>", "<ge>", "<^zz>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        # s: a non-empty symbol string; <CB> may occur anywhere after the
        # first symbol.
        s = pynini.concat(
            alphabet,
            pynini.union(
                alphabet,
                pynini.acceptor(
                    "<CB>",
                    token_type=self.__syms.alphabet)).closure()).optimize()

        # s2: like s, but the initial character fixes its case — an
        # upper-case character must consume a <CB>, a lower-case one may.
        s2 = pynini.concat(
            pynini.union(
                pynini.concat(
                    pynini.transducer("<CB>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    self.__syms.characters_upper),
                pynini.concat(
                    pynini.transducer(
                        "<CB>", "",
                        input_token_type=self.__syms.alphabet).closure(0, 1),
                    self.__syms.characters_lower)), s).optimize()

        return pynini.union(
            # <^UC> ... <Low#>: move the <^UC> signal past an optional
            # <NoDef>/<NoHy> marker and delete the trailing <Low#>.
            pynini.concat(
                pynini.transducer("<^UC>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                pynini.string_map(
                    ["<NoDef>", "<NoHy>"],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project().closure(
                        0, 1),
                pynini.transducer("",
                                  "<^UC>",
                                  output_token_type=self.__syms.alphabet), s2,
                pynini.transducer("<Low#>",
                                  "",
                                  input_token_type=self.__syms.alphabet)),
            pynini.concat(
                pynini.acceptor("<NoHy>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                pynini.union(
                    # <Fix#>: keep the spelling as is, just delete <CB>.
                    pynini.concat(
                        pynini.transducer(
                            "<CB>", "", input_token_type=self.__syms.alphabet),
                        s,
                        pynini.transducer(
                            "<Fix#>",
                            "",
                            input_token_type=self.__syms.alphabet)),
                    # <Up#>: rewrite a leading <CB>/<epsilon> to <^UC>
                    # (capitalize the first letter).
                    pynini.concat(
                        pynini.transducer(pynini.string_map(
                            ["<CB>", "<epsilon>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                                          "<^UC>",
                                          output_token_type=self.__syms.
                                          alphabet), s,
                        pynini.transducer(
                            "<Up#>", "",
                            input_token_type=self.__syms.alphabet)),
                    # <Low#>: rewrite a leading <CB>/<epsilon> to <CB>
                    # (lower-case continuation).
                    pynini.concat(
                        pynini.transducer(pynini.string_map(
                            ["<CB>", "<epsilon>"],
                            input_token_type=self.__syms.alphabet,
                            output_token_type=self.__syms.alphabet).project(),
                                          "<CB>",
                                          output_token_type=self.__syms.
                                          alphabet), s,
                        pynini.transducer("<Low#>",
                                          "",
                                          input_token_type=self.__syms.alphabet
                                          ))))).optimize()
Ejemplo n.º 29
0
    def __construct_prefix_origin_filter(self):
        '''
    Match origin of prefix and stem

    Accepts "<Pref_Stems>" followed by the prefix characters and a deleted
    "<CAT> <ORIGIN>" marking; the following stem must repeat the same
    category and carry a compatible origin feature.
    '''

        def sym(text):
            # Acceptor for a single symbol (or symbol string) over the
            # internal alphabet.
            return pynini.acceptor(text, token_type=self.__syms.alphabet)

        def agreement(category, origin, stem_origin):
            # Delete "<CAT> <ORIGIN>" on the prefix side and require the stem
            # to show the same category, its stem type features, and an
            # origin accepted by stem_origin.
            return pynini.concat(
                pynini.transducer(category + " " + origin,
                                  "",
                                  input_token_type=self.__syms.alphabet),
                self.__prefix_filter_helper,
                sym(category),
                self.__syms.stem_type_features,
                stem_origin)

        nativ = sym("<nativ>")
        fremd = sym("<fremd>")
        # Stem origins compatible with a classical (<klassisch>) prefix.
        klassisch = pynini.string_map(
            ["<frei>", "<gebunden>", "<kurz>", "<lang>"],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet).project()

        # The permitted (category, prefix origin, stem origin) combinations;
        # this table replaces twelve hand-expanded copies of the pattern.
        agreements = pynini.union(
            agreement("<ADJ>", "<nativ>", nativ),
            agreement("<ABK>", "<nativ>", nativ),
            agreement("<NN>", "<nativ>", nativ),
            agreement("<NN>", "<fremd>", fremd),
            agreement("<NE>", "<nativ>", nativ),
            agreement("<NE>", "<fremd>", fremd),
            agreement("<ADJ>", "<fremd>", fremd),
            agreement("<V>", "<nativ>", nativ),
            agreement("<V>", "<nativ>", self.__syms.ns_features),
            agreement("<ADJ>", "<klassisch>", klassisch),
            agreement("<NN>", "<klassisch>", klassisch),
            agreement("<V>", "<klassisch>", klassisch))

        # Characters and surface markers allowed inside the prefix itself.
        prefix_body = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).closure()

        return pynini.concat(
            sym("<Pref_Stems>"),
            pynini.concat(prefix_body, agreements)).optimize()
Ejemplo n.º 30
0
    def __construct_insert_ge(self):
        '''
    Inserts the prefix "ge" controlled by the symbol "<ge>"

    <ge> is deleted in all cases; the surface string "g e" is only
    produced when a perfect-participle marker <^pp> follows, and the
    <Ge>-nominalisation marker is realized as "e".
    '''

        # Symbols that may occur in an entry besides the stem type markers.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                    "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                    "<Up#>", "<Fix#>", "<^imp>", "<^zz>", "<^UC>", "<^Ax>",
                    "<^pl>", "<^Gen>", "<^Del>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        # c2: arbitrary prefix context (alphabet symbols or stem type markers).
        c2 = pynini.union(alphabet,
                          self.__syms.stem_types).closure().optimize()

        # From deko.fst:
        # replace <ge> with "ge" if followed by perfect participle marker
        # or ge-nominalisation otherwise delete <ge>
        # in complex lexicon entries as for "haushalten" <ge> is not followed
        # by <Base_Stems>
        return pynini.union(
            # (1) entries without any of the markers handled below
            c2,
            # (2) <ge> ... <^pp>: delete <ge>, insert surface "g e"
            pynini.concat(
                c2,
                pynini.transducer("<ge>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                pynini.acceptor("<Base_Stems>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                pynini.transducer("",
                                  "g e",
                                  output_token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.transducer("<^pp>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure()),
            # (3) <Ge>-nominalisation: delete <Ge>, realize
            #     "<Suff_Stems> <Ge-Nom>" as "e"
            pynini.concat(
                c2,
                pynini.acceptor("<Deriv_Stems>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                alphabet.closure(),
                pynini.transducer("<Ge>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.transducer("<Suff_Stems> <Ge-Nom>",
                                  "e",
                                  input_token_type=self.__syms.alphabet,
                                  output_token_type=self.__syms.alphabet),
                alphabet.closure()),
            # (4) <ge> without a following <^pp>: just delete it
            pynini.concat(
                c2,
                pynini.transducer("<ge>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                pynini.acceptor("<Base_Stems>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                alphabet.closure()),
            # (5) <^pp> without a preceding <ge>: just delete the marker
            pynini.concat(
                c2,
                pynini.acceptor("<Base_Stems>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                alphabet.closure(),
                pynini.transducer("<^pp>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure())).optimize()
Ejemplo n.º 31
0
    def __construct_compound_filter(self):
        '''
    Construct the compound filter

    Deletes category and origin features inside a compound and restricts
    which categories may occur as compound members; the final member keeps
    its <base> origin and inflection class.
    '''

        # Pass characters/markers through; delete categories, origin
        # features and <NoPref> everywhere inside the compound.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
            pynini.transducer(self.__syms.categories, ""),
            pynini.transducer(self.__syms.origin_features, ""),
            pynini.transducer("<NoPref>",
                              "",
                              input_token_type=self.__syms.alphabet))

        return pynini.concat(
            # Optional initial marker: <Initial> is deleted,
            # <NoHy>/<NoDef> are kept.
            pynini.union(
                pynini.transducer("<Initial>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                pynini.acceptor("<NoHy>", token_type=self.__syms.alphabet),
                pynini.acceptor("<NoDef>",
                                token_type=self.__syms.alphabet)).closure(
                                    0, 1),
            pynini.concat(
                pynini.union(
                    # Simplex entry: final category from the closed list
                    # below, deleted on output.
                    pynini.concat(
                        alphabet.closure(),
                        pynini.transducer(
                            pynini.string_map(
                                [
                                    "<ABK>", "<ADV>", "<CARD>", "<NE>",
                                    "<PRO>", "<V>", "<ORD>", "<OTHER>"
                                ],
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet).
                            project(), "")),
                    # Verbal compound: mark with inserted <VADJ>, delete the
                    # <kompos> markers and the final <V> category.
                    pynini.concat(
                        pynini.transducer(
                            "",
                            "<VADJ>",
                            output_token_type=self.__syms.alphabet),
                        pynini.union(
                            alphabet,
                            pynini.transducer("<kompos>",
                                              "",
                                              input_token_type=self.__syms.
                                              alphabet)).closure(),
                        pynini.transducer(
                            "<kompos>",
                            "",
                            input_token_type=self.__syms.alphabet),
                        alphabet.closure(),
                        pynini.transducer(
                            "<V>", "", input_token_type=self.__syms.alphabet)),
                    # Adjectival/nominal compound: delete <kompos> markers
                    # and the final <ADJ>/<NN> category.
                    pynini.concat(
                        pynini.union(
                            alphabet,
                            pynini.transducer("<kompos>",
                                              "",
                                              input_token_type=self.__syms.
                                              alphabet)).closure(),
                        pynini.transducer(
                            pynini.string_map(
                                ["<ADJ>", "<NN>"],
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet).
                            project(), ""))),
                # Final member: delete <base> and its origin feature, keep
                # the inflection class.
                pynini.concat(
                    pynini.transducer("<base>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.transducer(self.__syms.origin_features, ""),
                    self.__syms.inflection_classes))).optimize()