Code example #1
    def __construct_del_ge(self):
        '''
    Case-dependent deletion of the ge marker
    '''

        # delete <ge> after certain prefixes like 'ver'
        return pynini.concat(
            pynini.transducer("<no-ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            pynini.concat(
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.string_map(["<n>", "<e>", "<d>",
                                           "<~n>"]).project()).closure(),
                    pynini.concat(
                        pynini.transducer(
                            "<V> <nativ>",
                            "",
                            input_token_type=self.__syms.alphabet),
                        pynini.acceptor(
                            "<NoDef>",
                            token_type=self.__syms.alphabet).closure(0, 1),
                        pynini.transducer(
                            "<ge>", "", input_token_type=self.__syms.alphabet),
                        self.__prefix_filter_helper,
                        self.__syms.stem_type_features,
                        pynini.acceptor(
                            "<nativ>",
                            token_type=self.__syms.alphabet))))).optimize()
Code example #2
  def __init__(self,
               alphabet,
               insert_cost=DEFAULT_INSERT_COST,
               delete_cost=DEFAULT_DELETE_COST,
               substitute_cost=DEFAULT_SUBSTITUTE_COST):
    """Constructor.

    Args:
      alphabet: edit alphabet (an iterable of strings).
      insert_cost: the cost for the insertion operation.
      delete_cost: the cost for the deletion operation.
      substitute_cost: the cost for the substitution operation.
    """
    # Left factor; note that we divide the edit costs by two because they also
    # will be incurred when traversing the right factor.
    match = union(*alphabet).optimize(True)
    i_insert = transducer("", "[{}]".format(self.INSERT),
                          weight=insert_cost / 2).optimize(True)
    i_delete = transducer(match, "[{}]".format(self.DELETE),
                          weight=delete_cost / 2).optimize(True)
    i_substitute = transducer(match, "[{}]".format(self.SUBSTITUTE),
                              weight=substitute_cost / 2).optimize(True)
    i_ops = union(match, i_insert, i_delete, i_substitute).optimize(True)
    # Right factor; this is constructed by inverting the left factor (i.e.,
    # swapping the input and output labels), then swapping the insert and delete
    # labels on what is now the input side.
    o_ops = invert(i_ops)
    syms = o_ops.input_symbols()
    insert_label = syms.find(self.INSERT)
    delete_label = syms.find(self.DELETE)
    o_ops.relabel_pairs(ipairs=((insert_label, delete_label),
                                (delete_label, insert_label)))
    # Computes the closure for both sets of ops.
    self._e_i = i_ops.closure().optimize(True)
    self._e_o = o_ops.closure().optimize(True)
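
The excerpt above only builds the two factors; in the full class they are combined by composition to compute edit distance. Below is a minimal sketch of such a method, assuming the same pynini version and the same bare imports as the constructor (additionally `compose` and `shortestdistance`); the method name `distance` is illustrative, not taken from the excerpt:

  def distance(self, query, reference):
    """Sketch: minimum edit cost between two strings (or automata)."""
    # Left half: query rewritten into edit labels; right half: edit labels
    # rewritten into the reference. Composing the halves aligns the two.
    lattice = compose(compose(query, self._e_i), compose(self._e_o, reference))
    if lattice.num_states() == 0:
      raise ValueError("Composition failure: no alignment found")
    # The total weight of the cheapest path through the lattice is the distance.
    return float(shortestdistance(lattice, reverse=True)[lattice.start()])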
Code example #3
 def __construct_compound_stems_nn(self, tmp):
     '''
 Default noun compounding stems
 '''
     return pynini.concat(
         pynini.transducer("",
                           "<Kompos_Stems>",
                           output_token_type=self.__syms.alphabet),
         pynini.compose(
             pynini.concat(
                 self.__syms.characters.closure(1),
                 pynini.union(
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Sg>",
                                 token_type=self.__syms.alphabet))),
                     pynini.transducer(
                         "",
                         pynini.concat(
                             pynini.acceptor(
                                 "<+NN>", token_type=self.__syms.alphabet),
                             self.__syms.gender,
                             pynini.acceptor(
                                 "<Nom> <Pl>",
                                 token_type=self.__syms.alphabet))))), tmp),
         pynini.acceptor("<NN>", token_type=self.__syms.alphabet),
         pynini.transducer(
             "", "<kompos> <nativ>",
             output_token_type=self.__syms.alphabet)).optimize()
Code example #4
    def __construct_r1(self):
        '''
    Umlaut

    Apfel$ ==> Äpfel
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<UL>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<e>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>",
                    "<NoDef>", "<UL>", "<FB>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # r1a
        tau = pynini.push(pynini.string_map(
            [("a", "ä"), ("o", "ö"), ("u", "ü"), ("A", "Ä"), ("O", "Ö"),
             ("U", "Ü")],
            input_token_type=self.__syms.alphabet,
            output_token_type=self.__syms.alphabet),
                          push_labels=True)
        lc = pynini.union(
            self.__syms.consonants,
            pynini.string_map(
                ["<CB>", "<WB>", "<NoHy>", "<NoDef>", "<^UC>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()
        r1a = pynini.cdrewrite(
            tau, lc,
            pynini.concat(
                alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure())

        # r1c
        tau = pynini.transducer("a", "", input_token_type=self.__syms.alphabet)
        r1c = pynini.cdrewrite(
            tau,
            pynini.string_map(
                ["ä", "Ä"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            pynini.concat(
                self.__syms.consonants_lower, alphabet.closure(),
                pynini.acceptor("<UL>", token_type=self.__syms.alphabet)),
            alphabet.closure()).optimize()

        # r1d
        r1d = pynini.cdrewrite(
            pynini.transducer("<UL>",
                              "<FB>",
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet), "", "",
            alphabet.closure())

        return pynini.compose(r1a, pynini.compose(r1c, r1d)).optimize()
Code example #5
    def __init__(self, syms, sublexica, deko_filter, inflection, phon):

        #
        # store alphabet
        self.__syms = syms

        #
        # run parts of morphology building (cf. timur_fst)
        tmp = (sublexica.verbal_pref_stems + sublexica.base_stems
               ) * sublexica.nodef_to_null * deko_filter.pref_filter
        tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

        # ANY alphabet; TODO: Move to symbols!
        alphabet = pynini.union(
            syms.characters, syms.stem_types,
            pynini.string_map([
                "<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>",
                "<UL>", "<NoHy>", "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"
            ],
                              input_token_type=syms.alphabet,
                              output_token_type=syms.alphabet).project()
        ).closure().optimize()

        tmp = (tmp + inflection.inflection) * (
            alphabet + inflection.inflection_filter
        ) * deko_filter.infix_filter * deko_filter.uplow

        tmp = pynini.compose(
            pynini.concat(
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
                tmp,
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
            ), phon.phon).optimize()

        #
        # default stems

        # create a default composition stem for nouns
        self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

        # create a deriv stem for Ge nominalization (Gelerne)
        self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

        # create an adjective base stem from participles
        self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
        self.__participle_adj.draw("participle_adj.dot", portrait=True)
Code example #6
 def transducer(cls, fsm1, fsm2):
     if not isinstance(fsm1, cls):
         fsm1 = PyniniWrapper.fromItem(fsm1)
     if not isinstance(fsm2, PyniniWrapper):
         fsm2 = PyniniWrapper.fromItem(fsm2)
     fsm = pynini.transducer(fsm1.fsm, fsm2.fsm)
     return cls(fsm)
Code example #7
    def __construct_r21(self):
        '''
    Low to up

    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        self.__syms.to_upper.draw("to_upper.dot")
        # Construction in SFST involves negation (which is expensive).
        # It looks like we can do better:
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<^UC>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_upper))).closure(),
                           push_labels=True).optimize()
Code example #8
    def __construct_r20(self):
        '''
    Up to low

    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                ["<^UC>", "<NoHy>", "<NoDef>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        #
        # SFST uses a rewrite rule here
        return pynini.push(pynini.union(
            alphabet.closure(),
            pynini.concat(
                pynini.transducer(
                    "<CB>", "",
                    input_token_type=self.__syms.alphabet).closure(1),
                pynini.union(
                    pynini.string_map(
                        ["<^UC>", "<NoHy>", "<NoDef>"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    self.__syms.to_lower))).closure(),
                           push_labels=True).optimize()
Code example #9
    def __construct_r14(self):
        '''
    e-epenthesis 2
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        tau = pynini.transducer("<DEL-S>",
                                "e",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet)
        return pynini.cdrewrite(
            tau,
            pynini.union(
                pynini.concat(
                    pynini.string_map(
                        ["d", "t"],
                        input_token_type=self.__syms.alphabet,
                        output_token_type=self.__syms.alphabet).project(),
                    pynini.acceptor("m",
                                    token_type=self.__syms.alphabet).closure(
                                        0, 1)),
                pynini.acceptor("t w", token_type=self.__syms.alphabet)), "",
            alphabet.closure()).optimize()
Code example #10
import re

import pynini


def load_lexicon(source, symbol_table):
    '''
  Load lexicon entries from the source, interpreting them using the given symbol table.
  '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)
    # longest match, prefer complex over simple symbols
    tokenizer = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
    for line in source:
        line = line.strip()
        if line:
            tmp = pynini.Fst()
            tmp.set_input_symbols(symbol_table)
            tmp.set_output_symbols(symbol_table)
            start = tmp.add_state()
            tmp.set_start(start)
            tmp.set_final(start)
            for token in tokenizer.findall(line):
                if token[1]:
                    tmp = pynini.concat(
                        tmp,
                        pynini.transducer(token[0],
                                          token[1],
                                          input_token_type=symbol_table,
                                          output_token_type=symbol_table))
                else:
                    tmp = pynini.concat(
                        tmp, pynini.acceptor(token[0],
                                             token_type=symbol_table))
            lex = pynini.union(lex, tmp)
    return lex
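
A minimal usage sketch (not part of the original module): build a symbol table containing epsilon, the plain characters, and the multi-character symbols used in the entries, then hand `load_lexicon` an iterable of lines. The entries and table contents below are purely illustrative and assume the same pynini version, whose `acceptor`/`transducer` accept a symbol table as token type:

if __name__ == "__main__":
    # Illustrative table; a real one would enumerate the whole alphabet.
    syms = pynini.SymbolTable()
    syms.add_symbol("<epsilon>")  # first symbol gets key 0 (epsilon)
    for ch in "HausBot":
        syms.add_symbol(ch)
    syms.add_symbol("<NN>")
    lex = load_lexicon(["Haus<NN>", "Boot<NN>"], syms)
    print(lex.num_states())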
Code example #11
    def __construct_suff_phon(self):
        '''
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>",
                    "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>",
                    "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project(),
            self.__syms.stem_types,
        ).closure()

        Tau = pynini.transducer("i", "", input_token_type=self.__syms.alphabet)
        Lambda = pynini.concat(
            pynini.union(
                pynini.acceptor("i", token_type=self.__syms.alphabet),
                pynini.concat(
                    self.__syms.consonants.project(),
                    pynini.acceptor("y", token_type=self.__syms.alphabet))),
            pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet))

        return pynini.concat(
            pynini.cdrewrite(Tau, Lambda, "", alphabet.project()),
            self.__tail).optimize()
Code example #12
def main(args: argparse.Namespace) -> None:
    # Sets of labels for the covering grammar.
    g_labels: Set[int] = set()
    p_labels: Set[int] = set()
    # Curries compiler and compactor functions for the FARs.
    compiler = functools.partial(pynini.acceptor,
                                 token_type=args.token_type,
                                 attach_symbols=False)
    compactor = functools.partial(pywrapfst.convert, fst_type="compact_string")
    logging.info("Constructing grapheme and phoneme FARs")
    g_writer = pywrapfst.FarWriter.create(args.g_far_path)
    p_writer = pywrapfst.FarWriter.create(args.p_far_path)
    with open(args.input_path, "r") as source:
        for (linenum, line) in enumerate(source, 1):
            key = f"{linenum:08x}"
            (g, p) = line.rstrip().split("\t", 1)
            # For both G and P, we compile a FSA, store the labels, and then
            # write the compact version to the FAR.
            g_fst = compiler(g)
            g_labels.update(g_fst.paths().ilabels())
            g_writer[key] = compactor(g_fst)
            p_fst = compiler(p)
            p_labels.update(p_fst.paths().ilabels())
            p_writer[key] = compactor(p_fst)
    logging.info("Processed %d examples", linenum)
    logging.info("Constructing covering grammar")
    logging.info("%d unique graphemes", len(g_labels))
    g_side = _label_union(g_labels, args.input_epsilon)
    logging.info("%d unique phonemes", len(p_labels))
    p_side = _label_union(p_labels, args.output_epsilon)
    # The covering grammar is given by (G x P)^*, a zeroth order Markov model.
    covering = pynini.transducer(g_side, p_side).closure().optimize()
    assert covering.num_states() == 1, "Covering grammar FST is ill-formed"
    logging.info("Covering grammar has %d arcs", _narcs(covering))
    covering.write(args.covering_path)
Code example #13
 def __construct_rep_pref(self):
     '''
 Replace the marker of manually prefixed stems
 '''
     return pynini.cdrewrite(
         pynini.transducer("<prefnativ>",
                           "<nativ>",
                           input_token_type=self.__syms.alphabet,
                           output_token_type=self.__syms.alphabet), "", "",
         self.__prefix_filter_helper).optimize()
Code example #14
    def __construct_r13(self):
        '''
    e-epenthesis 1
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<CB>", "<FB>", "<DEL-S>", "<SS>", "<WB>", "<^UC>",
                    "<^Ax>", "<^pl>", "<^Gen>", "<^Del>", "<NoHy>", "<NoDef>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        return pynini.union(
            alphabet,
            pynini.transducer(
                pynini.string_map(
                    [
                        "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                        "<^Ax>", "<WB>"
                    ],
                    input_token_type=self.__syms.alphabet,
                    output_token_type=self.__syms.alphabet).project(),
                "")).closure().optimize()
Code example #15
File: map_fst.py Project: gitter-badger/timur
 def __split_disjunctive_feats(self, disjunctive_feat_list):
   single_splits = []
   for disjunctive_feat in disjunctive_feat_list:
     splitted = []
     for cat in disjunctive_feat[1:-1].split(","):
       splitted.append("<" + cat + ">")
     # map the disjunctive feature onto the union of its member categories
     single_splits.append(
         pynini.transducer(
             disjunctive_feat,
             pynini.string_map(splitted,
                               input_token_type=self.__syms.alphabet,
                               output_token_type=self.__syms.alphabet),
             input_token_type=self.__syms.alphabet,
             output_token_type=self.__syms.alphabet))
   return pynini.union(*single_splits).optimize()
Code example #16
File: sublexica.py Project: gitter-badger/timur
 def __construct_quant_suff_stems(self):
     '''
 Derivation suffixes which combine with a number and a simplex stem
 '''
     return pynini.compose(
         self.__lex,
         pynini.concat(
             pynini.transducer("<QUANT>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Suff_Stems>",
                             token_type=self.__syms.alphabet),
             pynini.transducer("<simplex>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Code example #17
    def __construct_insert_zu(self):
        '''
    Inserts "zu" into infinitives with separable prefixes
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                    "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                    "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>",
                    "<^Gen>", "<^Del>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        c2 = pynini.union(alphabet,
                          self.__syms.stem_types).closure().optimize()

        # From deko.fst:
        # insert "zu" after verbal prefixes if followed by infinitive marker
        return pynini.union(
            c2,
            #pynini.concat(
            #  pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
            #  alphabet.closure(),
            #  pynini.transducer("<^zz>", "", input_token_type=self.__syms.alphabet),
            #  alphabet.closure()
            #  ),
            pynini.concat(
                c2,
                pynini.acceptor("<Pref_Stems>",
                                token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.acceptor("<Base_Stems>",
                                token_type=self.__syms.alphabet),
                pynini.transducer("",
                                  "z u",
                                  output_token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.transducer("<^zz>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure())).optimize()
Code example #18
File: create_fst.py Project: Ulitochka/FOMA
    def __init__(self):

        super().__init__()

        attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
        attr_map_1 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
        attr_map_2 = pynini.transducer(self.cable_digits, 'Диаметр')
        attr_map_3 = pynini.transducer(self.cable_digits, '#соединительных_проводов')
        attr_map_4 = pynini.transducer(' . ', ' Диаметр ')

        attr_map_0_lc = self.ngram_comb
        attr_map_0_rc = self.cable_splitters
        attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()

        attr_map_3_lc = self.cable_splitters
        attr_map_3_rc = self.cable_splitters
        attr_map_3_s = pynini.cdrewrite(attr_map_3, attr_map_3_lc, attr_map_3_rc, self.alphabet).optimize()

        attr_map_2_lc = self.cable_floats
        attr_map_2_rc = self.cable_length_0
        attr_map_2_s = pynini.cdrewrite(attr_map_2, attr_map_2_lc, attr_map_2_rc, self.alphabet).optimize()

        attr_map_4_lc = self.cable_splitters
        attr_map_4_rc = self.cable_floats
        attr_map_4_s = pynini.cdrewrite(attr_map_2, attr_map_4_lc, attr_map_4_rc, self.alphabet).optimize()

        attr_map_5_lc = self.cable_digits
        attr_map_5_rc = self.cable_digits
        attr_map_5_s = pynini.cdrewrite(attr_map_4, attr_map_5_lc, attr_map_5_rc, self.alphabet).optimize()

        attr_map_6_rc = self.cable_length_0
        attr_map_6_lc = self.cable_length_1
        attr_map_6_s = pynini.cdrewrite(attr_map_1, attr_map_6_rc, attr_map_6_lc, self.alphabet).optimize()

        attr_map_comp_0 = pynini.compose(pynini.compose(attr_map_2_s, attr_map_4_s).optimize(), attr_map_5_s).optimize()

        self.rules = {
            'жилы': attr_map_0_s,
            'соединительные_провода': attr_map_3_s,
            'диаметр': attr_map_comp_0,
            'длина_кабеля': attr_map_6_s
        }
Code example #19
File: create_fst.py Project: Ulitochka/FOMA
    def __init__(self):
        super().__init__()

        attr_map_0 = pynini.transducer(self.cable_digits, '#жил')
        attr_map_1 = pynini.transducer(self.cable_digits, 'Сечение_кабеля')
        attr_map_2 = pynini.transducer(self.cable_digits, 'Длина_кабеля')
        attr_map_3 = pynini.transducer('.', 'Сечение_кабеля')

        attr_map_0_rc = self.cable_splitters
        attr_map_0_lc = pynini.union(" ")
        attr_map_0_s = pynini.cdrewrite(attr_map_0, attr_map_0_lc, attr_map_0_rc, self.alphabet).optimize()

        attr_map_1_rc = self.cable_splitters
        attr_map_1_lc = self.cable_floats
        attr_map_1_s = pynini.cdrewrite(attr_map_1, attr_map_1_rc, attr_map_1_lc, self.alphabet).optimize()

        attr_map_2_rc = self.cable_floats
        attr_map_2_lc = self.cable_length_0
        attr_map_2_s = pynini.cdrewrite(attr_map_1, attr_map_2_rc, attr_map_2_lc, self.alphabet).optimize()

        attr_map_3_rc = self.cable_length_0
        attr_map_3_lc = self.cable_length_1
        attr_map_3_s = pynini.cdrewrite(attr_map_2, attr_map_3_rc, attr_map_3_lc, self.alphabet).optimize()

        attr_map_4_rc = self.cable_digits
        attr_map_4_lc = self.cable_digits
        attr_map_4_s = pynini.cdrewrite(attr_map_3, attr_map_4_rc, attr_map_4_lc, self.alphabet).optimize()

        attr_map_5_rc = self.cable_splitters
        attr_map_5_lc = self.cable_length_0
        attr_map_5_s = pynini.cdrewrite(attr_map_1, attr_map_5_rc, attr_map_5_lc, self.alphabet).optimize()

        attr_map_comp_0 = pynini.compose(pynini.compose(attr_map_1_s, attr_map_2_s).optimize(), attr_map_4_s).optimize()

        self.rules = {
            'жилы': attr_map_0_s,
            'сечение_кабеля_0': attr_map_comp_0,
            'длина_кабеля': attr_map_3_s,
            'сечение_кабеля_1': attr_map_5_s
        }
Code example #20
File: sublexica.py Project: gitter-badger/timur
 def __construct_pref_deriv_suff_stems(self):
     '''
 Derivation suffixes which combine with prefixed stems
 '''
     return pynini.compose(
         self.__lex,
         pynini.concat(
             self.__syms.initial_features.closure(),
             pynini.acceptor("<Suff_Stems>",
                             token_type=self.__syms.alphabet),
             pynini.transducer("<prefderiv>",
                               "",
                               input_token_type=self.__syms.alphabet),
             self.__sigma_star)).optimize()
Code example #21
    def __construct_imperative_filter(self):
        '''
    Imperatives have no separable prefixes
    '''

        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(
                [
                    "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                    "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                    "<Up#>", "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>",
                    "<^Del>"
                ],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project()).optimize()

        c2 = pynini.union(
            alphabet,
            pynini.transducer(
                self.__syms.stem_types,
                "<CB>",
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet)).closure().optimize()

        return pynini.union(
            c2,
            pynini.concat(
                pynini.transducer("<Base_Stems>",
                                  "<CB>",
                                  input_token_type=self.__syms.alphabet,
                                  output_token_type=self.__syms.alphabet),
                alphabet.closure(),
                pynini.transducer("<^imp>",
                                  "",
                                  input_token_type=self.__syms.alphabet),
                alphabet.closure())).optimize()
Code example #22
 def _lexicon_covering(
     self,
     tsv_path: str,
     input_token_type: TokenType,
     input_epsilon: bool,
     output_token_type: TokenType,
     output_epsilon: bool,
 ) -> None:
     """Builds covering grammar and lexicon FARs."""
     # Sets of labels for the covering grammar.
     g_labels: Set[int] = set()
     p_labels: Set[int] = set()
     # Curries compiler functions for the FARs.
     icompiler = functools.partial(
         pynini.acceptor, token_type=input_token_type
     )
     ocompiler = functools.partial(
         pynini.acceptor, token_type=output_token_type
     )
     logging.info("Constructing grapheme and phoneme FARs")
     g_writer = pywrapfst.FarWriter.create(self.g_path)
     p_writer = pywrapfst.FarWriter.create(self.p_path)
     with open(tsv_path, "r") as source:
         for (linenum, line) in enumerate(source, 1):
             key = f"{linenum:08x}"
             (g, p) = line.rstrip().split("\t", 1)
             # For both G and P, we compile a FSA, store the labels, and
             # then write the compact version to the FAR.
             g_fst = icompiler(g)
             g_labels.update(g_fst.paths().ilabels())
             g_writer[key] = self._compactor(g_fst)
             p_fst = ocompiler(p)
             p_labels.update(p_fst.paths().ilabels())
             p_writer[key] = self._compactor(p_fst)
     logging.info("Processed %s examples", f"{linenum:,d}")
     logging.info("Constructing covering grammar")
     logging.info("%d unique graphemes", len(g_labels))
     g_side = self._label_union(g_labels, input_epsilon)
     logging.info("%d unique phones", len(p_labels))
     p_side = self._label_union(p_labels, output_epsilon)
     # The covering grammar is given by (G x P)^*.
     covering = pynini.transducer(g_side, p_side).closure().optimize()
     assert covering.num_states() == 1, "Covering grammar FST is ill-formed"
     logging.info(
         "Covering grammar has %s arcs",
         f"{PairNGramAligner._narcs(covering):,d}",
     )
     covering.write(self.c_path)
Code example #23
 def __suff_stems_filter(self, features):
     '''
 Return a union over filters for each feature given
 '''
     filtering = pynini.Fst()
     filtering.set_input_symbols(self.__syms.alphabet)
     filtering.set_output_symbols(self.__syms.alphabet)
     suff_stems = pynini.acceptor("<Suff_Stems>",
                                  token_type=self.__syms.alphabet)
     for feature in features:
         to_eps = pynini.transducer(feature,
                                    "",
                                    input_token_type=self.__syms.alphabet)
         filtering = pynini.union(filtering,
                                  pynini.concat(to_eps, suff_stems, to_eps))
     return filtering.optimize()
Code example #24
File: sublexica.py Project: gitter-badger/timur
    def __init__(self, syms, lexicon):

        #
        # store alphabet
        self.__syms = syms

        #
        # store lexicon
        self.__lex = lexicon

        #
        # (private) helpers
        self.__sigma_star = pynini.union(
            syms.characters,
            syms.categories,
            syms.stem_types,
            syms.stem_type_features,
            syms.origin_features,
            syms.circumfix_features,
            syms.inflection_classes,
            syms.geo_inflection_classes,
            pynini.acceptor("<ge>", token_type=syms.alphabet
                            )  # for word-internal <ge> (ausgewertet)
        ).closure().optimize()

        #
        # NoDef2NULL
        self.__nodef_to_null = pynini.union(
            self.__sigma_star, syms.origin_features,
            pynini.transducer("<NoDef>",
                              "",
                              input_token_type=self.__syms.alphabet),
            syms.stem_types).closure().optimize()

        #
        # sublexica
        self.__bdk_stems = self.__construct_bdk_stems()
        self.__base_stems = self.__construct_base_stems()
        self.__pref_stems = self.__construct_pref_stems()
        self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
        self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
        self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
        self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
        self.__quant_suff_stems = self.__construct_quant_suff_stems()
Code example #25
    def __construct_ge_nom_stems_v(self, tmp):
        '''
    Stems for ge nominalization of verbs ("Gejammer")
    '''
        alphabet = pynini.union(
            self.__syms.characters, self.__syms.categories,
            pynini.string_map(
                ["<CONV>", "<SUFF>"],
                input_token_type=self.__syms.alphabet,
                output_token_type=self.__syms.alphabet).project())

        # extract infinitives
        infinitives = pynini.compose(
            pynini.concat(
                pynini.concat(
                    self.__syms.characters.closure(1),
                    pynini.acceptor(
                        "<PREF>", token_type=self.__syms.alphabet)).closure(),
                alphabet.closure(1),
                pynini.transducer("",
                                  "<+V> <Inf>",
                                  output_token_type=self.__syms.alphabet)),
            tmp).optimize()

        insert_ge = pynini.concat(
            pynini.concat(
                self.__syms.characters.closure(1),
                pynini.acceptor("<PREF>",
                                token_type=self.__syms.alphabet)).closure(),
            pynini.transducer("g e <PREF> <Ge>",
                              "",
                              input_token_type=self.__syms.alphabet),
            alphabet.closure(1)).optimize()

        inserted_ge = pynini.compose(
            pynini.compose(insert_ge, infinitives).project(),
            pynini.union(
                self.__syms.to_lower, self.__syms.categories,
                self.__syms.prefix_suffix_marker,
                pynini.acceptor(
                    "<Ge>",
                    token_type=self.__syms.alphabet)).closure()).optimize()

        return pynini.concat(
            pynini.transducer("",
                              "<Deriv_Stems>",
                              output_token_type=self.__syms.alphabet),
            pynini.compose(
                pynini.compose(
                    pynini.compose(
                        pynini.union(
                            alphabet,
                            pynini.acceptor("<PREF>",
                                            token_type=self.__syms.alphabet),
                            pynini.transducer("",
                                              "<Ge>",
                                              output_token_type=self.__syms.
                                              alphabet)).closure(),
                        inserted_ge),
                    pynini.union(
                        self.__syms.characters,
                        pynini.acceptor("<Ge>",
                                        token_type=self.__syms.alphabet),
                        pynini.transducer(
                            pynini.union(self.__syms.categories,
                                         self.__syms.prefix_suffix_marker),
                            "")).closure()),
                pynini.concat(
                    pynini.union(
                        self.__syms.characters,
                        pynini.acceptor("<Ge>",
                                        token_type=self.__syms.alphabet),
                    ).closure(1),
                    pynini.transducer("e n",
                                      "",
                                      input_token_type=self.__syms.alphabet))),
            pynini.acceptor("<V>", token_type=self.__syms.alphabet),
            pynini.transducer(
                "", "<deriv> <nativ>",
                output_token_type=self.__syms.alphabet)).optimize()
Code example #26
File: tutorial.py Project: Ulitochka/FOMA
import pynini

chars = ([chr(i) for i in range(1, 91)] + ["\\[", "\\]", "\\\\"] +
         [chr(i) for i in range(94, 256)])
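# Note: chr(91)-chr(93) are "[", "\" and "]", which are special to pynini's
# string parser, so they are included above in escaped form instead.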
sigma_star = pynini.union(*chars).closure()
sigma_star.optimize()

input_string = "Do you have Camembert or Edam?"  # Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?
cheeses = ("Boursin", "Camembert", "Cheddar", "Edam", "Gruyere", "Ilchester",
           "Jarlsberg", "Red Leicester", "Stilton")
output_string = "Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?"

fst_target = pynini.string_map(cheeses)
ltag = pynini.transducer("", "<cheese>")
rtag = pynini.transducer("", "</cheese>")
substitution = ltag + fst_target + rtag

rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)
output = pynini.compose(input_string, rewrite).stringify()

#######################################################################################################################

singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),

    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis a higher priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),

    # Any sequence of bytes ending in "s" strips the "s".
Code example #27
#measure

back_vowel = pynini.union("u", "o", "a")
neutral_vowel = pynini.union("i", "e")
front_vowel = pynini.union("y", "ö", "ä")
vowel = pynini.union(back_vowel, neutral_vowel, front_vowel)
archiphoneme = pynini.union("A", "I", "E", "O", "U")
consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                         "p", "q", "r", "s", "t", "v", "w", "x", "z")
sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize()

adessive = "llA"
intervener = pynini.union(consonant, neutral_vowel).closure()
adessive_harmony = (
    pynini.cdrewrite(pynini.transducer("A", "a"), back_vowel + intervener, "",
                     sigma_star) *
    pynini.cdrewrite(pynini.transducer("A", "ä"), "", "", sigma_star)).optimize()


def make_adessive(stem):
    return ((stem + adessive) * adessive_harmony).stringify()


make_adessive("training")

singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis
Code example #28
File: m1_finnish.py Project: bonham79/MP1-bonham79
    ("ä", "A"), ("ö", "O"), ("š", "S")
]) | pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p",
                  "q", "r", "s", "t", "v", "w", "x", "z", "u", "o", "a", "y",
                  "i", "e", "-")).closure().optimize()

rvregularize = (pynini.string_map([
    ("A", "ä"), ("O", "ö"), ("S", "š")
]) | pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p",
                  "q", "r", "s", "t", "v", "w", "x", "z", "u", "o", "a", "y",
                  "i", "e", "-")).closure().optimize()

######################FST for harmony in suffix####################################################
regular_state = closure_regular.optimize()
harmony_state = closure_harmony.optimize()

adessive_regular_transduce = pynini.transducer(
    "", adessive_regular)  #, output_token_type="utf8")
adessive_harmony_transduce = pynini.transducer(
    "", adessive_harmony)  #, output_token_type="utf8")
inessive_regular_transduce = pynini.transducer(
    "", inessive_regular)  #, output_token_type="utf8")
inessive_harmony_transduce = pynini.transducer(
    "", inessive_harmony)  #, output_token_type="utf8")

transducer_adessive_harmony = harmony_state + adessive_harmony_transduce
transducer_adessive_regular = regular_state + adessive_regular_transduce
transducer_inessive_harmony = harmony_state + inessive_harmony_transduce
transducer_inessive_regular = regular_state + inessive_regular_transduce

transducer_adessive_base = transducer_adessive_regular | transducer_adessive_harmony
transducer_inessive_base = transducer_inessive_regular | transducer_inessive_harmony
Code example #29
    def __construct_participle_adj(self, tmp, sublexica):
        '''
    Stems for conversion of participles into adjectives
    '''
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map([
                "<VPART>", "<VPREF>", "<PREF>", "<CONV>", "<SUFF>", "<NN>",
                "<ADJ>", "<V>", "<FT>"
            ],
                              input_token_type=self.__syms.alphabet,
                              output_token_type=self.__syms.alphabet).project(
                              )).closure().optimize()

        return pynini.concat(
            pynini.transducer("",
                              "<Base_Stems>",
                              output_token_type=self.__syms.alphabet),
            pynini.union(
                pynini.concat(
                    pynini.compose(
                        pynini.concat(
                            alphabet,
                            pynini.transducer(
                                "<V>",
                                "<+V>",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet),
                            pynini.acceptor(
                                "<zu>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                            pynini.acceptor("<PPast>",
                                            token_type=self.__syms.alphabet)),
                        pynini.compose(
                            tmp,
                            pynini.concat(
                                sublexica.nodef_to_null,
                                pynini.acceptor(
                                    "t", token_type=self.__syms.alphabet)))),
                    pynini.transducer("",
                                      "<ADJ>",
                                      output_token_type=self.__syms.alphabet),
                    pynini.transducer("<CONV>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.transducer("",
                                      "<base> <nativ> <Adj+e>",
                                      output_token_type=self.__syms.alphabet)),
                pynini.concat(
                    pynini.compose(
                        pynini.concat(
                            alphabet,
                            pynini.transducer(
                                "<V>",
                                "<+V>",
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet),
                            pynini.acceptor(
                                "<zu>",
                                token_type=self.__syms.alphabet).closure(0, 1),
                            pynini.string_map(
                                ["<PPast>", "<PPres>"],
                                input_token_type=self.__syms.alphabet,
                                output_token_type=self.__syms.alphabet).
                            project()),
                        pynini.compose(
                            tmp,
                            pynini.concat(
                                sublexica.nodef_to_null,
                                pynini.acceptor(
                                    "e n", token_type=self.__syms.alphabet)
                                | pynini.acceptor(
                                    "n d", token_type=self.__syms.alphabet)))),
                    pynini.transducer("",
                                      "<ADJ>",
                                      output_token_type=self.__syms.alphabet),
                    pynini.transducer("<CONV>",
                                      "",
                                      input_token_type=self.__syms.alphabet),
                    pynini.transducer(
                        "",
                        "<base> <nativ> <Adj+>",
                        output_token_type=self.__syms.alphabet)))).optimize()
Code example #30
#!/usr/bin/python
###Improved T9 decoder that is biased towards word strings due to intersection with word transducer

####
#####To execute, run the executable with arguments (str) or (str, str)
####Feeding only a single argument runs the decoder, treating the string as T9-encoded
####Feeding two strings, with the second str = "e", runs the T9 encoder on the first string and returns the encoded string.

import pynini
import string
import sys
##Vocabulary
lm_char = pynini.Fst.read("t9.char.lm.4")
lm_word = pynini.Fst.read("t9.word.lm")
t9 = pynini.transducer("0", "[32]")
t9_relations = [
    "0", "1", "2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"
]

##Reading vocabulary into alphabet.
for i in range(10):
    for k in t9_relations[i]:
        t9 = pynini.union(pynini.transducer(str(i), str(k)), t9)
##Adding punctuation to vocabulary
for i in string.punctuation:
    t9 = t9 | pynini.transducer("1", "[" + str(ord(i)) + "]")
##Closure and optimization
t9.closure().optimize()
##Inversion yields the encoder (characters to digits)
encoder = pynini.invert(t9).optimize()
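
The decoder itself is not part of this excerpt. A minimal sketch of what it could look like under the setup above, composing the digit string with the digit-to-character relation `t9` and rescoring with the character LM (the function name `decode` and the exact use of `lm_char` are assumptions; the original additionally intersects with the word-level model, per the header comment):

def decode(digits):
    # Candidate character strings for the digit sequence ...
    lattice = pynini.compose(digits, t9)
    # ... rescored with the character LM; keep only the single best path.
    best = pynini.shortestpath(pynini.compose(lattice, lm_char))
    return best.stringify()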