def create_priors(priors, isym, osym, code):
    """Build a linear FST over the prior symbols, closed by a <sigma> placeholder.

    Args:
        priors: ";"-separated string of prior symbols.
        isym: symbol table attached to the FST as input symbols.
        osym: symbol table attached to the FST as output symbols.
        code: mapping from a symbol string to its integer label.

    Returns:
        A pair ``(machine, sigma_state)``. ``sigma_state`` is the last state
        created; it is reached through a <sigma> arc and carries a <sigma>
        self-loop. No final state is set here.
    """
    symbols = priors.split(";")

    machine = fst.Fst()
    machine.set_input_symbols(isym)
    machine.set_output_symbols(osym)
    tail = machine.add_state()
    machine.set_start(tail)

    def link(src, symbol, dst):
        # Same label on both tapes, weight 1.0 in the FST's own weight type.
        label = code[symbol]
        machine.add_arc(
            src,
            fst.Arc(label, label, fst.Weight(machine.weight_type(), 1.0), dst))

    # One fresh state and one arc per prior symbol.
    for symbol in symbols:
        head = machine.add_state()
        link(tail, symbol, head)
        tail = head

    # The <sigma> placeholder: one arc into the new state plus a self-loop.
    sigma_state = machine.add_state()
    link(tail, "<sigma>", sigma_state)
    link(sigma_state, "<sigma>", sigma_state)
    return machine, sigma_state
Esempio n. 2
0
def generate_phone_sequence_recognition_wfst(n):
    """Build a log-semiring WFST that loops over one HMM per phone.

    For every entry of the module-level ``phone_table`` other than '<eps>',
    an epsilon arc leaves the start state, the phone HMM is appended with
    ``generate_phone_wfst``, its last state is marked final, and an epsilon
    arc leads back to the start state.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST
    """
    recogniser = fst.Fst('log')

    # Single shared start state; every phone branch begins and ends here.
    root = recogniser.add_state()
    recogniser.set_start(root)

    for _, label in phone_table:
        if label == '<eps>':
            continue

        entry = recogniser.add_state()
        # NOTE(review): -log(N) is the cost of a "probability" N, not 1/N.
        # If a uniform prior over phones was intended this would be
        # -math.log(1 / N) -- confirm before relying on these weights.
        enter_weight = fst.Weight('log', -math.log(phone_table.num_symbols()))
        recogniser.add_arc(root, fst.Arc(0, 0, enter_weight, entry))

        exit_state = generate_phone_wfst(recogniser, entry, label, n)
        recogniser.set_final(exit_state)

        # Epsilon arc back to the start so another phone can follow.
        loop_weight = fst.Weight('log', -math.log(1))
        recogniser.add_arc(exit_state, fst.Arc(0, 0, loop_weight, root))

    return recogniser
Esempio n. 3
0
def generate_word_sequence_recognition_wfst(n):
    """Build a log-semiring WFST that loops over one word model per word.

    For every entry of the module-level ``word_table`` other than '<eps>',
    an epsilon arc leaves the start state, the word model is appended with
    ``generate_word_wfst``, and an epsilon arc leads from the last state of
    the returned machine back to the start state. This function itself does
    not mark any state as final.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST
    """
    recogniser = fst.Fst('log')

    # Single shared start state; every word branch begins and ends here.
    root = recogniser.add_state()
    recogniser.set_start(root)

    for _, entry in word_table:
        if entry == '<eps>':
            continue

        word_start = recogniser.add_state()
        # NOTE(review): -log(N) is the cost of a "probability" N, not 1/N.
        # If a uniform prior over words was intended this would be
        # -math.log(1 / N) -- confirm before relying on these weights.
        enter_weight = fst.Weight('log', -math.log(word_table.num_symbols()))
        recogniser.add_arc(root, fst.Arc(0, 0, enter_weight, word_start))

        built = generate_word_wfst(recogniser, word_start, entry, n)

        # The highest-numbered state of the returned machine is taken as the
        # end of the word just added.
        loop_weight = fst.Weight('log', -math.log(1.0))
        word_end = list(built.states())[-1]
        recogniser.add_arc(word_end, fst.Arc(0, 0, loop_weight, root))

    return recogniser
Esempio n. 4
0
def generate_phone_wfst(f, start_state, phone, n):
    """
    Append an n-state left-to-right phone HMM to an existing WFST.

    Each HMM state gets a self-loop (probability 0.1, epsilon output) and a
    forward arc (probability 0.9) to a newly created state. Every forward
    arc outputs the phone's label from the module-level ``phone_table``;
    input labels are looked up in the module-level ``state_table`` under
    the name ``<phone>_<i>``.

    Args:
        f (fst.Fst()): an FST object, assumed to exist already
        start_state (int): the index of the first state, assumed to exist already
        phone (str): the phone label
        n (int): number of states of the HMM excluding start and end

    Returns:
        the final state of the FST
    """
    phone_label = phone_table.find(phone)
    state = start_state

    for position in range(1, n + 1):
        hmm_label = state_table.find('{}_{}'.format(phone, position))

        # Stay in the current HMM state.
        stay = fst.Weight('log', -math.log(0.1))
        f.add_arc(state, fst.Arc(hmm_label, 0, stay, state))

        # Move on to a fresh state, emitting the phone label.
        successor = f.add_state()
        advance = fst.Weight('log', -math.log(0.9))
        f.add_arc(state, fst.Arc(hmm_label, phone_label, advance, successor))

        state = successor

    return state
def build_refiner(isyms_fname, refiner_fname):
    """Build the two-state "refiner" FST and write it to disk.

    The machine has a start state with, for every non-zero label below 100,
    a self-loop that reads the label and outputs <epsilon>, and for every
    non-zero label an identity arc into the single final state. Composed
    with another machine it helps extract the last arc of a path.

    Args:
        isyms_fname: path of the symbol table in text format; used for both
            input and output symbols.
        refiner_fname: path the refiner is written to.
    """
    symbols = fst.SymbolTable.read_text(isyms_fname)
    # symbol string -> integer label
    code = {symbol: label for label, symbol in symbols}

    refiner = fst.Fst()
    refiner.set_input_symbols(symbols)
    refiner.set_output_symbols(symbols)
    loop_state = refiner.add_state()
    exit_state = refiner.add_state()

    def unit():
        return fst.Weight(refiner.weight_type(), 1.0)

    for label in code.values():
        if label == 0:
            # Label 0 gets no arc at all.
            continue
        if label < 100:
            # Hard-coded cut-off: only labels below 100 may be skipped over.
            refiner.add_arc(
                loop_state,
                fst.Arc(label, code["<epsilon>"], unit(), loop_state))
        refiner.add_arc(loop_state, fst.Arc(label, label, unit(), exit_state))

    refiner.set_start(loop_state)
    refiner.set_final(exit_state)

    refiner.write(refiner_fname)
Esempio n. 6
0
def generate_WFST_final_probability(n,
                                    lex,
                                    weight_fwd,
                                    weight_self,
                                    weights_final,
                                    original=False):
    """ generate a HMM to recognise any single word sequence for words in the lexicon

    Every pronunciation of every word becomes a branch leaving the start
    state. The branch's last state is made final with the word's final
    probability and is linked back to the start state by an epsilon arc.

    Args:
        n (int): states per phone HMM
        lex: lexicon source handed to ``parse_lexicon``
        weight_fwd: probability of the forward transition, passed on to
            ``generate_phone_wfst``
        weight_self: probability of the self-loop, passed on to
            ``generate_phone_wfst``
        weights_final (dict): word -> probability of its final state
        original (bool): True/False - original/optimized lexicon
    Returns:
        tuple ``(f, word_table)``: the constructed WFST and the word symbol
        table

    Raises:
        KeyError: if a word of the lexicon is missing from ``weights_final``.
    """

    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))

    lex = parse_lexicon(lex, original)

    # NOTE(review): the symbol tables are generated with a fixed 3 while the
    # HMMs below use n states -- confirm whether this should be n.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for word, phone_list in lex.items():
        final_weight = fst.Weight('log', -math.log(weights_final[word]))

        for phones in phone_list:
            # Epsilon-input arc that outputs the word label.
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state

            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)

            # Apply the word's final weight to every pronunciation. Doing
            # this after the loop would only reach the last pronunciation
            # and would touch a stale state for a word without any.
            f.set_final(current_state, final_weight)
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname):
    """
    Make a lattice that maps
    lemmas and constraints (or priors) to
    an inflected version

    For every line of ``dev_fname`` whose constraint field equals
    ``constraints``, a linear FST is built (priors, <sigma> placeholder,
    the lemma from its first differing character on, then one arc for the
    "<lemma residual>_<inflection residual>" symbol mapped to '</s>').
    It is unioned into the lattice, which is then epsilon-removed and
    determinized. The finished lattice is inverted and written out.

    Args:
        dev_fname: path of a whitespace-separated text file; the last three
            fields of each line are read as constraints, lemma, inflection.
        isyms_fname: path of the symbol table in text format.
        constraints: constraint string to select; "_" is rewritten to ";"
            before comparing.
        lattice_output: path the lattice is written to.
        refiner_fname: not used by this function.
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms; code maps symbol string -> integer label
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst()
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # The context manager closes the file even if a line fails to parse.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            cns, lemma, inflection = line.split()[-3:]
            if cns != constraints:
                continue
            print(cns, lemma, inflection)

            # find idx that the strings diverge (stays 0 when they never do
            # within their common length)
            idx = 0
            for i, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = i
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            # NOTE(review): `keep` is a state id of the per-line FST `f`,
            # but it is later used on the determinized `f_big` -- confirm
            # that the ids still correspond. It is also unbound if no line
            # matches `constraints`.
            keep = old
            for ch in lemma[idx:]:
                new = f.add_state()
                f.add_arc(old, fst.Arc(code[ch], code[ch], fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual (indirectly)
            sym = lemma[idx:] + "_" + inflection[idx:]
            print(lemma, inflection, sym)
            f.add_arc(old, fst.Arc(code[sym], code[s_fin], fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> self-loops at the <sigma> placeholder state
    for c, ltr in code.items():
        if 1 < int(ltr) < 36:  # (hard coded) symbols of Russian + 2 more
            f_big.add_arc(keep, fst.Arc(ltr, ltr, fst.Weight(f_big.weight_type(), 1.0), keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)
Esempio n. 8
0
def creation_automata():
    """Build a small hard-coded automaton and print it as text and GraphViz.

    Works on the module-level ``automate`` FST and ``automate_states``
    mapping (state label -> state), registering states through
    ``add_automate_state``.

    Returns:
        the module-level ``automate`` after states, arcs, start and final
        state have been added.
    """
    # source state label -> {"ilabel:olabel:weight" -> [destination labels]}
    transitions = {"s0": {"1:1:0": ["s0", "s1"], "2:3:1": ["s0", "s2"]}}

    # First pass: register every state, sources and destinations alike.
    # dict.items() is used instead of iteritems(), which Python 3 removed.
    for src_state_label, arcs in transitions.items():
        add_automate_state(src_state_label)
        for set_dsts_states in arcs.values():
            # Destinations are a plain list, not a dict.
            for dst_state_label in set_dsts_states:
                add_automate_state(dst_state_label)

    # Second pass: add one arc per (source, arc label, destination).
    for state_label, arcs in transitions.items():
        for arc_label, set_dsts_states in arcs.items():
            chars = arc_label.split(':')
            for dst_state_label in set_dsts_states:
                automate.add_arc(
                    automate_states[state_label],
                    fst.Arc(int(chars[0]), int(chars[1]),
                            fst.Weight(automate.weight_type(), int(chars[2])),
                            automate_states[dst_state_label]))

    automate.set_start(automate_states['s0'])
    automate.set_final(automate_states['s2'],
                       fst.Weight(automate.weight_type(), 1.5))

    print(automate)

    # Emit the automaton in GraphViz (dot) format.

    # Nodes with their labels; the node id is the text after the "s".
    print("digraph G {")
    for state_label in automate_states:
        index = state_label.split("s")[1]
        display_node = index + " [label = \"" + state_label + "\"]"
        print(display_node)

    # Edges with their labels.
    for src_state_label, arcs in transitions.items():
        src_index = src_state_label.split("s")[1]
        for arc_label, set_dsts_states in arcs.items():
            for dst_state_label in set_dsts_states:
                dst_index = dst_state_label.split("s")[1]
                display_edge = src_index + "->" + dst_index + " [label = \"" + arc_label + "\"]"
                print(display_edge)

    print("}")

    return (automate)
Esempio n. 9
0
 def test_simple(self):
     """Hand-build a three-state FST and check its size and final weight."""
     machine = fst.Fst()
     origin = machine.add_state()
     middle = machine.add_state()
     accept = machine.add_state()

     # Two arcs with the same labels leave the origin: one weighted 3.0,
     # one carrying the semiring's One.
     heavy = fst.Weight(machine.weight_type(), 3.0)
     neutral = fst.Weight.One(machine.weight_type())
     machine.add_arc(origin, fst.Arc(1, 1, heavy, middle))
     machine.add_arc(origin, fst.Arc(1, 1, neutral, accept))

     machine.set_start(origin)
     machine.set_final(accept, fst.Weight(machine.weight_type(), 1.5))

     # Test fst
     self.assertEqual(machine.num_states(), 3)
     self.assertAlmostEqual(float(machine.final(accept)), 1.5)
Esempio n. 10
0
def OpenFST_Automata_Example():
    """Build a fixed three-state example transducer and print it.

    Prints the three state ids followed by the FST itself; returns nothing.
    """
    machine = fst.Fst()
    q0, q1, q2 = machine.add_state(), machine.add_state(), machine.add_state()
    weight_type = machine.weight_type()

    # (source, input label, output label, weight, destination)
    arc_table = (
        (q0, 1, 2, fst.Weight(weight_type, 3.0), q1),
        (q0, 1, 3, fst.Weight.One(weight_type), q2),
        (q1, 2, 1, fst.Weight(weight_type, 1.0), q2),
    )
    for src, ilabel, olabel, weight, dst in arc_table:
        machine.add_arc(src, fst.Arc(ilabel, olabel, weight, dst))

    machine.set_start(q0)
    machine.set_final(q2, fst.Weight(weight_type, 1.5))

    print(q0, q1, q2)
    print(machine)
Esempio n. 11
0
def generate_phone_wfst(f, start_state, phone, n, state_table, phone_table,
                        weight_fwd, weight_self):
    """
    Generate a WFST representing an n-state left-to-right phone HMM.

    Each HMM state gets a self-loop with epsilon output and a forward arc
    to a newly created state. Only the last forward arc (i == n) outputs
    the phone label; the others output epsilon.

    Args:
        f (fst.Fst()): an FST object, assumed to exist already
        start_state (int): the index of the first state, assumed to exist already
        phone (str): the phone label
        n (int): number of states of the HMM
        state_table: symbol table in which '<phone>_<i>' input labels are
            looked up
        phone_table: symbol table in which the phone's output label is
            looked up
        weight_fwd: probability of the forward transition, or None to pass
            no weight to the arc
        weight_self: probability of the self-loop, or None to pass no
            weight to the arc

    Returns:
        the final state of the FST
    """

    # Both weights depend only on the arguments, so build them once.
    sl_weight = None if weight_self is None else fst.Weight(
        'log', -math.log(weight_self))  # weight for self-loop
    next_weight = None if weight_fwd is None else fst.Weight(
        'log', -math.log(weight_fwd))  # weight for forward

    current_state = start_state

    for i in range(1, n + 1):

        in_label = state_table.find('{}_{}'.format(phone, i))

        # self-loop back to current state
        f.add_arc(current_state, fst.Arc(in_label, 0, sl_weight,
                                         current_state))

        # transition to next state

        # we want to output the phone label on the final state
        # note: if outputting words instead this code should be modified
        if i == n:
            out_label = phone_table.find(phone)
        else:
            out_label = 0  # output empty <eps> label

        next_state = f.add_state()
        f.add_arc(current_state,
                  fst.Arc(in_label, out_label, next_weight, next_state))

        current_state = next_state
    return current_state
Esempio n. 12
0
def Automata_Building(ref_string, levenshtein_distance, output_weight):
    """Turn the dictionary form of a Levenshtein automaton into an OpenFst one.

    Works on the module-level ``automate`` FST and ``automate_states``
    mapping; states are registered through ``add_automate_state``. The
    automaton is drawn to "automata.dot" and printed.

    Args:
        ref_string: reference string the automaton is built for.
        levenshtein_distance: maximum edit distance.
        output_weight: weight given to the final state.

    Returns:
        the module-level ``automate``.
    """
    dict_automata = Levenshtein_Automata_Dico(ref_string, levenshtein_distance)

    label_initial_state = "0;0"
    label_final_state = str(len(ref_string)) + ";" + str(levenshtein_distance)

    # Register every state, sources and destinations alike.
    # add_automate_state takes the running index and returns the next one.
    # dict.items()/values() replace iteritems(), which Python 3 removed.
    state_index = 1
    for src_label, set_arcs in dict_automata.items():
        state_index = add_automate_state(src_label, state_index)
        for dst_states in set_arcs.values():
            for dst_label in dst_states:
                state_index = add_automate_state(dst_label, state_index)

    # Create the arcs. Arc labels have the form "<emitted>::<consumed>::<weight>".
    for src_label, set_arcs in dict_automata.items():
        for arc_label, dst_states in set_arcs.items():
            label_info = arc_label.split("::")
            transmitted_char = int(convertSymToLabel(label_info[0]))
            consummed_char = int(convertSymToLabel(label_info[1]))
            weight = int(label_info[2])
            # Entry [1] of an automate_states value is used as the state id.
            src_state_index = automate_states[src_label][1]
            print(transmitted_char, consummed_char, weight)
            for dst_label in dst_states:
                dst_state_index = automate_states[dst_label][1]
                automate.add_arc(
                    src_state_index,
                    fst.Arc(transmitted_char, consummed_char,
                            fst.Weight(automate.weight_type(), weight),
                            dst_state_index))

    automate.set_start(automate_states[label_initial_state][1])
    automate.set_final(automate_states[label_final_state][1],
                       fst.Weight(automate.weight_type(), output_weight))
    automate.draw("automata.dot")
    print(automate)

    return (automate)
Esempio n. 13
0
def Automata_Building(ref_string, levenshtein_distance, output_weight):
    """Turn the dictionary form of a Levenshtein automaton into an OpenFst one.

    Works on the module-level ``automate`` FST and ``automate_states``
    mapping (state label -> state); states are registered through
    ``add_automate_state``. The automaton is drawn to "automata.dot" and
    printed.

    Args:
        ref_string: reference string the automaton is built for.
        levenshtein_distance: maximum edit distance.
        output_weight: weight given to the final state.

    Returns:
        the module-level ``automate``.
    """
    levenshtein_automata = Levenshtein_Automata_Dico(ref_string,
                                                     levenshtein_distance)

    label_inital_state = "0;0"
    label_final_state = str(len(ref_string)) + ";" + str(levenshtein_distance)

    # Register every state, sources and destinations alike.
    # dict.items()/values() replace iteritems(), which Python 3 removed.
    for src_label, set_arcs in levenshtein_automata.items():
        add_automate_state(src_label)
        for set_dsts in set_arcs.values():
            for dst_label in set_dsts:
                add_automate_state(dst_label)

    print(automate)

    # Create the arcs. Arc labels have the form "<emitted>:<consumed>:<weight>".
    for src_label, set_arcs in levenshtein_automata.items():
        for arc_label, set_dsts in set_arcs.items():
            fields = arc_label.split(":")
            transmitted_char = fields[0]
            consummed_char = fields[1]
            weight = fields[2]
            print(transmitted_char, consummed_char, weight)
            for dst_label in set_dsts:
                automate.add_arc(
                    automate_states[src_label],
                    fst.Arc(int(convertSymToLabel(transmitted_char)),
                            int(convertSymToLabel(consummed_char)),
                            fst.Weight(automate.weight_type(), int(weight)),
                            automate_states[dst_label]))

    automate.set_start(automate_states[label_inital_state])
    automate.set_final(automate_states[label_final_state],
                       fst.Weight(automate.weight_type(), output_weight))
    automate.draw("automata.dot")
    print(automate)

    return (automate)
Esempio n. 14
0
def make_input_fst(query, pysym):
    """Build a linear FST spelling out ``query`` and save it as 'input.fst'.

    Args:
        query: iterable of symbols; each one becomes an identity arc.
        pysym: mapping from symbol to integer label; must contain '<eps>'.

    Returns:
        The FST. Its final state is created right after the start state and
        is reached from the end of the chain through an '<eps>' arc.
    """
    chain = fst.Fst()
    first = chain.add_state()
    last = chain.add_state()
    chain.set_start(first)

    def free():
        # Weight 0.0 in the FST's own weight type.
        return fst.Weight(chain.weight_type(), 0.0)

    chain.set_final(last, free())

    cursor = first
    for symbol in query:
        following = chain.add_state()
        label = pysym[symbol]
        chain.add_arc(cursor, fst.Arc(label, label, free(), following))
        cursor = following

    # Close the chain: an epsilon arc into the pre-allocated final state.
    eps = pysym['<eps>']
    chain.add_arc(cursor, fst.Arc(eps, eps, free(), last))

    chain.write('input.fst')
    return chain
Esempio n. 15
0
def generate_word_sequence_recognition_wfst_test(n,
                                                 lex,
                                                 original=False,
                                                 weight_fwd=None,
                                                 weight_self=None):
    """ generate a HMM to recognise any single word for words in the lexicon

    Every pronunciation of every word becomes a branch leaving the start
    state and ending in a final state. No arc leads back to the start
    state.

    Args:
        n (int): states per phone HMM
        lex: lexicon source handed to ``parse_lexicon``
        original (bool): True/False - original/optimized lexicon
        weight_fwd: probability of the forward transition, or None
        weight_self: probability of the self-loop, or None
    Returns:
        tuple ``(f, word_table)``: the constructed WFST and the word symbol
        table
    """
    # A 'log' FST is only used when both probabilities are supplied.
    # NOTE(review): if exactly one of them is given, generate_phone_wfst is
    # still handed that value while f has the default weight type --
    # confirm this combination is never used.
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None

    lex = parse_lexicon(lex, original)

    # NOTE(review): the symbol tables are generated with a fixed 3 while the
    # HMMs below use n states -- confirm whether this should be n.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # -- make fst
    for word, phone_list in lex.items():
        for phones in phone_list:
            # Epsilon-input arc that outputs the word label.
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
Esempio n. 16
0
    def make_query(self, cns, lemma):
        """Build a linear FST for a constraints-plus-lemma query.

        The path spells the ";"-separated constraint symbols, then
        "<sigma>", then the lemma's characters, then "</s>". Labels are
        looked up in ``self.code`` and ``self.input_syms`` serves as both
        symbol tables.

        Returns:
            The FST; its last state is final.
        """
        symbols = cns.split(";") + ["<sigma>"] + list(lemma) + ["</s>"]

        query = fst.Fst()
        query.set_input_symbols(self.input_syms)
        query.set_output_symbols(self.input_syms)
        tail = query.add_state()
        query.set_start(tail)

        # One fresh state and one identity arc (weight 1.0) per symbol.
        for symbol in symbols:
            head = query.add_state()
            label = self.code[symbol]
            query.add_arc(
                tail,
                fst.Arc(label, label, fst.Weight(query.weight_type(), 1.0), head))
            tail = head

        query.set_final(tail)
        return query
Esempio n. 17
0
def SimpleAutomata():
    """Add two states and one hard-coded arc to ``automate`` and print it.

    Works on the module-level ``automate`` FST. The arc corresponds to the
    label "2:4:1": input label 4, output label 2, weight 1.
    """
    src_state_index = automate.add_state()
    dst_state_index = automate.add_state()

    consummed_char = 2
    transmitted_char = 4
    weight = 1

    automate.add_arc(
        src_state_index,
        fst.Arc(transmitted_char, consummed_char,
                fst.Weight(automate.weight_type(), weight), dst_state_index))

    print(automate)
Esempio n. 18
0
def add_arc_to_automate(src_state_label, dst_state_label, arc_label, automate,
                        states_dict):
    """Add one arc, described by labels, to ``automate``.

    Args:
        src_state_label: label of the source state.
        dst_state_label: label of the destination state.
        arc_label: "<a>:<b>:<weight>"; ``<a>`` becomes the arc's output
            label and ``<b>`` its input label, both converted with
            ``convertSymToLabel``; ``<weight>`` is parsed as an int.
        automate: the FST to extend.
        states_dict: passed to ``get_index`` together with ``automate`` to
            resolve a state label to a state index.
    """
    origin = get_index(src_state_label, automate, states_dict)
    target = get_index(dst_state_label, automate, states_dict)

    fields = arc_label.split(":")
    out_label = convertSymToLabel(fields[0])
    in_label = convertSymToLabel(fields[1])
    cost = int(fields[2])

    arc = fst.Arc(in_label, out_label,
                  fst.Weight(automate.weight_type(), cost), target)
    automate.add_arc(origin, arc)
Esempio n. 19
0
def make_fst(word_sym, phone_sym, pydict_file):
    """Build a lexicon FST from a "<key> <word>" dictionary file.

    Every dictionary line becomes a path from the start state to the end
    state: one arc per character of the key, the first of which outputs
    the word, followed by an epsilon arc into the end state.

    Args:
        word_sym: mapping from word to output label; must contain '<eps>',
            '<s>' and '</s>'.
        phone_sym: mapping from key character to input label; must contain
            '<eps>'.
        pydict_file: path of the dictionary; each non-blank line holds a
            key and a word separated by whitespace.

    Returns:
        The constructed FST.

    Raises:
        IndexError: if a non-blank line has no second field.
    """
    with open(pydict_file, 'r') as rp:
        f = fst.Fst()
        start = f.add_state()
        end = f.add_state()
        f.set_start(start)
        f.add_arc(start,
                  fst.Arc(phone_sym['<eps>'], word_sym['<s>'],
                          fst.Weight(f.weight_type(), 0.0), start))  # self-loop
        f.add_arc(end,
                  fst.Arc(phone_sym['<eps>'], word_sym['</s>'],
                          fst.Weight(f.weight_type(), 0.0), end))  # self-loop
        f.add_arc(end,
                  fst.Arc(phone_sym['<eps>'], word_sym['<eps>'],
                          fst.Weight(f.weight_type(), 0.0), start))  # 1 --> 0
        f.set_final(end, fst.Weight(f.weight_type(), 0.0))

        for line in rp:
            items = line.split()
            if not items:
                # A blank line must not add an empty start -> end path.
                continue
            key, word = items[0], items[1]

            prev_state = start
            for i, pych in enumerate(key):
                n = f.add_state()
                # Only the first arc of the path outputs the word.
                olabel = word_sym[word] if i == 0 else word_sym['<eps>']
                f.add_arc(
                    prev_state,
                    fst.Arc(phone_sym[pych], olabel,
                            fst.Weight(f.weight_type(), 0.0), n))
                prev_state = n

            # connect the last state with end node; always epsilon output,
            # otherwise a one-character key would emit its word twice
            f.add_arc(
                prev_state,
                fst.Arc(phone_sym['<eps>'], word_sym['<eps>'],
                        fst.Weight(f.weight_type(), 0.0), end))
        return f
Esempio n. 20
0
def SimpleAutomata(ref_string, levenshtein_distance):
    """Build a Levenshtein-style automaton for ``ref_string``.

    States are labelled "<consumed chars>;<edit operations>". Arcs are
    added through ``add_arc_to_automate`` on the module-level ``automate``
    FST and ``states_dict`` mapping. Every state that has consumed the
    whole reference string is made final with weight 1.5. The automaton is
    drawn to "automata.dot" and printed.

    Args:
        ref_string: reference string.
        levenshtein_distance: maximum number of edit operations.

    Returns:
        tuple ``(automate, states_dict)``.
    """
    ref_len = len(ref_string)

    def state_label(consumed, edits):
        return str(consumed) + ";" + str(edits)

    def connect(src, dst, arc):
        add_arc_to_automate(src, dst, arc, automate, states_dict)

    for consumed in range(ref_len + 1):
        at_end = consumed == ref_len
        for edits in range(levenshtein_distance + 1):
            budget_spent = edits == levenshtein_distance
            here = state_label(consumed, edits)
            print(str(at_end) + "-" + str(budget_spent))

            if at_end:
                if budget_spent:
                    print("output state")
                else:
                    # Reference exhausted: only an insertion is possible.
                    connect(here, state_label(consumed, edits + 1),
                            "*:epsilon:1")
                continue

            char = ref_string[consumed]

            # Matching the next reference character costs nothing.
            matched = state_label(consumed + 1, edits)
            if budget_spent:
                print(matched)
            connect(here, matched, char + ":" + char + ":" + str(0))
            if budget_spent:
                # No edit operations left: the match is the only way out.
                continue

            # Deletion and substitution both advance in the reference and
            # use up one operation.
            connect(here, state_label(consumed + 1, edits + 1),
                    "epsilon:" + char + ":" + str(1))
            connect(here, state_label(consumed + 1, edits + 1),
                    "*:" + char + ":" + str(1))

            # NOTE(review): this insertion arc carries the same label as the
            # substitution above, whereas the end-of-string insertion uses
            # "*:epsilon:1" -- confirm which one is intended.
            connect(here, state_label(consumed, edits + 1),
                    "*:" + char + ":" + str(1))

    for edits in range(levenshtein_distance + 1):
        automate.set_final(states_dict[state_label(ref_len, edits)],
                           fst.Weight(automate.weight_type(), 1.5))

    automate.draw("automata.dot")
    print(automate)
    return automate, states_dict
Esempio n. 21
0
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build the arc table of a Levenshtein automaton for ``ref_string``.

    States are obtained from ``create_states_dico`` and are labelled
    ``"<consumed chars>;<edit operations>"``. For every state the outgoing
    arcs are recorded in a dictionary and, in parallel, added to a local
    ``fst.Fst``. Arc labels have the form ``"<consumed>:<emitted>:<weight>"``
    where the weight is 0 for a match and 1 for an insertion, deletion or
    substitution; ``"*"`` stands for "any character" and ``"epsilon"`` for
    the empty symbol.

    Args:
        ref_string: reference string; it is indexed by position and measured
            with ``len``.
        levenshtein_distance: maximum number of edit operations (last row of
            the state grid).

    Returns:
        dict mapping each state label to ``{arc_label: [destination labels]}``.
        The ``fst.Fst`` populated along the way is not returned.
    """
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)

    automate = fst.Fst()
    automata = {}
    match_cost, edit_cost = 0, 1

    # NOTE(review): only these two states are created explicitly, while the
    # arcs below use indices from create_states_dico / label2int - confirm
    # those indices exist in `automate`.
    initial_state_index = automate.add_state()  # label2int("0;0", ref_string)
    final_state_index = automate.add_state()  # label2int("5;2", ref_string)
    automate.set_start(initial_state_index)
    automate.set_final(final_state_index,
                       fst.Weight(automate.weight_type(), 1.5))

    def parse_arc(arc_label):
        # Convert the "consumed:emitted:weight" fields, in that order.
        # NOTE(review): the weight field also goes through convertSymToLabel;
        # confirm it yields a numeric weight.
        fields = arc_label.split(":")
        return (convertSymToLabel(fields[0]), convertSymToLabel(fields[1]),
                convertSymToLabel(fields[2]))

    def add_arc(src_index, parsed_arc, dst_index):
        # Add one arc to the FST from already converted label fields.
        consumed, emitted, weight = parsed_arc
        automate.add_arc(
            src_index,
            fst.Arc(consumed, emitted,
                    fst.Weight(automate.weight_type(), weight), dst_index))

    # .items() works on both Python 2 and Python 3 dictionaries.
    for state_label, state_index in dict_levenshtein_states.items():
        label_fields = state_label.split(";")
        nb_consummed_chars = int(label_fields[0])
        nb_elementary_operations = int(label_fields[1])

        # Past the end of the reference there is nothing left to consume.
        if nb_consummed_chars == len(ref_string):
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]

        # "up": one more operation, same position (insertion).
        up_dst_label = "{};{}".format(nb_consummed_chars,
                                      nb_elementary_operations + 1)
        up_dst_index = label2int(up_dst_label, ref_string)
        insertion_arc_label = ":".join(("*", "epsilon", str(1)))
        insertion_arc = parse_arc(insertion_arc_label)

        # "diag": one more character and one more operation
        # (deletion or substitution).
        diag_dst_label = "{};{}".format(nb_consummed_chars + 1,
                                        nb_elementary_operations + 1)
        diag_dst_index = label2int(diag_dst_label, ref_string)
        deletion_arc_label = ":".join(
            ("epsilon", char_from_ref_str, str(edit_cost)))
        deletion_arc = parse_arc(deletion_arc_label)
        substitution_arc_label = ":".join(
            ("*", char_from_ref_str, str(edit_cost)))
        substitution_arc = parse_arc(substitution_arc_label)

        # "right": one more character, no extra operation (match).
        right_dst_label = "{};{}".format(nb_consummed_chars + 1,
                                         nb_elementary_operations)
        right_dst_index = label2int(right_dst_label, ref_string)
        accepting_arc_label = ":".join(
            (char_from_ref_str, char_from_ref_str, str(match_cost)))
        accepting_arc = parse_arc(accepting_arc_label)

        is_last_column = nb_consummed_chars == len(ref_string)
        is_last_row = nb_elementary_operations == levenshtein_distance

        set_arcs = {}
        if is_last_column and is_last_row:
            # Bottom-right corner: only an empty output arc is recorded.
            set_arcs[":".join(("epsilon", "epsilon", str(0)))] = []
        elif is_last_column:
            # Whole reference consumed: only insertions remain possible.
            set_arcs[insertion_arc_label] = [up_dst_label]
            add_arc(state_index, insertion_arc, up_dst_index)
        elif is_last_row:
            # Operation budget exhausted: only exact matches remain possible.
            set_arcs[accepting_arc_label] = [right_dst_label]
            add_arc(state_index, accepting_arc, right_dst_index)
        else:
            set_arcs[accepting_arc_label] = [right_dst_label]
            add_arc(state_index, accepting_arc, right_dst_index)

            set_arcs[deletion_arc_label] = [diag_dst_label]
            add_arc(state_index, deletion_arc, diag_dst_index)

            add_arc(state_index, substitution_arc, diag_dst_index)
            add_arc(state_index, insertion_arc, up_dst_index)

            # NOTE(review): the insertion destination is filed under the
            # substitution label although the two labels differ here
            # ("*:epsilon:1" vs "*:<char>:1") - confirm this is intended.
            set_arcs[substitution_arc_label] = [diag_dst_label, up_dst_label]

        automata[state_label] = set_arcs

    print(automata)

    return automata
Esempio n. 22
0
def build_lm(dev_fname, isyms_fname, constraints, lattice_output):
    """Build and save a lattice mapping lemmas plus constraints to inflections.

    Every line of ``dev_fname`` whose constraint field equals ``constraints``
    contributes one linear FST (priors, then the part of the lemma starting
    at the first position where lemma and inflection differ, then a single
    ``"<lemma residual>_<inflection residual>"`` symbol). These are unioned,
    epsilon-removed and determinized into one log-semiring FST, which is
    finally inverted and written to ``lattice_output``.

    Args:
        dev_fname: path of a tab-separated file; after dropping the last two
            fields each line must hold lemma, inflection and constraints.
            Lemma and inflection are whitespace-separated symbol sequences.
        isyms_fname: path of the symbol table in text format.
        constraints: constraint string with ``"_"`` as separator; it is
            rewritten to use ``";"`` before being compared with the file.
        lattice_output: path the resulting FST is written to.

    Raises:
        ValueError: if placeholder arcs have to be added but no line of
            ``dev_fname`` matched ``constraints``.
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms; `code` maps a symbol string to its integer id
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {symbol: label for label, symbol in input_syms}

    # init the lattice
    f_big = fst.Fst("log")
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    keep = None
    # All possible inflections are added regardless of the prior; the context
    # manager guarantees the data file is closed again.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            line = line.strip()
            lemma, inflection, cns = line.split("\t")[:-2]
            if cns != constraints:
                continue

            # first position where lemma and inflection differ (0 if none)
            lemma = lemma.split()
            inflection = inflection.split()
            idx = next((j for j, (lm, flc) in enumerate(zip(lemma, inflection))
                        if lm != flc), 0)

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            for symbol in lemma[idx:]:
                new = f.add_state()
                f.add_arc(
                    old,
                    fst.Arc(code[symbol], code[symbol],
                            fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly, through one combined symbol)
            sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:])
            f.add_arc(
                old,
                fst.Arc(code[sym], code[s_fin],
                        fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add self-loops on the <sigma> placeholder state
    # (hard coded id range: symbols of Russian + 2 more)
    sigma_labels = [label for label in code.values() if 1 < int(label) < 51]
    if sigma_labels and keep is None:
        raise ValueError("no line of %r matches the constraints %r" %
                         (dev_fname, constraints))
    # NOTE(review): `keep` is a state id of the last per-line FST; confirm it
    # still names the placeholder state in `f_big` after union/determinize.
    for label in sigma_labels:
        f_big.add_arc(
            keep,
            fst.Arc(label, label, fst.Weight(f_big.weight_type(), 1.0),
                    keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)
Esempio n. 23
0
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build a dictionary describing the arcs of a Levenshtein automaton.

    State labels have the form ``"<consumed chars>;<edit operations>"`` and
    arc labels the form ``"<consumed>::<emitted>::<weight>"``.

    Args:
        ref_string: reference string; indexed by position and measured with
            ``len``.
        levenshtein_distance: maximum number of edit operations.

    Returns:
        dict mapping each state label to ``{arc_label: [destination labels]}``.

    NOTE(review): this function reads ``info`` and ``automate`` without
    assigning them anywhere in its body; unless they exist at module level
    the arc-adding loop raises NameError as soon as ``dst_states`` is
    non-empty.
    """
    # Create the automaton states
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)

    # Create the outgoing arcs of each state.
    # Weights: d = 0 when a character is consumed, 1 when star or epsilon is consumed (insertion, deletion, substitution).
    # Characters of the reference string are the consumed characters; characters of the hypothesis string are the emitted ones.

    automata = {}
    weights = [0, 1, 1, 1]
    arcs_labels = []
    # NOTE(review): dst_states is created once here and never cleared inside
    # the loop, so it keeps growing from one state to the next.
    dst_states = []
    automata_voc = ["epsilon", "*"]
    automata_voc.extend(ref_string)

    # NOTE(review): dict.iteritems() exists only on Python 2 dictionaries.
    for state_label, state_index in dict_levenshtein_states.iteritems():
        nb_consummed_chars = int(
            state_label.split(";")[0])  # first field of the label
        nb_elementary_operations = int(
            state_label.split(";")[1])  # second field of the label

        set_arcs = {}
        arcs_labels = []
        char_from_ref_str = ''
        # Past the end of the reference string there is no character left.
        if nb_consummed_chars == len(ref_string):
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]

        # up: same position, one more operation
        up_dst_label = str(nb_consummed_chars) + ";" + str(
            nb_elementary_operations + 1)
        # print("up", up_dst_label)
        # diag: next position, one more operation
        diag_dst_label = str(nb_consummed_chars +
                             1) + ";" + str(nb_elementary_operations + 1)
        # print("diag", diag_dst_label)
        # right: next position, same number of operations
        right_dst_label = str(nb_consummed_chars +
                              1) + ";" + str(nb_elementary_operations)
        # print("right", right_dst_label)

        is_last_column = nb_consummed_chars == len(ref_string)
        is_last_row = nb_elementary_operations == levenshtein_distance
        if is_last_column and is_last_row:
            # Corner state: a single arc label with no destination.
            output_arc_label = "epsilon" + "::" + "epsilon" + "::" + str(0)
            set_arcs[output_arc_label] = []
        elif is_last_column:
            # Reference fully consumed: only the insertion arc is recorded.
            insertion_arc_label = "*" + "::" + "epsilon" + "::" + str(1)
            arcs_labels.append(insertion_arc_label)

            up_dst_label = str(nb_consummed_chars) + ";" + str(
                nb_elementary_operations + 1)
            dst_states.append(up_dst_label)

            set_arcs[insertion_arc_label] = [up_dst_label]

        elif is_last_row:
            # Operation budget used up: only the matching arc is recorded.
            accepting_arc_label = char_from_ref_str + "::" + char_from_ref_str + "::" + str(
                weights[0])
            arcs_labels.append(accepting_arc_label)

            right_dst_label = str(nb_consummed_chars +
                                  1) + ";" + str(nb_elementary_operations)
            dst_states.append(right_dst_label)

            set_arcs[accepting_arc_label] = [right_dst_label]

        else:
            accepting_arc_label = char_from_ref_str + "::" + char_from_ref_str + "::" + str(
                weights[0])
            deletion_arc_label = "epsilon::" + char_from_ref_str + "::" + str(
                weights[1])
            substitution_arc_label = "*::" + char_from_ref_str + "::" + str(
                weights[1])
            # Insertion shares the substitution label in this version.
            insertion_arc_label = substitution_arc_label
            arcs_labels.append(accepting_arc_label)
            arcs_labels.append(deletion_arc_label)
            arcs_labels.append(substitution_arc_label)
            arcs_labels.append(insertion_arc_label)

            # NOTE(review): destinations are appended as up, diag, diag, right
            # while the labels above are accepting, deletion, substitution,
            # insertion - the two lists do not line up index by index.
            dst_states.append(up_dst_label)
            dst_states.append(diag_dst_label)
            dst_states.append(diag_dst_label)
            dst_states.append(right_dst_label)

            set_arcs[accepting_arc_label] = [right_dst_label]
            set_arcs[deletion_arc_label] = [diag_dst_label]
            set_arcs[substitution_arc_label] = [diag_dst_label, up_dst_label]

        automata[state_label] = set_arcs
        # print(automata[state_label])

        # Add one FST arc per accumulated destination label.
        for idx in range(len(dst_states)):
            dst_state_label = dst_states[idx]
            dst_state_index = dict_levenshtein_states[dst_state_label]

            consummed_char = convertSymToLabel(char_from_ref_str)
            dst_states[idx]
            # NOTE(review): `info` is not defined in this function.
            transmitted_char = info[1]
            weight = info[2]

            # NOTE(review): `automate` is not defined in this function.
            automate.add_arc(
                state_index,
                fst.Arc(transmitted_char, consummed_char,
                        fst.Weight(automate.weight_type(), weight),
                        dst_state_index))

    print(automata)

    # Display Automata in LaTeX :

    return (automata)
Esempio n. 24
0
def generate_word_sequence_recognition_wfst_bigram(n,
                                                   lex,
                                                   df_bigram_prob,
                                                   original=False,
                                                   weight_fwd=None,
                                                   weight_self=None):
    """Generate a word-loop recognition WFST with bigram transitions.

    Each pronunciation of each lexicon word becomes a chain of phone HMMs
    reachable from a single start state. In addition, every word-final state
    is linked to every word-initial state with the negative log of the
    corresponding bigram probability.

    Args:
        n (int): states per phone HMM, forwarded to ``generate_phone_wfst``.
        lex: lexicon source handed to ``parse_lexicon``.
        df_bigram_prob: bigram table read as
            ``df_bigram_prob['Word After', w2]['Word Before', w1]``
            (presumably a pandas DataFrame - confirm against the caller).
        original (bool): forwarded to ``parse_lexicon``
            (original / optimized lexicon).
        weight_fwd: forward-transition weight, forwarded to
            ``generate_phone_wfst``.
        weight_self: self-loop weight, forwarded to ``generate_phone_wfst``.

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table.
    """
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        # NOTE(review): the FST gets the default weight type here while the
        # bigram arcs below always carry 'log' weights - confirm this mix is
        # accepted by the FST library.
        f = fst.Fst()
        none_weight = None
    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # word -> list of first / last states of its pronunciations
    dict_initial = {}
    dict_final = {}

    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            dict_initial.setdefault(word, []).append(initial_state)
            # entering a word emits the word label
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)
            # loop back so that word sequences can be recognised
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
            dict_final.setdefault(word, []).append(current_state)

    # bigram arcs: every word-final state -> every word-initial state
    for word, last_state_list in dict_final.items():
        # The cost depends only on the (word, word_bi) pair, so it is looked
        # up once per pair rather than once per pair of states.
        successors = []
        for word_bi, initial_state_list in dict_initial.items():
            prob = df_bigram_prob['Word After', word_bi]['Word Before', word]
            if prob == 0:
                # unseen bigram: very large cost instead of log(0)
                cost = 1e10
            else:
                cost = -math.log(prob)
            successors.append(
                (output_table.find(word_bi), cost, initial_state_list))

        for last_state in last_state_list:
            for out_label, cost, initial_state_list in successors:
                for initial_state in initial_state_list:
                    f.add_arc(
                        last_state,
                        fst.Arc(0, out_label, fst.Weight('log', cost),
                                initial_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
Esempio n. 25
0
def generate_WFST_silent(n, lex, weight_fwd, weight_self, original=False):
    """Generate a word-loop recognition WFST that also contains a silence model.

    Every pronunciation of every lexicon word becomes a chain of phone HMMs
    between the start state and a final state that loops back to the start.
    A separate ``<silence>`` branch built by ``generate_silent_phone_wfst``
    is attached to the same start state.

    Args:
        n (int): states per phone HMM, forwarded to ``generate_phone_wfst``.
        lex: lexicon source handed to ``parse_lexicon``.
        weight_fwd: forward-transition weight, forwarded to
            ``generate_phone_wfst``.
        weight_self: self-loop weight, forwarded to ``generate_phone_wfst``.
        original (bool): forwarded to ``parse_lexicon``
            (original / optimized lexicon).

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table (including ``<silence>``).
    """
    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))

    original_lex = parse_lexicon(lex, original)

    silent_word = '<silence>'
    silent_phones = ['sil_0', 'sil_1', 'sil_2', 'sil_3', 'sil_4', 'sil_5']

    # The symbol tables are generated from the plain lexicon; the silence
    # symbols are registered by hand afterwards so the output table knows them.
    word_table, phone_table, state_table = generate_symbols_table(
        original_lex, 3)
    word_table.add_symbol(silent_word)
    for phone in silent_phones:
        state_table.add_symbol(phone)
    phone_table.add_symbol('sil')

    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    def close_branch(last_state):
        # Mark the branch end as final and loop back to the start state.
        f.set_final(last_state)
        f.add_arc(last_state, fst.Arc(0, 0, none_weight, start_state))

    # regular words: iterate the plain lexicon, which has no silence entry
    for word, phone_list in original_lex.items():
        word_label = output_table.find(word)
        for phones in phone_list:
            current_state = f.add_state()
            f.add_arc(start_state,
                      fst.Arc(0, word_label, none_weight, current_state))
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            close_branch(current_state)

    # the silence branch is added separately
    current_state = f.add_state()
    f.add_arc(
        start_state,
        fst.Arc(0, output_table.find(silent_word), none_weight, current_state))
    current_state = generate_silent_phone_wfst(f, current_state, state_table,
                                               output_table)
    close_branch(current_state)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
Esempio n. 26
0
def generate_silent_phone_wfst(f, start_state, state_table, phone_table):
    """Append a 5-state ergodic silence HMM to ``f``.

    ``start_state`` is the first emitting state; five further states are
    added to ``f``, the last of which is the exit state. Transitions out of
    one state share the probability mass equally and are stored as negative
    log weights in the ``'log'`` semiring. All arcs emit output label 0.

    Topology (state -> successors), with input symbols ``sil_1`` .. ``sil_5``:
        s0 -> s0, s1
        s1 -> s1, s2, s3
        s2 -> s1, s2, s3
        s3 -> s1, s2, s3, s4
        s4 -> s4, exit

    Args:
        f (fst.Fst()): an FST object, assumed to exist already.
        start_state (int): index of the first state, assumed to exist already.
        state_table: symbol table providing the ``sil_1`` .. ``sil_5`` labels
            through ``find``.
        phone_table: not used by this function.

    Returns:
        the exit state of the silence model.
    """
    n = 5

    # Anchor the model on start_state explicitly instead of assuming it is
    # the most recently added state of f.
    states = [start_state] + [f.add_state() for _ in range(n)]
    exit_state = states[-1]

    # (input symbol, source position, destination positions)
    topology = (
        ('sil_1', 0, (0, 1)),
        ('sil_2', 1, (1, 2, 3)),
        ('sil_3', 2, (1, 2, 3)),
        ('sil_4', 3, (1, 2, 3, 4)),
        ('sil_5', 4, (4, n)),
    )

    for symbol, src, destinations in topology:
        label = state_table.find(symbol)
        # uniform split over the outgoing transitions
        cost = -math.log(1.0 / len(destinations))
        for dst in destinations:
            f.add_arc(states[src],
                      fst.Arc(label, 0, fst.Weight('log', cost), states[dst]))

    return exit_state